def check_results(self, flow, checker):
        """Check tag visibility and tag-based filtering through the client API.

        For every expected tag the global namespace is switched to that tag,
        the run is looked up again through its parent flow object, and tag
        membership plus tag filtering are asserted on the flow, run, step,
        task and data-artifact objects (via ``assert_equals``, which is
        defined outside this block).

        Args:
            flow: Iterable of step definitions exposing ``name``; used to
                build the expected set of step names.
            checker: Object providing ``get_run()`` and ``run_id``.

        Returns:
            None. Returns early, without checking anything, when
            ``checker.get_run()`` yields ``None``.
        """
        import os
        from metaflow import namespace

        run = checker.get_run()
        if run is None:
            # CliChecker does not return a run object, that's ok
            return
        flow_obj = run.parent
        # test crazy unicode and spaces in tags
        # these tags must be set with --tag option in contexts.json
        tags = (
            u"project:basic_tag",
            u"project_branch:user.tester",
            # If METAFLOW_USER is unset this becomes the literal "user:None".
            u"user:%s" % os.environ.get("METAFLOW_USER"),
            u"刺身 means sashimi",
            u"multiple tags should be ok",
        )
        for tag in tags:
            # test different namespaces: one is a system-tag,
            # another is a user tag
            namespace(tag)
            # Re-resolve the run after the namespace switch so the lookup
            # itself happens under the new namespace.
            run = flow_obj[checker.run_id]
            # the flow object should not have tags
            assert_equals(frozenset(), frozenset(flow_obj.tags))
            # the run object should carry every expected tag, not only the
            # one currently used as namespace
            assert_equals([True] * len(tags), [t in run.tags for t in tags])
            # filtering by a non-existent tag should return nothing
            assert_equals([], list(flow_obj.runs("not_a_tag")))
            # a conjunction of a non-existent tag and an existent tag
            # should return nothing
            assert_equals([], list(flow_obj.runs("not_a_tag", tag)))
            # all steps should be returned with tag filtering
            assert_equals(
                frozenset(step.name for step in flow),
                # the last pathspec component of step.id is compared with
                # the step name
                frozenset(step.id.split("/")[-1] for step in run.steps(tag)),
            )
            # a conjunction of all the existent tags should still return
            # every step
            assert_equals(
                frozenset(step.name for step in flow),
                frozenset(step.id.split("/")[-1] for step in run.steps(*tags)),
            )
            # all tasks should be returned with tag filtering
            for step in run:
                # the step object should have the tags
                assert_equals([True] * len(tags), [t in step.tags for t in tags])
                # filtering by a non-existent tag should return nothing
                assert_equals([], list(step.tasks("not_a_tag")))
                # filtering by the tag should not exclude any tasks
                # (compared as ordered lists, so iteration order must match)
                assert_equals(
                    [task.id for task in step], [task.id for task in step.tasks(tag)]
                )
                for task in step.tasks(tag):
                    # the task object should have the tags
                    assert_equals([True] * len(tags), [t in task.tags for t in tags])
                    for data in task:
                        # the data artifact should have the tags
                        assert_equals(
                            [True] * len(tags), [t in data.tags for t in tags]
                        )
示例#2
0
def count_runs(flow_name=None):
    """Return run counts keyed by a formatted creation time.

    The global namespace is selected first. Counting is delegated to
    ``MetaflowWrapper().get_count`` when ``flow_name`` is ``None`` and to
    ``FlowWrapper(flow_name=...).get_count`` otherwise.

    Args:
        flow_name: Optional flow name; ``None`` selects ``MetaflowWrapper``.

    Returns:
        ``{"data": [{"time": key, "count": value}, ...]}`` with one entry per
        item of the mapping returned by ``get_count``.
    """
    namespace(None)
    buckets = get_week_times()

    def key_parser(run):
        # Map a run to the formatted form of its creation time.
        created = datetime.fromtimestamp(run.created_at)
        return get_formatted_time(created)

    def stop_condition(run):
        # True once the run was created more than seven days before "now".
        cutoff = datetime.now() - timedelta(days=7)
        created = datetime.fromtimestamp(run.created_at)
        return created.timestamp() < cutoff.timestamp()

    if flow_name is None:
        counter = MetaflowWrapper()
    else:
        counter = FlowWrapper(flow_name=flow_name)
    counts = counter.get_count(buckets, key_parser, stop_condition)

    return {
        "data": [
            {"time": bucket, "count": total} for bucket, total in counts.items()
        ]
    }
示例#3
0
def get_last_n(flow_name, n=5):
    """Return the result of ``get_most_recent_runs(n)`` for a flow.

    Args:
        flow_name: Name passed to ``FlowWrapper``.
        n: Value forwarded to ``get_most_recent_runs``; defaults to 5.

    Returns:
        ``{"data": <value returned by get_most_recent_runs>}``.
    """
    # Select the global namespace before the wrapper is built.
    namespace(None)
    recent = FlowWrapper(flow_name).get_most_recent_runs(n)
    return {"data": recent}
示例#4
0
def route_flows():
    """Return the formatted flow listing wrapped in a ``{"data": ...}`` dict.

    The global namespace is selected before ``MetaflowWrapper`` is created.
    """
    namespace(None)
    return {"data": MetaflowWrapper().get_formatted_flows()}
示例#5
0
def get_run_artifacts(flow_name=None, run_id=None):
    """Return the output data of one run.

    Args:
        flow_name: Flow name used for the run lookup; required.
        run_id: Run identifier used for the run lookup; required.

    Returns:
        ``{"data": <value returned by get_run_output_data()>}``.

    Raises:
        MetaflowException: With code 400 when either argument is ``None``.
    """
    if any(arg is None for arg in (flow_name, run_id)):
        raise MetaflowException(400, "Invalid parameters")

    # Select the global namespace before the lookup.
    namespace(None)
    wrapped_run = RunWrapper.create_from_lookup(flow_name, run_id)
    output = wrapped_run.get_run_output_data()
    return {"data": output}
示例#6
0
def get_run_data(flow=None, run_id=None):
    """Return the formatted steps of one run.

    Args:
        flow: Flow name; required.
        run_id: Run identifier; required.

    Returns:
        ``{"data": <value returned by get_formatted_steps()>}``.

    Raises:
        MetaflowException: With code 400 when either argument is ``None``.
    """
    if any(arg is None for arg in (flow, run_id)):
        raise MetaflowException(400, "Invalid parameters")

    # Select the global namespace before the run wrapper is built.
    namespace(None)
    steps = RunWrapper(f"{flow}/{run_id}").get_formatted_steps()
    return {"data": steps}
    def wrapper(*args):
        """Call ``func`` on the first positional argument, refreshing it first.

        When that object's ``cache`` flag is truthy, its ``_run`` attribute is
        rebuilt from ``pathspec`` under the global namespace and the flag is
        cleared. Only the first positional argument is forwarded to ``func``.
        """
        target = args[0]

        if not target.cache:
            return func(target)

        namespace(None)
        target._run = Run(target.pathspec)
        target.cache = False
        return func(target)
示例#8
0
def resolve_task_from_pathspec(flow_name, pathspec):
    """
    Resolve the task object addressed by a pathspec given on the CLI.

    Args:
        flow_name (str): name of the flow
        pathspec (str): one of `stepname`, `runid/stepname` or
            `runid/stepname/taskid`

    Returns:
        The resolved task object.

    Raises:
        CommandException: the pathspec has more than three components.
        TaskNotFoundException: no task could be resolved for the pathspec.
    """
    from metaflow import Flow, Step, Task
    from metaflow.exception import MetaflowNotFound

    depth = len(pathspec.split("/"))
    if depth not in (1, 2, 3):
        # Anything else is not a supported pathspec shape.
        raise CommandException(
            msg=
            "The PATHSPEC argument should be of the form 'stepname' Or '<runid>/<stepname>' Or '<runid>/<stepname>/<taskid>'"
        )

    resolved = None
    run_id = None
    if depth == 1:
        # Bare step name: look the step up in the flow's latest run.
        resolving_from = "stepname"
        newest_run = Flow(flow_name).latest_run
        if newest_run is not None:
            run_id = newest_run.pathspec
            try:
                resolved = newest_run[pathspec].task
            except KeyError:
                pass
    else:
        # Explicit run id: switch to the global namespace before the lookup.
        namespace(None)
        qualified = "/".join([flow_name, pathspec])
        if depth == 2:
            # runid/stepname
            resolving_from = "step_pathspec"
            try:
                resolved = Step(qualified).task
            except MetaflowNotFound:
                pass
        else:
            # runid/stepname/taskid
            resolving_from = "task_pathspec"
            try:
                resolved = Task(qualified)
            except MetaflowNotFound:
                pass

    if resolved is None:
        # Nothing matched the query; report how the lookup was attempted.
        raise TaskNotFoundException(pathspec, resolving_from, run_id=run_id)

    return resolved
示例#9
0
def get_most_recent(flow_name):
    """Return the JSON form (with steps) of a flow's last successful run.

    Args:
        flow_name: Name passed to ``FlowWrapper``.

    Returns:
        ``{"data": <json_with_steps() result>}``, or an empty dict when there
        is nothing to report.
    """
    namespace(None)

    flow = FlowWrapper(flow_name)

    last_successful = flow.get_last_successful_run()
    # Guard before dereferencing. The original called json_with_steps() first,
    # so its later None check could never protect this call.
    # NOTE(review): assumes get_last_successful_run() may return None when no
    # successful run exists — confirm against FlowWrapper.
    if last_successful is None:
        return {}

    last_run = last_successful.json_with_steps()

    if last_run is None:
        return {}

    return {"data": last_run}
示例#10
0
def _get_client_run_obj(obj, run_id, user_namespace):
    """Resolve a client ``Run`` for the current flow, with CLI-friendly errors.

    Args:
        obj: CLI state object; ``obj.flow.name`` supplies the flow name.
        run_id: Bare run id (no ``/``), or ``None``.
        user_namespace: Namespace applied before each client lookup.

    Returns:
        The ``Run`` for ``<flow_name>/<run_id>``.

    Raises:
        CommandException: When the flow cannot be found or is outside the
            namespace, when ``run_id`` is ``None`` (the message names the
            latest run id) or contains ``/``, or when the run cannot be found
            or is outside the namespace.
    """
    flow_name = obj.flow.name

    # First make sure the flow itself is reachable. Two failure modes:
    # 1. the flow has not been run yet
    # 2. the --namespace value does not match
    try:
        namespace(user_namespace)
        Flow(pathspec=flow_name)
    except MetaflowNotFound:
        raise CommandException(
            "No run found for *%s*. Please run the flow before tagging." %
            flow_name)
    except MetaflowNamespaceMismatch:
        raise CommandException(
            "No run found for *%s* in namespace *%s*. You can switch the namespace using --namespace"
            % (flow_name, user_namespace))

    # Without a run id we can only point the user at the latest run.
    if run_id is None:
        newest_id = Flow(pathspec=flow_name).latest_run.id
        raise CommandException(
            "Please specify a run-id using --run-id.\n"
            "*%s*'s latest run in namespace *%s* has id *%s*." %
            (flow_name, user_namespace, newest_id))

    # A run id must be a single path component.
    if len(run_id.split("/")) != 1:
        raise CommandException("Run-id *%s* is not a valid run-id" % run_id)
    path_spec = "%s/%s" % (flow_name, run_id)

    # Finally resolve the run. Two failure modes:
    # 1. the --run-id value does not exist
    # 2. the run exists but not in the default/specified namespace
    try:
        namespace(user_namespace)
        return Run(pathspec=path_spec)
    except MetaflowNotFound:
        raise CommandException("No run *%s* found for flow *%s*" %
                               (path_spec, flow_name))
    except MetaflowNamespaceMismatch:
        raise CommandException(
            "Run *%s* for flow *%s* does not belong to namespace *%s*\n" % (
                path_spec,
                flow_name,
                user_namespace,
            ))
示例#11
0
def get_cards(task, type=None, follow_resumed=True):
    """
    Get cards related to a Metaflow `Task`

    Args:
        task (str or `Task`): A metaflow `Task` object or pathspec (flowname/runid/stepname/taskid)
        type (str, optional): The type of card to retrieve. Defaults to None.
        follow_resumed (bool, optional): If a Task has been resumed and cloned, then setting this flag will resolve the card for the origin task. Defaults to True.

    Returns:
        `CardContainer` : A `list` like object that holds `Card` objects.

    Raises:
        IncorrectPathspecException: `task` is a string that does not have
            exactly four `/`-separated components.
        IncorrectArguementException: `task` is neither a `str` nor a `Task`.
    """
    from metaflow.client import Task
    from metaflow import namespace

    if isinstance(task, str):
        task_str = task
        if len(task_str.split("/")) != 4:
            # Exception that pathspec is not of correct form
            raise IncorrectPathspecException(task_str)
        # set namespace as None so that we don't face namespace mismatch error.
        namespace(None)
        task = Task(task_str)
    elif not isinstance(task, Task):
        # Exception that the task argument should of form `Task` or `str`
        raise IncorrectArguementException(_TYPE(task))

    # Bind the name unconditionally: it is read below even when
    # follow_resumed is False, which previously raised UnboundLocalError.
    origin_taskpathspec = None
    if follow_resumed:
        origin_taskpathspec = resumed_info(task)
        if origin_taskpathspec:
            # Resolve cards against the task this one was resumed from.
            task = Task(origin_taskpathspec)

    card_paths, card_ds = resolve_paths_from_task(
        _get_flow_datastore(task),
        pathspec=task.pathspec,
        type=type,
    )
    return CardContainer(
        card_paths,
        card_ds,
        from_resumed=origin_taskpathspec is not None,
        origin_pathspec=origin_taskpathspec,
    )
示例#12
0
def test_init_options(s3root, pathspecs, expected):
    """Exercise the different ways of constructing an ``S3`` client.

    Covers: ``s3root`` used as a key prefix, a full URL used as ``s3root``,
    no root with full URLs, the ``current`` environment with a fake flow,
    and (only when ``DO_TEST_RUN`` is truthy) a real ``Run`` object.

    Args:
        s3root: S3 URL prefix shared by all keys of ``expected``.
        pathspecs: Sequence holding exactly one ``<flow_name>/<run_id>``
            string.
        expected: Mapping of full S3 URL to the expected content, in the
            form ``assert_results`` accepts.
    """
    # Unpacking enforces that exactly one pathspec was supplied.
    [pathspec] = pathspecs
    flow_name, run_id = pathspec.split("/")
    # Length of the root; used to strip the prefix from full URLs below.
    plen = len(s3root)

    # option 1) s3root as prefix
    with S3(s3root=s3root) as s3:
        for url, exp in expected.items():
            # s3root should work as a prefix
            s3obj = s3.get(url[plen:])
            assert s3obj.key == url[plen:]
            assert_results([s3obj], {url: exp})
        # a full URL is rejected when an s3root is set
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://some/fake/address")

    # option 2) full url as s3root
    for url, exp in expected.items():
        with S3(s3root=url) as s3:
            # get() without a key fetches the root URL itself
            s3obj = s3.get()
            assert_results([s3obj], {url: exp})

    # option 3) full urls
    with S3() as s3:
        for url, exp in expected.items():
            # without an s3root, keys are full URLs
            s3obj = s3.get(url)
            assert s3obj.key == url
            assert_results([s3obj], {url: exp})
        # relative keys, bucket-only URLs and root-relative bulk calls must
        # all be rejected when no root is configured
        with pytest.raises(MetaflowS3URLException):
            s3.get("suffix")
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://nopath")
        with pytest.raises(MetaflowS3URLException):
            s3.get_many(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_recursive(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_all()

    # option 4) 'current' environment (fake a running flow)
    flow = FakeFlow(use_cli=False)

    parsed = urlparse(s3root)
    with pytest.raises(MetaflowS3URLException):
        # current not set yet, so this should fail
        with S3(run=flow):
            pass

    # Populate `current` with placeholder values so S3(run=flow) can work.
    current._set_env(
        FakeFlow(name=flow_name),
        run_id,
        "no_step",
        "no_task",
        "no_origin_run_id",
        "no_ns",
        "no_user",
    )

    with S3(bucket=parsed.netloc, prefix=parsed.path, run=flow) as s3:
        for url, exp in expected.items():
            # keys are now just the last URL component
            name = url.split("/")[-1]
            s3obj = s3.get(name)
            assert s3obj.key == name
            assert_results([s3obj], {url: exp})
        names = [url.split("/")[-1] for url in expected]
        s3objs = s3.get_many(names)
        assert {e.key for e in s3objs} == set(names)
        assert_results(s3objs, expected)
        assert_results(s3.get_all(), expected, info_should_be_empty=True)

    # option 5) run object
    if DO_TEST_RUN:
        # Only works if a metadata service exists with the run in question.
        namespace(None)
        with S3(bucket=parsed.netloc, prefix=parsed.path,
                run=Run(pathspec)) as s3:
            names = [url.split("/")[-1] for url in expected]
            assert_results(s3.get_many(names), expected)
import metaflow as mf

# Select the global namespace (None) before creating the client.
mf.namespace(None)

m = mf.Metaflow()

# Sanity checks: the client must be using the service metadata provider and
# must see at least one flow. The offending value is the assertion message.
assert m.metadata == mf.plugins.metadata.service.ServiceMetadataProvider, m.metadata
assert len(m.flows) > 0, m.flows
示例#14
0
def tag_list(
    obj,
    run_id,
    hide_system_tags,
    list_all,
    my_runs,
    group_by_tag,
    group_by_run,
    flat,
    arg_run_id,
):
    """List the tags of one run, or of all runs of the current flow.

    Args:
        obj: CLI state object; provides ``flow.name`` and ``echo``.
        run_id: Run id given via ``--run-id``, or ``None``.
        hide_system_tags: When true, system tags are excluded from the output.
        list_all: List tags across all runs (the global namespace is used).
        my_runs: List tags across runs in the namespace returned by
            ``resolve_identity()``.
        group_by_tag: Group output as tag -> list of run pathspecs.
        group_by_run: Group output as run pathspec -> tags.
        flat: Echo one tag per line instead of the grouped printout.
        arg_run_id: Run id given as a positional argument, or ``None``.

    Raises:
        CommandException: On conflicting options (``--all`` with
            ``--my-runs``, both run-id forms, both ``--group-by-*`` options,
            ``--flat`` with a ``--group-by-*`` option) or when the flow has
            never been run.
        MetaflowInternalError: When ``flat`` is set and the tags did not
            collapse into exactly one group.
    """
    _set_current(obj)
    if run_id is None and arg_run_id is None and not list_all and not my_runs:
        # Assume list_all by default
        list_all = True

    if list_all and my_runs:
        raise CommandException(
            "Option --all cannot be used together with --my-runs.")

    if run_id is not None and arg_run_id is not None:
        raise CommandException(
            "Specify a run either using --run-id or as an argument but not both"
        )

    # From here on a single run_id variable carries whichever form was given.
    if arg_run_id is not None:
        run_id = arg_run_id

    if group_by_run and group_by_tag:
        raise CommandException(
            "Option --group-by-tag cannot be used with --group-by-run")

    if flat and (group_by_run or group_by_tag):
        raise CommandException(
            "Option --flat cannot be used with any --group-by-* option")

    # Both dicts are keyed by the active grouping: run pathspec, tag, or the
    # placeholder "_" when no grouping was requested.
    system_tags_by_some_grouping = dict()
    all_tags_by_some_grouping = dict()

    def _populate_tag_groups_from_run(_run):
        # Fold one run's tags into the two grouping dicts above.
        if group_by_run:
            # key: run pathspec, value: that run's tags
            if hide_system_tags:
                all_tags_by_some_grouping[
                    _run.pathspec] = _run.tags - _run.system_tags
            else:
                system_tags_by_some_grouping[_run.pathspec] = _run.system_tags
                all_tags_by_some_grouping[_run.pathspec] = _run.tags
        elif group_by_tag:
            # key: tag, value: list of run pathspecs carrying that tag.
            # Non-system tags go to the "all" dict; system tags are recorded
            # only in the system dict, and only when not hidden.
            for t in _run.tags - _run.system_tags:
                all_tags_by_some_grouping.setdefault(t,
                                                     []).append(_run.pathspec)
            if not hide_system_tags:
                for t in _run.system_tags:
                    system_tags_by_some_grouping.setdefault(t, []).append(
                        _run.pathspec)
        else:
            # No grouping: accumulate the union of tags under "_".
            if hide_system_tags:
                all_tags_by_some_grouping.setdefault("_", set()).update(
                    _run.tags.difference(_run.system_tags))
            else:
                system_tags_by_some_grouping.setdefault("_", set()).update(
                    _run.system_tags)
                all_tags_by_some_grouping.setdefault("_",
                                                     set()).update(_run.tags)

    pathspecs = []
    if list_all or my_runs:
        # --my-runs restricts to the caller's identity; --all uses the global
        # namespace (None).
        user_namespace = resolve_identity() if my_runs else None
        namespace(user_namespace)
        try:
            flow = Flow(pathspec=obj.flow.name)
        except MetaflowNotFound:
            raise CommandException(
                "Cannot list tags because the flow %s has never been run." %
                (obj.flow.name, ))
        for run in flow.runs():
            _populate_tag_groups_from_run(run)
            pathspecs.append(run.pathspec)
    else:
        # Single run: resolved with the namespace argument set to None.
        run = _get_client_run_obj(obj, run_id, None)
        _populate_tag_groups_from_run(run)
        pathspecs.append(run.pathspec)

    if not group_by_run and not group_by_tag:
        # We list all the runs that match to print them out if needed.
        # The "_" placeholder key is replaced by the comma-joined pathspecs.
        system_tags_by_some_grouping[",".join(
            pathspecs)] = system_tags_by_some_grouping.get("_", set())
        all_tags_by_some_grouping[",".join(
            pathspecs)] = all_tags_by_some_grouping.get("_", set())
        if "_" in system_tags_by_some_grouping:
            del system_tags_by_some_grouping["_"]
        if "_" in all_tags_by_some_grouping:
            del all_tags_by_some_grouping["_"]

    if flat:
        if len(all_tags_by_some_grouping) != 1:
            raise MetaflowInternalError("Failed to flatten tag set")
        for v in all_tags_by_some_grouping.values():
            for tag in v:
                obj.echo(tag)
            # Exactly one group exists (checked above), so return after it.
            return

    _print_tags_for_runs_by_groups(obj, system_tags_by_some_grouping,
                                   all_tags_by_some_grouping, group_by_tag)
 def __init__(self):
     """Initialise the base class and store a ``Metaflow`` client handle."""
     super().__init__()
     # Select the global namespace (None) before the client is created.
     namespace(None)
     self._metaflow = Metaflow()
import hashlib
import json

from .client import CacheAction
from .utils import (cacheable_artifact_value, cacheable_exception_value,
                    progress_event_msg, artifact_cache_id,
                    unpack_pathspec_with_attempt_id, streamed_errors)
from services.ui_backend_service.data import unpack_processed_value
from services.ui_backend_service.api.utils import operators_to_filters

from metaflow import DataArtifact, namespace
namespace(None)  # Always use global namespace by default


class SearchArtifacts(CacheAction):
    """
    Fetches artifacts by pathspecs and performs a search against the object contents.
    Caches artifacts based on pathspec, and search results based on a combination of query&artifacts searched

    Parameters
    ----------
    pathspecs : List[str]
        A list of artifact pathspecs (with attempt id as last component)
            to fetch and match the search term against: ["FlowId/RunNumber/StepName/TaskId/ArtifactName/0"]
    searchterm : str
        A searchterm to match against the fetched S3 artifacts contents.

    Returns
    -------
    Dict or None
        example:
from metaflow import Flow, Metaflow, namespace
import boto3

# NOTE(review): this definition is disabled; any code that reads
# EVENTS_RECORD_STORE needs it defined elsewhere — confirm.
# EVENTS_RECORD_STORE = os.environ['EVENTS_RECORD_STORE']
# Module-level DynamoDB resource handle, created once at import time.
dynamodb = boto3.resource('dynamodb')


def parse_tags(self, tags, key_string):
    """Return the value part of the first tag that contains ``key_string``.

    Args:
        self: Unused.
        tags: Iterable of tag strings, scanned in iteration order.
        key_string: Substring to look for anywhere inside a tag.

    Returns:
        The text after the last ``:`` of the first matching tag (the whole
        tag when it contains no ``:``), or ``None`` when nothing matches.
    """
    matches = (candidate for candidate in tags if key_string in candidate)
    first_match = next(matches, None)
    if first_match is None:
        return None
    # Keep only what follows the final colon.
    return first_match.rsplit(":", 1)[-1]


data = []
namespace(None)
for flow in Metaflow().flows:

    for run in flow.runs():

        for step in run.steps():
            print(step)

        run_id = run.path_components[1]
        flow_name = run.path_components[0]

        # time to complete
        # steps
        # artifact location?

        data.append({
示例#18
0
def resolve_card(
    ctx,
    pathspec,
    follow_resumed=True,
    hash=None,
    type=None,
):
    """Resolve the stored card paths for a task of the current flow.

    Args:
        ctx: click context object; ``ctx.obj`` supplies ``flow.name``,
            ``flow_datastore`` and ``echo``.
        pathspec: task pathspec of the form ``<runid>/<stepname>/<taskid>``
            (without the flow name).
        follow_resumed: when true and ``resumed_info`` reports an origin task,
            the card is resolved for that origin pathspec instead.
        hash (optional): forwarded to the path lookup to narrow it down to a
            specific card.
        type (optional): forwarded to the path lookup as the card type.

    Raises:
        CommandException: the pathspec does not have exactly three components.
        CardNotPresentException: no card could be found for the pathspec.

    Returns:
        (card_paths, card_datastore, taskpathspec): the third element is the
        pathspec that was actually used for the lookup.
    """
    components = pathspec.split("/")
    if len(components) != 3:
        raise CommandException(
            msg="Expecting pathspec of form <runid>/<stepname>/<taskid>"
        )

    flow_name = ctx.obj.flow.name
    run_id, step_name, _task_id = components
    pathspec = "/".join([flow_name, pathspec])

    # Use the global namespace so the lookup cannot hit a namespace mismatch.
    namespace(None)
    task = Task(pathspec)

    # Decide what to announce; a resumed task redirects to its origin.
    announcement = "Resolving card: %s" % pathspec
    origin_taskpathspec = resumed_info(task) if follow_resumed else None
    if origin_taskpathspec:
        pathspec = origin_taskpathspec
        announcement = "Resolving card resumed from: %s" % origin_taskpathspec
    ctx.obj.echo(announcement, fg="green")

    card_paths_found, card_datastore = resolve_paths_from_task(
        ctx.obj.flow_datastore,
        pathspec=pathspec,
        type=type,
        hash=hash,
    )

    if len(card_paths_found) == 0:
        # Nothing is stored for this pathspec / type / hash combination.
        raise CardNotPresentException(
            flow_name,
            run_id,
            step_name,
            card_hash=hash,
            card_type=type,
        )

    return card_paths_found, card_datastore, pathspec
示例#19
0
def lambda_handler(event, context):
    """Store a summary of the Metaflow run behind each S3 event record.

    For every entry of ``event['Records']`` the handler writes a Metaflow
    configuration to ``/tmp/config.json``, loads the run named by the first
    two components of the S3 object key, and puts a summary item into the
    DynamoDB table named by ``EVENTS_RECORD_STORE``.

    Args:
        event: Mapping with a ``'Records'`` list. Each record must provide
            ``record['s3']['object']['key']`` (read as
            ``<flow>/<run_id>/<step>/...``) and
            ``record['s3']['bucket']['name']``.
        context: Unused.

    Returns:
        None.

    Raises:
        IndexError: An object key has fewer than three ``/`` components.
        ValueError: The run id component is not an integer, or a timestamp
            does not match ``%Y-%m-%dT%H:%M:%S``.
    """

    def _to_epoch(stamp):
        # Drop everything from the first "." on (fractional seconds), then
        # parse the remaining text into integer epoch seconds.
        return int(
            datetime.strptime(stamp.split(".")[0],
                              '%Y-%m-%dT%H:%M:%S').timestamp())

    print(event)

    for record in event['Records']:
        key = record['s3']['object']['key']
        bucket_name = record['s3']['bucket']['name']

        os.environ['METAFLOW_HOME'] = '/tmp'
        os.environ['USERNAME'] = "******"

        # SECURITY NOTE(review): the service auth key and URL are hard-coded
        # credentials/endpoints. They are kept unchanged here to preserve
        # behaviour, but should be rotated and loaded from the environment
        # or a secret store.
        obj = {
            'METAFLOW_DEFAULT_METADATA':
            'service',
            'METAFLOW_DEFAULT_DATASTORE':
            's3',
            'METAFLOW_DATASTORE_SYSROOT_S3':
            f"s3://{bucket_name}",
            'METAFLOW_SERVICE_AUTH_KEY':
            "yvhNDfEzcRa5fxKq2ZELda1zk8wNXxMs17Jt4OGs",
            'METAFLOW_SERVICE_URL':
            "https://5sqcgnuyte.execute-api.eu-west-1.amazonaws.com/api/"
        }

        with open('/tmp/config.json', 'w', encoding='utf-8') as f:
            json.dump(obj, f, ensure_ascii=False, indent=4)

        # Imported only after METAFLOW_HOME is set and the config is written.
        from metaflow import Run, get_metadata, namespace

        namespace(None)
        print(get_metadata())

        # Split the key once; indexing keeps IndexError for short keys.
        key_parts = key.split("/")
        flow, run_id, step = key_parts[0], key_parts[1], key_parts[2]

        run = Run(f"{flow}/{run_id}")

        dynamo_object = {
            "created_at":
            _to_epoch(run.created_at),
            "flow_name":
            flow,
            "run_id":
            int(run_id),
            "success":
            run.successful,
            "finished":
            run.finished,
            # 0 stands in for "not finished yet".
            "finished_at":
            0 if run.finished_at is None else _to_epoch(run.finished_at),
            "current_step":
            step,
            "user":
            _parse_tags(run.tags, "user"),
            "tags":
            run.tags,
            "bucket":
            bucket_name
        }

        print(dynamo_object)

        table = dynamodb.Table(EVENTS_RECORD_STORE)

        table.put_item(Item=dynamo_object)

    return