Example #1
def s3_list_objects_at_prefix_v2(bucket, prefix):
    """ List out the objects at this prefix
    Returns a list of the keys found at this bucket.
    We do so because boto objects aren't serializable under multiprocessing

    Note: Use v2 for multi-processing, since this filters on the server side!

    Args:
        bucket(str): The s3 bucket
        prefix(str): The s3 key prefix you wish to search

    Returns:
        (list): List of item keys
    """
    result = []
    client = get_s3_client()

    #print(f"s3_list_objects_at_prefix_v2 the b3[{b3}] and client[{b3.client} and resource[{b3.resource}]")

    try:
        paginator = client.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)
        for page in page_iterator:
            if 'Contents' not in page:
                continue
            result += [obj['Key'] for obj in page['Contents']]
    except Exception as e:
        _logger.error(
            "s3_list_objects_at_prefix_v2: failed with exception {}".format(e))
        raise
    return result
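Since this variant filters on the server side and returns plain string keys, it is the one suited to fanning out across processes. A minimal usage sketch, with a hypothetical bucket name and prefix list (not from the source):

from functools import partial
from multiprocessing import Pool

def list_many_prefixes(bucket, prefixes, nproc=4):
    """ Hedged sketch: list several prefixes in parallel; string keys pickle cleanly across processes. """
    with Pool(nproc) as pool:
        per_prefix = pool.map(partial(s3_list_objects_at_prefix_v2, bucket), prefixes)
    return [key for keys in per_prefix for key in keys]

# keys = list_many_prefixes('my-bucket', ['data/2020/', 'data/2021/'])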
Example #2
    def rm_bundle_dir(output_path, uuid, db_targets):
        """
        We created a directory (managed path) to hold the bundle and any files.   The files have been
        copied in.   Removing the directory removes any created files.  If the user has told us about
        any DBTargets, also call rm() on those.

        TODO: Integrate with data_context bundle remove.   That deals with information already
        stored in the local DB.

        ASSUMES:  That we haven't actually updated the local DB with information on this bundle.

        Args:
            output_path (str):
            uuid (str):
            db_targets (list(DBTarget)):

        Returns:
            None
        """
        try:
            shutil.rmtree(output_path)

            # if people create s3 files, s3 file targets, inside of an s3 context,
            # then we will have to clean those up as well.

            for t in db_targets:
                t.rm()

        except IOError as why:
            _logger.error(
                "Removal of hyperframe directory {} failed with error {}. Continuing removal..."
                .format(uuid, why))
Example #3
def s3_list_objects_at_prefix(bucket, prefix):
    """ List out the objects at this prefix.
    Returns a list of the keys found at this bucket.
    We do so because boto objects aren't serializable under multiprocessing

    Note: Do *not* use with multi-processing. This version uses boto Collections.
    That means all the filtering is done on the client side, which makes this a
    bad choice for multiprocessing as all the work is done for each call.

    Args:
        bucket(str): The s3 bucket
        prefix(str): The s3 key prefix you wish to search

    Returns:
        (list): List of item keys
    """
    s3 = get_s3_resource()
    result = []
    try:
        s3_b = s3.Bucket(bucket)
        for i in s3_b.objects.filter(Prefix=prefix, MaxKeys=1024):
            result.append(i.key)  # return plain keys, per the docstring
    except Exception as e:
        _logger.error(
            "s3_list_objects_at_prefix: failed with exception {}".format(e))
        raise
    return result
Example #4
def _get_context(context_name):
    """Retrieve data context given name.   Raise exception if not found.

    Args:
        context_name(str): <remote context>/<local context> or <local context>

    Returns:
        (`disdat.data_context.DataContext`)
    """

    fs = _get_fs()

    data_context = fs.get_context(context_name)

    if data_context is None:
        # Try once to see if needs to be loaded
        data_context = fs.reload_context(context_name)

    if data_context is None:
        error_msg = "Unable to perform operation: could not find context {}".format(
            context_name)
        _logger.error(error_msg)
        raise RuntimeError(error_msg)

    return data_context
Example #5
def convert_str_params(cls, params_str):
    """
    This is similar to Luigi.Task.from_str_params(cls, params_str),
    but we don't create the class here, and the outer loop is over our params (not the
    class's params).  We just want to convert each param that is present both in the class
    and in this dictionary into its deserialized form.

    NOTE:  This is somewhat dangerous and could break if Luigi changes around
    this code.  The alternative is to use Luigi.load_task() but then we have to ensure
    all the input parameters are "strings" and we have to then put special code
    inside of apply to know when to create a class normally, or create it from the CLI.

    Parameters:
        params_str (dict): dict of str->str.  param name -> value .
    """
    kwargs = {}

    cls_params = {n: p
                  for n, p in cls.get_params()
                  }  # get_params() returns [ (name, param), ... ]

    for param_name, param_str in params_str.items():
        if param_name in cls_params:
            param = cls_params[param_name]
            if isinstance(param_str, list):
                kwargs[param_name] = param._parse_list(param_str)
            else:
                kwargs[param_name] = param.parse(param_str)
        else:
            _logger.error("Parameter {} is not defined in class {}.".format(
                param_name, cls.__name__))
            raise ValueError("Parameter {} is not defined in class {}.".format(
                param_name, cls.__name__))

    return kwargs
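A minimal usage sketch with a hypothetical Luigi task; it assumes convert_str_params can be called as a plain function with the class as its first argument (in the source it may be bound to a class):

import luigi

class TrainModel(luigi.Task):  # hypothetical task, not part of the source
    epochs = luigi.IntParameter(default=1)
    lr = luigi.FloatParameter(default=0.1)

# CLI-style string values become typed keyword arguments.
kwargs = convert_str_params(TrainModel, {'epochs': '10', 'lr': '0.01'})
task = TrainModel(**kwargs)  # epochs == 10 (int), lr == 0.01 (float)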
Example #6
    def __init__(self, local_context, name, owner=''):
        """ Given name and a local context, create a handle that users can
        work with to:
        a.) Create files / directories / dbtables (i.e., Bundle Links)
        b.) Add constants and files into bundle

        Either create the bundle ahead of time and add items to it, or use a temp dir
        and copy things into the bundle when you're done.  With the second approach it is
        easy to use one bundle object and write it to multiple contexts; in that case the
        bundle should be closed or destroyed when finished.

        Args:
            local_context (str): Where this bundle will be output or where it was sourced from.
            name (str): Human name for this bundle
        """

        self.fs = _get_fs()

        try:
            self.data_context = self.fs.get_context(local_context)
        except Exception as e:
            _logger.error("Unable to allocate bundle in context: {} ".format(local_context, e))
            return

        super(Bundle, self).__init__(human_name=name, owner=owner)

        self.local_dir = None
        self.remote_dir = None

        self.open = False
        self.closed = False

        self.depends_on = []    # list of tuples (processing_name, uuid) of bundles on which this bundle depends
        self.data = None        # The df, array, dictionary the user wants to store
Example #7
    def create_output_dir(self, dirname):
        """
        Disdat Pipe API Function

        Given basename directory name, return a fully qualified path whose prefix is the
        local output directory for this bundle in the current context.  This call creates the
        output directory as well.

        Args:
            dirname (str): The name of the output directory, i.e., "models"

        Returns:
            output_dir (str):  Fully qualified path of a directory whose prefix is the bundle's local output directory.

        """

        prefix_dir = self.get_output_dir()
        fqp = os.path.join(prefix_dir, dirname)
        try:
            os.makedirs(fqp)
        except IOError as why:
            _logger.error(
                "Creating directory in bundle directory failed: {}".format(why))

        return fqp
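A hedged sketch of how a pipe might call this from its run method; the class, directory, and file names are placeholders:

import os
from disdat.pipe import PipeTask  # assumed import path

class TrainPipe(PipeTask):  # hypothetical pipe
    def pipe_run(self):
        model_dir = self.create_output_dir("models")      # <bundle output dir>/models
        model_path = os.path.join(model_dir, "model.pkl")
        with open(model_path, "wb") as f:
            f.write(b"...")                               # e.g., a serialized model
        return model_path                                 # returned file paths become bundle links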
Example #8
    def open(self, force_uuid=None):
        """ Management operations to open bundle for writing.
        At this time all of the open operations, namely creating the managed path
        occur in the default constructor or in the class fill_from_hfr constructor.

        Args:
            force_uuid (str): DEPRECATING - do not use.  Force the bundle to open with a specific UUID.

        Returns:
            Bundle
        """
        if self._closed:
            _logger.error("Bundle is closed -- unable to re-open.")
            assert False
        self._local_dir, self.pb.uuid, self._remote_dir = self.data_context.make_managed_path(
            uuid=force_uuid)
        return self
Example #9
def ls_s3_url_objects(s3_url):
    """
    Return aws boto3 ObjectSummary's

    Note: There is no current way in boto3 to do globs -- you filter on the client side.

    Returns:
        list(ObjectSummary): list of ObjectSummary's under this path
    """
    result = []

    if s3_url[-1] != '/':
        s3_url += '/'

    bucket, s3_path = split_s3_url(s3_url)

    s3 = b3.resource('s3')
    try:
        s3_b = s3.Bucket(bucket)
        for i in s3_b.objects.filter(Prefix=s3_path, MaxKeys=1024):
            result.append(i)
        if len(result) == 1024:
            _logger.warning(
                "ls_s3_url_objects: hit MaxKeys 1024 limit in result set.")
    except Exception as e:
        _logger.error(
            "ls_s3_url_objects: failed with exception {}".format(e))
        raise

    return result
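Because boto3 offers no server-side glob, callers typically filter the returned ObjectSummary keys on the client. A small hedged sketch (the URL and pattern are placeholders):

import fnmatch

def ls_s3_url_glob(s3_url, pattern):
    # Client-side glob over the keys of the ObjectSummary results.
    return [obj for obj in ls_s3_url_objects(s3_url)
            if fnmatch.fnmatch(obj.key, pattern)]

# csv_objects = ls_s3_url_glob('s3://my-bucket/data', '*.csv')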
Example #10
    def make_directory(self, dir_name):
        """ Returns path `<disdat-managed-directory>/<dir_name>`.  This is used if you need
        to hand a process an output directory and you do not have control of what it writes in
        that directory.

        Add this path as you would add file paths to your output bundle.  Disdat will incorporate
        all the data found in this directory into the bundle.

        See Pipe.create_output_dir()

        Arguments:
            dir_name (str): Either a FQP (prefix is the bundle path) or a basedir of a directory to appear in the bundle.  Neither should end in /

        Returns:
            str: A directory path managed by disdat
        """
        assert (self.open and not self.closed)

        # remove the prefix iff it exists
        dst_base_path = dir_name.replace(self.local_dir, '')

        # if the user erroneously passes in the directory of the bundle, return same
        if dst_base_path == '':
            return self.local_dir

        fqp = os.path.join(self.local_dir, dst_base_path.lstrip('/'))
        try:
            os.makedirs(fqp)
        except OSError as why:
            if not why.errno == errno.EEXIST:
                _logger.error(
                    "Creating directory in bundle directory failed errno {}".
                    format(why.strerror))
                raise
            # else directory exists is OK and fall through
        except IOError as why:
            _logger.error(
                "Creating directory in bundle directory failed {}".format(why))
            raise

        return fqp
Example #11
    def get_directory(self, dir_name):
        """ Returns path `<disdat-managed-directory>/<dir_name>`.  This gives the user a local
        output directory into which to write files.  This is useful when a user needs to give an external tool, such
        as Spark or Tensorflow, a directory to place output files.   After this call, the directory
        will exist in the local context.

        It is the user's responsibility to add individual file links to the bundle.

        It is an error to add a directory as a file link.

        Arguments:
            dir_name (str): A basedir of a directory to appear in the local bundle.

        Returns:
            str: A directory path managed by disdat
        """
        self._check_open()

        if dir_name[-1] == '/':
            dir_name = dir_name[:-1]

        # if the user erroneously passes in the directory of the bundle, return same
        if dir_name == self._local_dir:
            return self._local_dir

        fqp = os.path.join(self._local_dir, dir_name.lstrip('/'))

        try:
            os.makedirs(fqp)
        except OSError as why:
            if not why.errno == errno.EEXIST:
                _logger.error(
                    "Creating directory in bundle directory failed errno {}".
                    format(why.strerror))
                raise
        except IOError as why:
            _logger.error(
                "Creating directory in bundle directory failed {}".format(why))
            raise

        return fqp
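A hedged sketch of handing an external tool a managed directory, using the context-manager form of Bundle shown in these examples; the context name, bundle name, and subdirectory are placeholders, and `import disdat.api as api` / `import os` are assumed:

with api.Bundle('examples', name='spark_output') as b:
    out_dir = b.get_directory('parquet')   # <bundle dir>/parquet now exists locally
    # ... have the external tool (e.g., Spark) write its files into out_dir ...
    # It remains the user's responsibility to add the individual file links:
    b.add_data([os.path.join(out_dir, f) for f in os.listdir(out_dir)])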
Example #12
    def abandon(self):
        """ Remove the on-disk state of the bundle if it is abandoned before it is closed.
        Bundles that were left un-closed have their directories harvested.
        NOTE: the user has the responsibility to make sure the bundle is not shared across
        threads or processes and that they don't remove a directory out from under another
        thread of control.  E.g., you cannot place this code in __del__ and then _check_closed() b/c
        a forked child process might have closed their copy while the parent deletes theirs.
        """
        self._check_open()
        _logger.debug(
            f"Disdat api abandon bundle obj [{id(self)}] process[{os.getpid()}] uuid[{self.uuid}]"
        )
        try:
            shutil.rmtree(self._local_dir, ignore_errors=True)
            os.rmdir(self._local_dir)
            # TODO: if people create s3 files, s3 file targets, inside of an s3 context,
            # TODO: then we will have to clean those up as well.
        except IOError as why:
            _logger.error(
                "Removal of bundle directory {} failed with error {}. Continuing removal..."
                .format(self._local_dir, why))
Example #13
    def add_bundle_meta_files(pipe_task):
        """
        Given a pipe or driver task, create the bundle meta output files and Luigi
        output targets for them.

        Use the pipe_task (or driver task) to get the name of the bundle.
        Use the name of the bundle to look up the output path in the pipe cache in the
        PipeFS class object.

        Create an hframe.  The individual frame records have to be written out beforehand.

        Args:
            pipe_task: The pipe task that will use these outputs

        Returns:
            dict: { PipeBase.HFRAME: luigi.LocalTarget for the hyperframe meta file }

        """
        pce = DisdatFS.get_path_cache(pipe_task)

        if pce is None:
            # This can happen when the pipe has been created with non-deterministic parameters
            _logger.error(
                "add_bundle_meta_files: could not find pce for task {}".format(
                    pipe_task.pipe_id()))
            _logger.error(
                "It is possible one of your tasks is parameterized in a non-deterministic fashion."
            )
            raise Exception(
                "add_bundle_meta_files: Unable to find pce for task {}".format(
                    pipe_task.pipe_id()))

        hframe = {
            PipeBase.HFRAME: luigi.LocalTarget(
                os.path.join(pce.path, HyperFrameRecord.make_filename(pce.uuid)))
        }

        return hframe
Example #14
    def put_path_cache(pipe_instance,
                       bundle,
                       uuid,
                       path,
                       rerun,
                       overwrite=False):
        """  The path cache is used to associate a pipe instance with its output path and whether
        we have decided to re-run this pipe.   If rerun is True, then there should be no
        output at this path, and it should eventually be added as a new version of this bundle.

        Args:
            pipe_instance:     instance of a pipe
            bundle (disdat.api.Bundle):  The bundle to hold the output data and metadata
            uuid:              specific uuid of the output path
            path:              where to write the bundle
            rerun:             whether or not we are re-running or re-using
            overwrite:         overwrite existing entry (if exists)

        Returns:
            pce or raise KeyError
        """
        pipe_name = pipe_instance.processing_id()
        pce = PathCacheEntry(pipe_instance, bundle, uuid, path, rerun)
        if pipe_name not in PathCache.task_path_cache:
            PathCache.task_path_cache[pipe_name] = pce
        else:
            if pce == PathCache.task_path_cache[pipe_name]:  # The tuples are identical
                _logger.error(
                    "path_cache dup key: pipe {} already bound to same PCE {} "
                    .format(pipe_name, pce))
            else:
                if overwrite:
                    PathCache.task_path_cache[pipe_name] = pce
                else:
                    raise KeyError(
                        "path_cache dup key: pipe {} bound to pce {} but trying to re-assign to {}"
                        .format(pipe_name,
                                PathCache.task_path_cache[pipe_name], pce))
        return pce
Example #15
def _run_aws_batch(arglist, fq_repository_name, job_name, pipeline_image_name,
                   aws_session_token_duration, vcpus, memory, no_submit,
                   job_role_arn):
    """
    Run job on AWS Batch.   Sends to queue configured in disdat.cfg.
    This assumes that you have already created a cluster that will run the jobs
    that have been assigned to that queue.

    Args:
        arglist:
        fq_repository_name (str): The fully qualified docker repository name
        job_name:
        pipeline_image_name:
        aws_session_token_duration:
        vcpus:
        memory:
        no_submit (bool): default False
        job_role_arn (str): Can be None

    Returns:

    """
    def check_role_arn(job_dict, jra):
        """ Check to see if the job desc dictionary contains the same job_role_arn (jra)
        """

        if jra is None:
            if 'jobRoleArn' not in job_dict['containerProperties']:
                return True
        else:
            if 'jobRoleArn' in job_dict['containerProperties']:
                if job_dict['containerProperties']['jobRoleArn'] == jra:
                    return True
        return False

    disdat_config = DisdatConfig.instance()

    # Get the parameter values required to kick off an AWS Batch job.
    # Every batch job must:
    # 1. Have a name
    # 2. Have a job definition that declares which ECR-hosted Docker
    #    image to use.
    # 3. Have a queue that feeds jobs into a compute cluster.
    # 4. The command to execute inside the Docker image; the command
    #    args are more-or-less the same as the ones used to execute
    #    locally using 'dsdt run'

    # Create a Job Definition and upload it.
    # We create per-user job definitions so multiple users do not clobber each other.
    # In addition, we never re-use a job definition, since the user may update
    # the vcpu or memory requirements and those are stuck in the job definition

    job_definition_name = aws.batch_get_job_definition_name(
        pipeline_image_name)

    if disdat_config.parser.has_option(_MODULE_NAME,
                                       'aws_batch_job_definition'):
        job_definition_name = disdat_config.parser.get(
            _MODULE_NAME, 'aws_batch_job_definition')

    # TODO: Look through all of history to find one that matches?
    # TODO: Delete old jobs here or let user do it?
    job_definition_obj = aws.batch_get_latest_job_definition(
        job_definition_name)

    if (job_definition_obj is not None
            and job_definition_obj['containerProperties']['image']
            == fq_repository_name
            and job_definition_obj['containerProperties']['vcpus'] == vcpus
            and job_definition_obj['containerProperties']['memory'] == memory
            and check_role_arn(job_definition_obj, job_role_arn)):

        job_definition_fqn = aws.batch_extract_job_definition_fqn(
            job_definition_obj)

        _logger.info("Re-using prior AWS Batch run job definition : {}".format(
            job_definition_obj))

    else:
        """ Whether None or doesn't match, make a new one """

        job_definition_obj = aws.batch_register_job_definition(
            job_definition_name,
            fq_repository_name,
            vcpus=vcpus,
            memory=memory,
            job_role_arn=job_role_arn)

        job_definition_fqn = aws.batch_get_job_definition(job_definition_name)

        _logger.info(
            "New AWS Batch run job definition {}".format(job_definition_fqn))

    if no_submit:
        # Return the job description object
        return job_definition_obj

    job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

    container_overrides = {'command': arglist}

    # Through the magic of boto3_session_cache, the client in our script
    # here can get at AWS profiles and temporary AWS tokens created in
    # part from MFA tokens generated through the user's shells; we don't
    # have to write special code of our own to deal with authenticating
    # with AWS.
    client = b3.client('batch', region_name=aws.profile_get_region())
    # A bigger problem might be that the IAM role executing the job on
    # a batch EC2 instance might not have access to the S3 remote. To
    # get around this, allow the user to create some temporary AWS
    # credentials.

    if aws_session_token_duration > 0 and job_role_arn is None:
        sts_client = b3.client('sts')
        try:
            token = sts_client.get_session_token(
                DurationSeconds=aws_session_token_duration)
            credentials = token['Credentials']
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials['AccessKeyId']},
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials['SecretAccessKey']},
                {'name': 'AWS_SESSION_TOKEN', 'value': credentials['SessionToken']}
            ]
        except Exception as e:
            _logger.debug(
                "Unable to generate an STS token, instead trying users default credentials..."
            )
            credentials = b3.session.Session().get_credentials()
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials.access_key},
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials.secret_key},
                {'name': 'AWS_SESSION_TOKEN', 'value': credentials.token}
            ]

    # Use setdefault so this works even when the STS block above did not set 'environment'.
    container_overrides.setdefault('environment', []).append({
        'name': 'DISDAT_CPU_COUNT',
        'value': str(vcpus)
    })

    job = client.submit_job(jobName=job_name,
                            jobDefinition=job_definition_fqn,
                            jobQueue=job_queue,
                            containerOverrides=container_overrides)

    status = job['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        _logger.info(
            'Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'
            .format(job['jobName'], job['jobId'], job_definition_fqn,
                    job_queue))
        return job
    else:
        _logger.error('Job submission failed: HTTP Status {}'.format(status))
        return None
Example #16
    def run(self):
        """

        Call users run function.
        1.) prepare the arguments
        2.) run and gather user result
        3.) interpret and wrap in a HyperFrame

        Returns:
            (`hyperframe.HyperFrame`):

        """

        kwargs = self.prepare_pipe_kwargs(for_run=True)

        pce = self.pfs.get_path_cache(self)

        assert (pce is not None)

        try:
            start = time.time()  #P3 datetime.now().timestamp()
            user_rtn_val = self.pipe_run(**kwargs)
            stop = time.time()  #P3 datetime.now().timestamp()
        except Exception as error:
            """ If user's pipe fails for any reason, remove bundle dir and raise """
            try:
                _logger.error(
                    "User pipe_run encountered exception: {}".format(error))
                PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
            except OSError as ose:
                _logger.error(
                    "User pipe_run encountered error, and error on remove bundle: {}"
                    .format(ose))
            raise

        try:
            presentation, frames = PipeBase.parse_return_val(
                pce.uuid, user_rtn_val, self.data_context)

            hfr = PipeBase.make_hframe(frames,
                                       pce.uuid,
                                       self.bundle_inputs(),
                                       self.pipeline_id(),
                                       self.pipe_id(),
                                       self,
                                       start_ts=start,
                                       stop_ts=stop,
                                       tags={"presentable": "True"},
                                       presentation=presentation)

            # Add Luigi Task parameters -- Only add the class parameters.  These are Disdat special params.
            self.user_tags.update(self._get_subcls_params(self))

            if self.output_tags:
                self.user_tags.update(self.output_tags)

            if isinstance(self.calling_task, DriverTask):
                self.user_tags.update({'root_task': 'True'})

            if self.user_tags:
                hfr.replace_tags(self.user_tags)

            self.data_context.write_hframe(hfr)

            transient = False
            if hfr.get_tag(BUNDLE_TAG_TRANSIENT) is not None:
                transient = True

            if self.incremental_push and not transient:
                self.pfs.commit(None,
                                None,
                                uuid=pce.uuid,
                                data_context=self.data_context)
                self.pfs.push(uuid=pce.uuid, data_context=self.data_context)

        except Exception as error:
            """ If we fail for any reason, remove bundle dir and raise """
            PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
            raise

        return hfr
Example #17
def error(msg, *args, **kwargs):
    _logger.error(msg, *args, **kwargs)
    sys.exit(1)
Example #18
    def __init__(self,
                 local_context,
                 name=None,
                 data=None,
                 processing_name=None,
                 owner=None,
                 tags=None,
                 params=None,
                 dependencies=None,
                 code_method=None,
                 vc_info=None,
                 start_time=0,
                 stop_time=0):
        """ Create a bundle in a local context.

        There are three ways to create bundles:

        1.) Create a bundle with a single call.  Must include a data field!
        b = api.Bundle('examples', name='propensity_model', owner='fred', data='/Users/fred/model.tgz')

        2.) Create a bundle using a context manager.  The initial call requires only a context.
        with api.Bundle('examples') as b:
            b.add_data(file_list)
            b.add_code_ref('mymodule.mymethod')
            b.add_params({'path': path})
            b.add_tags(tags)

        Users can query the bundle object to create output files directly in the
        referred-to context.  They may also add tags, parameters, code and git info, and start/stop times.
        Once the bundle is "closed" via the context manager, it will be written to disk and become immutable.
        Note that one may change anything about an "open" bundle except the context information.

        3.) Open and close manually.
        b = api.Bundle('examples').open()
        b.add_data(file_list)
        b.add_code_ref('mymodule.mymethod')
        b.add_params({'path': path})
        b.add_tags(tags)
        b.close()

        Default name:  If you don't provide a name, Disdat tries to use the basename in `code_ref`.
        Default processing_name:   If you don't provide a processing name, Disdat will use a default that
        takes into consideration your bundle's upstream inputs, parameters, and code reference.

        Args:
            local_context (Union[str, `disdat.data_context.DataContext`): The local context name or context object
            name (str): Human name for this bundle.
            data (union(pandas.DataFrame, tuple, None, list, dict)):  The data this bundle contains.
            processing_name (str):  A name that indicates a bundle was made in an identical fashion.
            owner (str):  The owner of the bundle.  Default getpass.getuser()
            tags (dict):  (str,str) dictionary of arbitrary user tags.
            params (dict(str:str)): Dictionary of parameters that <code_method> used to produce this output.
            dependencies (dict(str:bundle)):  Dictionary of argname: bundle, Bundles used to produce this output.
            code_method (str):  A reference to code that created this bundle. Default None
            vc_info (tuple):  Version control information triple: e.g. tuple(git_repo , git_commit, branch)
            start_time (float):  Start time of the process that produced the bundle. Default time.now()
            stop_time (float):   Stop time of the process that produced the bundle. Default time.now()

        """

        self._fs = _get_fs()

        try:
            if isinstance(local_context, DataContext):
                self.data_context = local_context
            elif isinstance(local_context, str):
                self.data_context = self._fs.get_context(local_context)
                if self.data_context is None:
                    raise Exception(
                        "Unable to create Bundle: no context found with name[{}]"
                        .format(local_context))
            else:
                raise Exception(
                    "Unable to create Bundle: local_context is not str or DataContext"
                )
        except Exception as e:
            _logger.error("Unable to allocate bundle in context: {} ".format(
                local_context, e))
            return

        self._local_dir = None
        self._remote_dir = None
        self._closed = False  # Bundle is closed and immutable
        self._data = None  # The df, array, dictionary the user wants to store

        super(Bundle, self).__init__(
            human_name=name,
            owner=getpass.getuser() if owner is None else owner,
            processing_name=processing_name,
        )

        # Add the fields they have passed in.
        if tags is not None:
            self.add_tags(tags)
        if params is not None:
            self.add_params(params)
        if dependencies is not None:
            self.add_dependencies(dependencies.values(), dependencies.keys())
        if code_method is not None:
            self.add_code_ref(code_method)
        if vc_info is not None:
            self.add_git_info(vc_info)
        self.add_timing(start_time, stop_time)

        # Only close and make immutable if the user also adds the data field
        if data is not None:
            self.open()
            self.add_data(data)
            self.close()
Example #19
def _run(output_bundle='-',
         pipeline_root='',
         pipeline_args='',
         pipe_cls=None,
         backend=None,
         input_tags={},
         output_tags={},
         force=False,
         context=None,
         remote=None,
         no_pull=False,
         no_push=False,
         no_push_int=False,
         vcpus=1,
         memory=2000,
         workers=1,
         no_submit=False,
         job_role_arn=None,
         aws_session_token_duration=0,
         cli=False):
    """Run the dockerized version of a pipeline.

    Note these are named parameters so we avoid bugs related to argument order.

    Args:
        output_bundle (str): The human name of the output bundle
        pipeline_root (str): The path to the setup.py used to create the container
        pipeline_args: Optional arguments to pass to the pipeline class
        pipe_cls: Name of the pipeline class to run
        backend: The batch execution back-end to use (default
            `Backend.Local`)
        input_tags (list(str)): Find bundle with these tags ['key:value',...]
        output_tags (list(str)): Push result bundle with these tags ['key:value',...]
        force (bool): If `True` force recomputation of all upstream pipe requirements (default `False`)
        context (str): <remote context>/<local context> context string
        remote (str): The remote S3 URL.
        no_pull (bool): Do not pull before executing (start in empty local context)
        no_push (bool): Do not push any new bundles to remote (useful for testing locally)
        no_push_int (bool): Do not push new intermediate bundles to remote
        vcpus (int):  Number of AWS vCPUs the container requests
        memory (int):  Amount of memory container requests in MB
        workers (int): Number of Luigi workers to run tasks in DAG
        no_submit (bool): Produce the AWS job config (for AWS Batch), but do not submit the job
        job_role_arn (str):  The AWS role under which the job should execute
        aws_session_token_duration (int): the number of seconds our temporary credentials should last.
        cli (bool): Whether we called run from the API (buffer output) or the CLI

    Returns:
        job_result (json): A json blob that contains information about the run job.  Error with empty dict.  If backend
        is Sagemaker, return TrainingJobArn.   If backend is AWSBatch, return Batch Job description.   If local, return stdout.
    """

    pfs = fs.DisdatFS()

    pipeline_setup_file = os.path.join(pipeline_root, 'setup.py')

    if not common.setup_exists(pipeline_setup_file):
        return None

    output_bundle_uuid = pfs.disdat_uuid()
    if remote is None or context is None:
        remote, context = common.get_run_command_parameters(pfs)

    if remote is None and (not no_push or not no_pull):  # if pulling or pushing, need a remote
        _logger.error(
            "Pushing or pulling bundles with 'run' requires a remote set with `dsdt remote <s3 url>`"
        )
        return

    arglist = common.make_run_command(output_bundle, output_bundle_uuid,
                                      pipe_cls, remote, context, input_tags,
                                      output_tags, force, no_pull, no_push,
                                      no_push_int, workers, pipeline_args)

    if backend == Backend.AWSBatch or backend == Backend.SageMaker:

        pipeline_image_name = common.make_project_image_name(
            pipeline_setup_file)

        job_name = '{}-{}'.format(pipeline_image_name, int(time.time()))

        fq_repository_name = get_fq_docker_repo_name(False,
                                                     pipeline_setup_file)

        if backend == Backend.AWSBatch:

            retval = _run_aws_batch(arglist, fq_repository_name, job_name,
                                    pipeline_image_name,
                                    aws_session_token_duration, vcpus, memory,
                                    no_submit, job_role_arn)
        else:

            fq_repository_name = get_fq_docker_repo_name(True, pipeline_root)

            retval = _run_aws_sagemaker(arglist, fq_repository_name, job_name)

    elif backend == Backend.Local or backend == Backend.LocalSageMaker:
        retval = _run_local(cli, pipeline_setup_file, arglist, backend)

    else:
        raise ValueError(
            'Got unrecognized job backend \'{}\': Expected {}'.format(
                backend, Backend.options()))

    return retval
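A hedged sketch of invoking this entry point for a local containerized run; the paths, class name, and context are placeholders and assume the pipeline image has already been built:

result = _run(output_bundle='model_output',
              pipeline_root='/home/me/myproject',        # directory containing setup.py
              pipe_cls='mypipelines.train.TrainModel',   # hypothetical pipeline class
              context='myremote/mylocal',
              remote='s3://my-bucket/disdat',
              backend=Backend.Local,
              workers=2)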
Example #20
    def run(self):
        """
        Call users run function.
        1.) prepare the arguments
        2.) run and gather user result
        3.) interpret and wrap in a HyperFrame

        Returns:
            None
        """
        kwargs = self.prepare_pipe_kwargs(for_run=True)
        pce = PathCache.get_path_cache(self)
        assert (pce is not None)
        """ NOTE: If a user changes a task param in run(), and that param parameterizes a dependency in requires(), 
        then running requires() post run() will give different tasks.  To be safe we record the inputs before run() 
        """
        cached_bundle_inputs = self.bundle_inputs()

        try:
            start = time.time()  # P3 datetime.now().timestamp()
            user_rtn_val = self.pipe_run(**kwargs)
            stop = time.time()  # P3 datetime.now().timestamp()
        except Exception as error:
            """ If user's pipe fails for any reason, remove bundle dir and raise """
            try:
                _logger.error(
                    "User pipe_run encountered exception: {}".format(error))
                pce.bundle.abandon()
            except OSError as ose:
                _logger.error(
                    "User pipe_run encountered error, and error on remove bundle: {}"
                    .format(ose))
            raise

        try:
            # Add any output tags to the user tag dict
            if self.output_tags:
                self.user_tags.update(self.output_tags)

            # If this is the root_task, identify it as so in the tag dict
            if isinstance(self.calling_task, DriverTask):
                self.user_tags.update({'root_task': 'True'})
            """ if we have a pce, we have a new bundle that we need to add info to and close """
            pce.bundle.add_data(user_rtn_val)

            pce.bundle.add_timing(start, stop)

            pce.bundle.add_dependencies(cached_bundle_inputs.values(),
                                        cached_bundle_inputs.keys())

            pce.bundle.name = self.human_id()

            pce.bundle.processing_name = self.processing_id()

            pce.bundle.add_params(self._get_subcls_params())

            pce.bundle.add_tags(self.user_tags)

            pce.bundle.add_code_ref('{}.{}'.format(self.__class__.__module__,
                                                   self.__class__.__name__))

            pipeline_path = os.path.dirname(
                sys.modules[self.__class__.__module__].__file__)
            cv = DisdatFS.get_pipe_version(pipeline_path)
            pce.bundle.add_git_info(cv.url, cv.hash, cv.branch)

            pce.bundle.close()  # Write out the bundle
            """ Incrementally push the completed bundle """
            if self.incremental_push and (BUNDLE_TAG_TRANSIENT not in pce.bundle.tags):
                self.pfs.commit(None,
                                None,
                                uuid=pce.bundle.uuid,
                                data_context=self.data_context)
                self.pfs.push(uuid=pce.uuid, data_context=self.data_context)

        except Exception as error:
            """ If we fail for any reason, remove bundle dir and raise """
            pce.bundle.abandon()
            raise

        return None
Example #21
    def add_external_dependency(self,
                                param_name,
                                task_class,
                                params,
                                human_name=None,
                                uuid=None):
        """
        Disdat Pipe API Function

        Add an external task and its parameters to our requirements.   What this means is that
        there is no run function and, in that case, Luigi will ignore the results of task.deps() (which calls
        flatten(self.requires())).  And what that means is that this requirement can only be satisfied
        by the bundle actually existing.

        Create ersatz ExternalDepTask parameterized by uuid and processing_name
        Note: it is possible to use class/params when searching by class, params, but this makes all external
        dependencies look the same in the code.  Win.

        NOTE: if you add an external dependency by name, it is possible that someone adds a bundle during
        execution and that your requires function is no longer deterministic.   You must add caching to your
        requires function to handle this scenario.

        Example with class variable bundle_uuid:
        ``
        if self.bundle_uuid is None:
            bundle = self.add_external_dependency('_', MyTaskClass, {}, human_name='some_result')
            self.bundle_uuid = bundle.uuid
        else:
            bundle = self.add_external_dependency('_', MyTaskClass, {}, uuid=self.bundle_uuid)
        ``

        TODO: Consider pushing caching into this layer.

        Args:
            param_name (str): The parameter name this bundle assumes when passed to Pipe.run
            task_class (object):  Class name of upstream task if looking for external bundle by processing_id.
            params (dict):  Dictionary of parameters if looking for external bundle by processing_id.
            human_name (str): Resolve dependency by human_name, return the latest bundle with that human_name.  Trumps task_class and params.
            uuid (str): Resolve dependency by explicit UUID, trumps task_class, params and human_name.

        Returns:
            `api.Bundle` or None

        """
        import disdat.api as api

        if task_class is not None and not isinstance(params, dict):
            error = "add_external_dependency requires parameter dictionary"
            raise Exception(error)

        assert (param_name not in self.add_deps)

        try:
            if uuid is not None:
                hfr = self.pfs.get_hframe_by_uuid(
                    uuid, data_context=self.data_context)
            elif human_name is not None:
                hfr = self.pfs.get_latest_hframe(
                    human_name, data_context=self.data_context)
            else:
                # we propagate the same inputs and the same output dir for every upstream task!
                params.update({
                    'user_arg_name': param_name,
                    'data_context': self.data_context
                })
                p = task_class(**params)
                hfr = self.pfs.get_hframe_by_proc(
                    p.processing_id(), data_context=self.data_context)

            if hfr is None:
                error_str = "Disdat can't resolve external bundle from class[{}] params[{}] name[{}] uuid[{}]".format(
                    task_class, params, human_name, uuid)
                raise ExtDepError(error_str)

            bundle = api.Bundle(
                self.data_context.get_local_name()).fill_from_hfr(hfr)

        except ExtDepError as error:  # Swallow and allow Luigi to determine task is not available.
            _logger.error(error_str)
            bundle = None

        except Exception as error:
            _logger.error(error)
            bundle = None

        finally:
            if bundle is None:
                self.add_deps[param_name] = (
                    luigi.task.externalize(ExternalDepTask), {
                        'uuid': 'None',
                        'processing_name': 'None'
                    })
            else:
                self.add_deps[param_name] = (
                    luigi.task.externalize(ExternalDepTask), {
                        'uuid': bundle.uuid,
                        'processing_name': bundle.processing_name
                    })

        return bundle
Example #22
def _run_local(cli, pipeline_setup_file, arglist, backend):
    """
    Run container locally or run sagemaker container locally
    Args:
        cli (bool): Whether we were called from the CLI or API
        pipeline_setup_file (str): The FQ path to the setup.py used to dockerize the pipeline.
        arglist:
        backend:

    Returns:
        output (str): Returns None if there is a failure

    """

    on_macos = False
    if platform == "darwin":
        on_macos = True

    client = docker.from_env()

    environment = {}
    if 'AWS_PROFILE' in os.environ:
        environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']

    environment[common.LOCAL_EXECUTION] = 'True'

    # Todo: Local runs do not yet set resource limits, but when they do, we'll have to set this
    #environment['DISDAT_CPU_COUNT'] = vcpus

    volumes = {}
    aws_config_dir = os.getenv('AWS_CONFIG_DIR',
                               os.path.join(os.environ['HOME'], '.aws'))
    if aws_config_dir is not None and os.path.exists(aws_config_dir):
        volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}

    local_disdat_meta_dir = DisdatConfig.instance().get_meta_dir()
    volumes[local_disdat_meta_dir] = {'bind': '/root/.disdat', 'mode': 'rw'}

    try:
        if backend == Backend.LocalSageMaker:
            pipeline_image_name = common.make_sagemaker_project_image_name(
                pipeline_setup_file)
            tempdir = tempfile.mkdtemp()
            with open(os.path.join(tempdir, 'hyperparameters.json'),
                      'w') as of:
                json.dump(_sagemaker_hyperparameters_from_arglist(arglist), of)
                args = ['train']  # rewrite to just 'train'
                # On mac OS, tempdir returns /var, but is actually /private/var
                # Add /private since it that dir is shared (and not /var) with Docker.
                if on_macos:
                    localdir = os.path.join('/private', tempdir[1:])
                else:
                    localdir = tempdir
                volumes[localdir] = {
                    'bind': '/opt/ml/input/config/',
                    'mode': 'rw'
                }
                _logger.info("VOLUMES: {}".format(volumes))
        else:
            # Add the actual command to the arglist (for non-sagemaker runs)
            arglist = [ENTRYPOINT_BIN] + arglist
            pipeline_image_name = common.make_project_image_name(
                pipeline_setup_file)

        _logger.debug('Running image {} with arguments {}'.format(
            pipeline_image_name, arglist))

        stdout = client.containers.run(pipeline_image_name,
                                       arglist,
                                       detach=False,
                                       environment=environment,
                                       init=True,
                                       stderr=True,
                                       volumes=volumes)
        stdout = six.ensure_str(stdout)
        if cli: print(stdout)
        return stdout
    except docker.errors.ContainerError as ce:
        _logger.error(
            "Internal error running image {}".format(pipeline_image_name))
        _logger.error("Error: {}".format(six.ensure_str(ce.stderr)))
        return str(ce)
    except docker.errors.ImageNotFound:
        _logger.error(
            "Unable to find the docker image {}".format(pipeline_image_name))
        return None
Example #23
def assert_or_log(cli, msg):
    if cli:
        _logger.error(msg)
    else:
        assert False, msg
Example #24
    def run(self):
        """

        Call users run function.
        1.) prepare the arguments
        2.) run and gather user result
        3.) interpret and wrap in a HyperFrame

        Returns:
            (`hyperframe.HyperFrame`):

        """

        kwargs = self.prepare_pipe_kwargs(for_run=True)

        pce = self.pfs.get_path_cache(self)

        assert (pce is not None)
        """ NOTE: If a user changes a task param in run(), and that param parameterizes a dependency in requires(), 
        then running requires() post run() will give different tasks.  To be safe we record the inputs before run() 
        """
        cached_bundle_inputs = self.bundle_inputs()

        try:
            start = time.time()  # P3 datetime.now().timestamp()
            user_rtn_val = self.pipe_run(**kwargs)
            stop = time.time()  # P3 datetime.now().timestamp()
        except Exception as error:
            """ If user's pipe fails for any reason, remove bundle dir and raise """
            try:
                _logger.error(
                    "User pipe_run encountered exception: {}".format(error))
                PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
            except OSError as ose:
                _logger.error(
                    "User pipe_run encountered error, and error on remove bundle: {}"
                    .format(ose))
            raise

        try:
            presentation, frames = PipeBase.parse_return_val(
                pce.uuid, user_rtn_val, self.data_context)

            hfr = PipeBase.make_hframe(frames,
                                       pce.uuid,
                                       cached_bundle_inputs,
                                       self.pipeline_id(),
                                       self.pipe_id(),
                                       self,
                                       start_ts=start,
                                       stop_ts=stop,
                                       tags={"presentable": "True"},
                                       presentation=presentation)

            # Add any output tags to the user tag dict
            if self.output_tags:
                self.user_tags.update(self.output_tags)

            # If this is the root_task, identify it as so in the tag dict
            if isinstance(self.calling_task, DriverTask):
                self.user_tags.update({'root_task': 'True'})

            # Lastly add any parameters associated with this class as tags.
            # They are differentiated by a special prefix in the key
            self.user_tags.update(self._get_subcls_params())

            # Overwrite the hyperframe tags with the complete set of tags
            hfr.replace_tags(self.user_tags)

            self.data_context.write_hframe(hfr)

            transient = False
            if hfr.get_tag(BUNDLE_TAG_TRANSIENT) is not None:
                transient = True

            if self.incremental_push and not transient:
                self.pfs.commit(None,
                                None,
                                uuid=pce.uuid,
                                data_context=self.data_context)
                self.pfs.push(uuid=pce.uuid, data_context=self.data_context)

        except Exception as error:
            """ If we fail for any reason, remove bundle dir and raise """
            PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
            raise

        return hfr