def ecr_create_fq_respository_name(repository_name, policy_resource_package=None, policy_resource_name=None):
    """ Create (or fetch, if it already exists) an ECR repository and return its URI. """
    ecr_client = b3.client('ecr', region_name=profile_get_region())

    # Create or fetch the repository in AWS (to store the image)
    try:
        response = ecr_client.create_repository(
            repositoryName=repository_name
        )
        repository_metadata = response['repository']
        # Set the policy on the repository
        if policy_resource_package is not None and policy_resource_name is not None:
            policy = pkg_resources.resource_string(policy_resource_package.__name__, policy_resource_name)
            _ = ecr_client.set_repository_policy(
                registryId=repository_metadata['registryId'],
                repositoryName=repository_name,
                policyText=policy,
                force=True
            )
    except ClientError as e:
        if e.response['Error']['Code'] == 'RepositoryAlreadyExistsException':
            response = ecr_client.describe_repositories(
                repositoryNames=[repository_name]
            )
            repository_metadata = response['repositories'][0]
        elif e.response['Error']['Code'] == 'AccessDeniedException':
            # Note: if creation was denied and the repository was never described,
            # repository_metadata is unbound at the return below.
            _logger.warn("Error [AccessDeniedException] when creating repo {}, trying to continue...".format(repository_name))
        else:
            raise e
    return repository_metadata['repositoryUri']
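# A minimal usage sketch for ecr_create_fq_respository_name. The repository
# name ("disdat/example-pipeline") and the policy package/file shown in the
# comments are hypothetical placeholders, not values taken from this module;
# AWS credentials are assumed to be configured for boto3.
def _example_ecr_create_repository():
    """Illustrative only: create (or fetch) an ECR repo and print its URI."""
    repo_uri = ecr_create_fq_respository_name('disdat/example-pipeline')
    print("Docker images for this pipeline push to {}".format(repo_uri))
    # With an optional repository policy shipped as package data, assuming a
    # hypothetical `my_policies` package containing `ecr_policy.json`:
    # import my_policies
    # repo_uri = ecr_create_fq_respository_name('disdat/example-pipeline',
    #                                           policy_resource_package=my_policies,
    #                                           policy_resource_name='ecr_policy.json')
    return repo_uri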
def get_s3_key(bucket, key, filename=None):
    """ Download a single S3 object to a local file.

    Args:
        bucket (str): The S3 bucket name
        key (str): The S3 key to download
        filename (str): Optional local destination path; defaults to the basename of the key

    Returns:
        str: The local filename the object was written to
    """
    dl_retry = 3
    s3 = b3.resource('s3')

    if filename is None:
        filename = os.path.basename(key)
    else:
        path = os.path.dirname(filename)
        if not os.path.exists(path):
            try:
                os.makedirs(path)
            except os.error as ose:
                # swallow error -- likely directory already exists from other process
                _logger.debug("aws_s3.get_s3_key: Error code {}".format(os.strerror(ose.errno)))

    while dl_retry > 0:
        try:
            s3.Bucket(bucket).download_file(key, filename)
            dl_retry = -1
        except Exception as e:
            _logger.warn("aws_s3.get_s3_key Retry Count [{}] on download_file raised exception {}".format(dl_retry, e))
            dl_retry -= 1
            if dl_retry <= 0:
                _logger.warn("aws_s3.get_s3_key Fail on downloading file after 3 retries with exception {}".format(e))
                raise

    return filename
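# A minimal usage sketch for get_s3_key. The bucket, key, and local path below
# are hypothetical; AWS credentials are assumed to be configured for boto3.
def _example_get_s3_key():
    """Illustrative only: download one S3 object to a local path."""
    # Download s3://example-bucket/data/file.csv to /tmp/downloads/file.csv,
    # creating /tmp/downloads if needed and retrying the transfer up to 3 times.
    local_path = get_s3_key('example-bucket', 'data/file.csv',
                            filename=os.path.join('/tmp', 'downloads', 'file.csv'))
    return local_path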
def add_tags(self, tags):
    """ Add tags to our set of input tags.

    Args:
        tags (dict): string:string dictionary of tags to add

    Returns:
        self
    """
    if not self.open:
        _logger.warn("Open the bundle to modify tags.")
        return
    if self.closed:
        _logger.warn("Unable to modify tags in a closed bundle.")
        return
    super(Bundle, self).add_tags(tags)
    return self
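# A minimal usage sketch for Bundle.add_tags. The tag values are hypothetical;
# the sketch assumes a Bundle that has been opened and not yet closed, since
# add_tags refuses to modify tags otherwise.
def _example_add_tags(bundle):
    """Illustrative only: attach string:string tags to an open bundle."""
    bundle.add_tags({'stage': 'training', 'owner': 'data-team'})
    return bundle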
def ls_s3_url_objects(s3_url):
    """ Return aws boto3 ObjectSummary's under the given S3 path.

    Note: There is no current way in boto3 to do globs -- you filter on the client side.

    Returns:
        list: boto3 ObjectSummary objects under this path
    """
    result = []

    if s3_url[-1] != '/':
        s3_url += '/'

    bucket, s3_path = split_s3_url(s3_url)

    #if not s3_bucket_exists(bucket):
    #    return result

    if False:
        # Disabled alternative: paginate with the low-level client.
        client = b3.client('s3')
        paginator = client.get_paginator('list_objects_v2')
        # use delimiter to groupby, which means, list things only at this level.
        #page_iterator = paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=s3_path)
        page_iterator = paginator.paginate(Bucket=bucket, Prefix=s3_path)
        for page in page_iterator:
            result += [obj['Key'] for obj in page['Contents']]
    else:
        s3 = b3.resource('s3')
        try:
            s3_b = s3.Bucket(bucket)
            for i in s3_b.objects.filter(Prefix=s3_path, MaxKeys=1024):
                result.append(i)
            if len(result) == 1024:
                _logger.warn("ls_s3_url_objects: hit MaxKeys 1024 limit in result set.")
        except Exception as e:
            _logger.error("ls_s3_url_objects: failed with exception {}".format(e))
            raise

    return result
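# A minimal usage sketch for ls_s3_url_objects. The S3 URL is hypothetical;
# note the function returns boto3 ObjectSummary objects, so the key is read
# from each summary's `.key` attribute.
def _example_ls_s3_url_objects():
    """Illustrative only: list object keys under an S3 prefix."""
    for obj_summary in ls_s3_url_objects('s3://example-bucket/some/prefix/'):
        print(obj_summary.key)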
def _open(self):
    """ Add the bundle to this local context.

    Note, we don't need to checkout this context, we just need the relevant context object.

    Returns:
        Bundle: self
    """
    if self.open:
        _logger.warn("Bundle is already open.")
        return
    elif not self.closed:
        self.local_dir, self.pb.uuid, self.remote_dir = self.data_context.make_managed_path()
        self.open = True
    else:
        _logger.warn("Bundle is closed -- unable to re-open.")
    return self
def apply(output_bundle, pipe_params, pipe_cls, input_tags, output_tags, force,
          output_bundle_uuid=None, central_scheduler=False, workers=1,
          data_context=None, incremental_push=False, incremental_pull=False):
    """ Given an input bundle, run the pipeline on the bundle.

    Note, we first make a copy of all tasks that are parameterized identically to the tasks we will run.
    This is so we can figure out what we will need to re-run.
    This is why we make a single uuid for the output bundle of apply (for the driver).

    Args:
        output_bundle: The new bundle to be created
        pipe_params (str): Luigi Task parameters string
        pipe_cls: String <module.ClassName>
        input_tags (dict): Tags used to find the input bundle
        output_tags (dict): Tags that need to be placed on the output bundle
        force (bool): Whether to force re-computation of this pipe and its dependencies
        output_bundle_uuid (str): Optionally specify exactly the UUID of the output bundle IFF we actually need to produce it
        central_scheduler: Use a centralized Luigi scheduler (default False, i.e., --local-scheduler is used)
        workers: The number of luigi workers to use for this workflow (default 1)
        data_context: Actual context object or None and read current context.
        incremental_push (bool): Whether this job should push tasks as they complete to the remote (if configured)
        incremental_pull (bool): Whether this job should localize bundles as needed from the remote (if configured)

    Returns:
        dict: {'success': bool, 'did_work': bool} -- did_work is True if tasks (beyond the wrapper task) executed;
        None if not run inside a data context.
    """
    _logger.debug("driver {}".format(driver.DriverTask))
    _logger.debug("pipe_cls {}".format(pipe_cls))
    _logger.debug("pipe params: {}".format(pipe_params))
    _logger.debug("force: {}".format(force))
    _logger.debug("input tags: {}".format(input_tags))
    _logger.debug("output tags: {}".format(output_tags))
    _logger.debug("sys.path {}".format(sys.path))
    _logger.debug("central_scheduler {}".format(central_scheduler))
    _logger.debug("workers {}".format(workers))
    _logger.debug("incremental_push {}".format(incremental_push))
    _logger.debug("incremental_pull {}".format(incremental_pull))

    if incremental_push:
        _logger.warn("incremental_push {}".format(incremental_push))

    if incremental_pull:
        _logger.warn("incremental_pull {}".format(incremental_pull))

    pfs = fs.DisdatFS()

    if data_context is None:
        if not pfs.in_context():
            _logger.warning('Not in a data context')
            return None
        data_context = pfs.curr_context

    # Re-execute logic -- make copy of task DAG
    # Creates a cache of {pipe:path_cache_entry} in the pipesFS object.
    # This "task_path_cache" is used throughout execution to find output bundles.
    reexecute_dag = driver.DriverTask(output_bundle, pipe_params,
                                      pipe_cls, input_tags, output_tags, force,
                                      data_context, incremental_push, incremental_pull)

    # Get version information for pipeline
    users_root_task = reexecute_dag.deps()[0]
    pipeline_path = os.path.dirname(sys.modules[users_root_task.__module__].__file__)
    fs.DisdatFS().get_pipe_version(pipeline_path)

    did_work = resolve_workflow_bundles(reexecute_dag, data_context)

    # At this point the path cache should be full of existing or new UUIDs.
    # We are going to replace the final pipe's UUID if the user has passed one in.
    # This happens when we run the docker container.
    # TODO: don't replace if it already exists.
    if output_bundle_uuid is not None:
        users_root_task = reexecute_dag.deps()[0]
        pce = pfs.get_path_cache(users_root_task)
        if pce.rerun:  # if we have to re-run then replace it with our UUID
            # TODO: this is the same code as new_output_hframe, FIX!!!
            dir, uuid, _ = data_context.make_managed_path(output_bundle_uuid)
            fs.DisdatFS.put_path_cache(users_root_task,
                                       uuid,
                                       dir,
                                       pce.rerun,
                                       pce.is_left_edge_task,
                                       overwrite=True)

    success = build([reexecute_dag], local_scheduler=not central_scheduler, workers=workers)

    # After running a pipeline, blow away our path cache and git hash. Needed if we're run twice in the same process.
    fs.DisdatFS().clear_pipe_version()
    fs.DisdatFS().clear_path_cache()

    return {'success': success, 'did_work': did_work}
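# A minimal usage sketch for apply(). The pipeline class path and bundle names
# are hypothetical; it assumes the process is already inside a Disdat data
# context (otherwise apply() logs a warning and returns None).
def _example_apply():
    """Illustrative only: run a pipeline class and report whether tasks ran."""
    result = apply(output_bundle='example.output',
                   pipe_params='{}',                   # Luigi Task parameters string
                   pipe_cls='my_pipelines.Example',    # hypothetical <module.ClassName>
                   input_tags={},
                   output_tags={'run': 'nightly'},
                   force=False)
    if result is not None and result['did_work']:
        print("Pipeline executed tasks; success={}".format(result['success']))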