Example #1
import copy
import os
import urllib.parse
from hashlib import sha1

from databricks_cli.dbfs.api import DbfsApi
from databricks_cli.dbfs.dbfs_path import DbfsPath
from databricks_cli.sdk import DeltaPipelinesService

# LibraryObject, BUFFER_SIZE and base_pipelines_dir are defined elsewhere in this
# module (databricks_cli.pipelines.api) and are assumed to be in scope here.


class PipelinesApi(object):
    def __init__(self, api_client):
        self.client = DeltaPipelinesService(api_client)
        self.dbfs_client = DbfsApi(api_client)

    def create(self, spec, spec_dir, allow_duplicate_names, headers=None):
        data = self._upload_libraries_and_update_spec(spec, spec_dir)
        data['allow_duplicate_names'] = allow_duplicate_names
        return self.client.client.perform_query('POST', '/pipelines', data=data,
                                                headers=headers)

    def deploy(self, spec, spec_dir, allow_duplicate_names, headers=None):
        data = self._upload_libraries_and_update_spec(spec, spec_dir)
        data['allow_duplicate_names'] = allow_duplicate_names
        pipeline_id = data['id']
        self.client.client.perform_query('PUT', '/pipelines/{}'.format(pipeline_id), data=data,
                                         headers=headers)

    def delete(self, pipeline_id, headers=None):
        self.client.delete(pipeline_id, headers)

    def get(self, pipeline_id, headers=None):
        return self.client.get(pipeline_id, headers)

    def list(self, headers=None):
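        # Fetch all pipelines by paging through GET /pipelines via next_page_token.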
        def call(page_token=None, max_results=None, order_by=None):
            _data = {}
            if page_token:
                _data["page_token"] = page_token
            if max_results:
                _data["max_results"] = max_results
            if order_by:
                _data["order_by"] = order_by

            return self.client.client.perform_query(
                'GET', '/pipelines', data=_data, headers=headers)

        response = call()
        pipelines = response.get("statuses", [])

        while "next_page_token" in response:
            response = call(page_token=response["next_page_token"])
            pipelines.extend(response.get("statuses", []))
        return pipelines

    def start_update(self, pipeline_id, full_refresh=None, headers=None):
        return self.client.start_update(pipeline_id, full_refresh=full_refresh, headers=headers)

    def stop(self, pipeline_id, headers=None):
        self.client.stop(pipeline_id, headers)

    def _upload_libraries_and_update_spec(self, spec, spec_dir):
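        # Deep-copy the spec so the caller's dict is not mutated, then replace local
        # library paths with their uploaded DBFS locations.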
        spec = copy.deepcopy(spec)
        lib_objects = LibraryObject.from_json(spec.get('libraries', []))
        local_lib_objects, external_lib_objects = self._identify_local_libraries(lib_objects)

        spec['libraries'] = LibraryObject.to_json(
            external_lib_objects + self._upload_local_libraries(spec_dir, local_lib_objects))
        return spec

    @staticmethod
    def _identify_local_libraries(lib_objects):
        """
        Partitions the given set of libraries into local libraries i.e. libraries that should
        be uploaded to DBFS, and non-local libraries. Jars or whls with a file scheme or no scheme
        at all are (currently) considered local libraries.

        :param lib_objects: List[LibraryObject]
        :return: List[List[LibraryObject], List[LibraryObject]] ([Local, External])
        """
        local_lib_objects, external_lib_objects = [], []
        for lib_object in lib_objects:
            if lib_object.lib_type not in ['jar', 'whl']:
                external_lib_objects.append(lib_object)
                continue

            parsed_uri = urllib.parse.urlparse(lib_object.path)
            if parsed_uri.scheme == '':
                local_lib_objects.append(lib_object)
            elif parsed_uri.scheme.lower() == 'file':
                # a valid file URI has exactly one or three leading slashes
                # (file:/path or file:///path); file://host/path is rejected
                if parsed_uri.path.startswith('//') or parsed_uri.netloc != '':
                    raise RuntimeError('invalid file uri scheme, '
                                       'did you mean to use file:/ or file:///')
                local_lib_objects.append(LibraryObject(lib_object.lib_type, parsed_uri.path))
            else:
                external_lib_objects.append(lib_object)
        return local_lib_objects, external_lib_objects

    def _upload_local_libraries(self, spec_dir, local_lib_objects):
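        # Resolve local paths relative to the spec directory, map each file to its
        # content-addressed DBFS path, and upload only files not already present.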
        relative_local_lib_objects = [LibraryObject(llo.lib_type, os.path.join(spec_dir, llo.path))
                                      for llo in local_lib_objects]
        remote_lib_objects = [LibraryObject(rllo.lib_type, self._get_hashed_path(rllo.path))
                              for rllo in relative_local_lib_objects]
        transformed_remote_lib_objects = [LibraryObject(rlo.lib_type, DbfsPath(rlo.path))
                                          for rlo in remote_lib_objects]
        upload_files = [llo_tuple for llo_tuple in
                        zip(relative_local_lib_objects, transformed_remote_lib_objects)
                        if not self.dbfs_client.file_exists(llo_tuple[1].path)]

        for llo, rlo in upload_files:
            self.dbfs_client.put_file(llo.path, rlo.path, False)

        return remote_lib_objects

    @staticmethod
    def _get_hashed_path(path):
        """
        Finds the corresponding dbfs file path for the file located at the supplied path by
        calculating its hash using SHA1.
        :param path: Local File Path
        :return: Remote Path (pipeline_base_dir + file_hash (dot) file_extension)
        """
        hash_buffer = sha1()
        with open(path, 'rb') as f:
            while True:
                data = f.read(BUFFER_SIZE)
                if not data:
                    break
                hash_buffer.update(data)

        file_hash = hash_buffer.hexdigest()
        # splitext includes the period in the extension
        extension = os.path.splitext(path)[1][1:]
        if extension == 'whl':
            # Wheel file names must follow the format described in PEP 427, so keep the
            # original file name and place the wheel in a directory named after the
            # content hash (os.path.basename returns the file name with its extension).
            wheel_name = os.path.basename(path)
            path = '{}/{}/{}'.format(base_pipelines_dir, file_hash, wheel_name)
        else:
            path = '{}/{}.{}'.format(base_pipelines_dir, file_hash, extension)
        return path
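
A minimal usage sketch for the class above (not part of the original source), assuming an
authenticated databricks-cli ApiClient; the workspace URL, token, spec contents and file
names are hypothetical:

from databricks_cli.sdk.api_client import ApiClient

client = ApiClient(host='https://<workspace-url>', token='<token>')  # hypothetical
pipelines_api = PipelinesApi(client)

# A pared-down spec; 'libraries' may mix local files and external URIs.
spec = {
    'name': 'demo-pipeline',
    'libraries': [{'jar': 'dist/etl.jar'}, {'jar': 's3://bucket/dep.jar'}],
}
created = pipelines_api.create(spec, spec_dir='.', allow_duplicate_names=False)

# deploy() expects the spec to carry the pipeline id; the 'pipeline_id' response
# key is an assumption about the service's create response.
spec['id'] = created['pipeline_id']
pipelines_api.deploy(spec, spec_dir='.', allow_duplicate_names=False)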
Example #2
import os
import urllib.parse
from hashlib import sha1

from databricks_cli.dbfs.api import DbfsApi
from databricks_cli.dbfs.dbfs_path import DbfsPath
from databricks_cli.sdk import DeltaPipelinesService

# LibraryObject, BUFFER_SIZE, base_pipelines_dir and supported_lib_types are defined
# elsewhere in this module and are assumed to be in scope here.


class PipelinesApi(object):
    def __init__(self, api_client):
        self.client = DeltaPipelinesService(api_client)
        self.dbfs_client = DbfsApi(api_client)

    def deploy(self, spec, headers=None):
        lib_objects = LibraryObject.from_json(spec.get('libraries', []))
        local_lib_objects, external_lib_objects = \
            self._identify_local_libraries(lib_objects)

        spec['libraries'] = LibraryObject.to_json(
            external_lib_objects +
            self._upload_local_libraries(local_lib_objects))
        pipeline_id = spec['id']
        self.client.client.perform_query('PUT',
                                         '/pipelines/{}'.format(pipeline_id),
                                         data=spec,
                                         headers=headers)

    def delete(self, pipeline_id, headers=None):
        self.client.delete(pipeline_id, headers)

    def get(self, pipeline_id, headers=None):
        return self.client.get(pipeline_id, headers)

    def reset(self, pipeline_id, headers=None):
        self.client.reset(pipeline_id, headers)

    @staticmethod
    def _identify_local_libraries(lib_objects):
        """
        Partitions the given set of libraries into local and those already present in dbfs/s3 etc.
        Local libraries are (currently) jar files with a file scheme or no scheme at all.
        All other libraries should be present in a supported external source.
        :param lib_objects: List[LibraryObject]
        :return: List[List[LibraryObject], List[LibraryObject]] ([Local, External])
        """
        local_lib_objects, external_lib_objects = [], []
        for lib_object in lib_objects:
            if lib_object.lib_type == 'maven':
                external_lib_objects.append(lib_object)
                continue
            parsed_uri = urllib.parse.urlparse(lib_object.path)
            if lib_object.lib_type in supported_lib_types and parsed_uri.scheme == '':
                local_lib_objects.append(lib_object)
            elif (lib_object.lib_type in supported_lib_types
                  and parsed_uri.scheme.lower() == 'file'):
                # a valid file URI has exactly one or three leading slashes
                # (file:/path or file:///path); file://host/path is rejected
                if parsed_uri.path.startswith('//') or parsed_uri.netloc != '':
                    raise RuntimeError(
                        'invalid file uri scheme, '
                        'did you mean to use file:/ or file:///')
                local_lib_objects.append(
                    LibraryObject(lib_object.lib_type, parsed_uri.path))
            else:
                external_lib_objects.append(lib_object)
        return local_lib_objects, external_lib_objects

    def _upload_local_libraries(self, local_lib_objects):
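        # Map each local file to its content-addressed DBFS path and upload only the
        # files that are not already present remotely.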
        remote_lib_objects = [
            LibraryObject(llo.lib_type, self._get_hashed_path(llo.path))
            for llo in local_lib_objects
        ]

        transformed_remote_lib_objects = [
            LibraryObject(rlo.lib_type, DbfsPath(rlo.path))
            for rlo in remote_lib_objects
        ]
        upload_files = [
            llo_tuple for llo_tuple in zip(local_lib_objects,
                                           transformed_remote_lib_objects)
            if not self.dbfs_client.file_exists(llo_tuple[1].path)
        ]

        for llo, rlo in upload_files:
            self.dbfs_client.put_file(llo.path, rlo.path, False)

        return remote_lib_objects

    @staticmethod
    def _get_hashed_path(path):
        """
        Finds the corresponding dbfs file path for the file located at the supplied path by
        calculating its hash using SHA1.
        :param path: Local File Path
        :return: Remote Path (pipeline_base_dir + file_hash (dot) file_extension)
        """
        hash_buffer = sha1()
        with open(path, 'rb') as f:
            while True:
                data = f.read(BUFFER_SIZE)
                if not data:
                    break
                hash_buffer.update(data)

        file_hash = hash_buffer.hexdigest()
        # splitext includes the period in the extension
        extension = os.path.splitext(path)[1][1:]
        if extension == 'whl':
            # Wheel file names must follow the format described in PEP 427, so keep the
            # original file name and place the wheel in a directory named after the
            # content hash (os.path.basename returns the file name with its extension).
            wheel_name = os.path.basename(path)
            path = '{}/{}/{}'.format(base_pipelines_dir, file_hash, wheel_name)
        else:
            path = '{}/{}.{}'.format(base_pipelines_dir, file_hash, extension)
        return path
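
A minimal usage sketch for this older variant (not part of the original source), again with
a hypothetical workspace URL, token and spec. Here deploy() takes only the spec, reads the
pipeline id from it, and resolves local library paths relative to the working directory:

from databricks_cli.sdk.api_client import ApiClient

client = ApiClient(host='https://<workspace-url>', token='<token>')  # hypothetical
pipelines_api = PipelinesApi(client)

spec = {
    'id': '1234-abcd',  # hypothetical pipeline id; deploy() reads it from the spec
    'name': 'demo-pipeline',
    'libraries': [{'jar': 'dist/etl.jar'}],
}
pipelines_api.deploy(spec)       # upload local jars, then PUT /pipelines/{id}
pipelines_api.reset(spec['id'])  # reset so data is reprocessed (assumption from the name)
print(pipelines_api.get(spec['id']))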