예제 #1
0
    def load(self,
             profile_name='default',
             metadata_files=None,
             dotenv_path=None):
        """
        Load one metadata profile and make it the active one.

        When no file list is given, the packaged 'schemas/default.yml' is
        combined with the metadata files found under the project root
        directory. When no dotenv path is given, it is looked up under the
        project root directory too; if the resulting path is an existing
        file it is handed to `load_dotenv`.

        The requested profile is picked from the result of `self.inherit`,
        then rendered, validated and formatted. The outcome is stored as a
        YamlDict in `self._profile` and `self._info['active']` is set to the
        profile name.

        :param profile_name: the profile to load (default: 'default')
        :param metadata_files: a list of metadata files to read
        :param dotenv_path: the path of a dotenv file to read
        :return: no explicit return value; the loaded profile is kept on
                 the instance
        """

        # no explicit list: packaged defaults first, then the project files
        if metadata_files is None:
            package_dir = os.path.dirname(os.path.realpath(__file__))
            metadata_files = []
            metadata_files.extend(abspath(['schemas/default.yml'], package_dir))

            project_files = files.get_metadata_files(paths.rootdir())
            metadata_files.extend(abspath(project_files, paths.rootdir()))

        # no explicit dotenv file: look it up under the project root
        if dotenv_path is None:
            found_dotenv = files.get_dotenv_path(paths.rootdir())
            dotenv_path = abspath(found_dotenv, paths.rootdir())

        # import the environment variables only when the file really exists
        if dotenv_path:
            if os.path.isfile(dotenv_path):
                load_dotenv(dotenv_path)

        raw_profiles = self.read(metadata_files)

        # an unknown profile name is reported through raiseException
        known_profiles = self._info['profiles']
        if profile_name not in known_profiles:
            self.raiseException(f'Profile "{profile_name}" not found.')

        # apply inheritance, then select and render the requested profile
        selected = self.inherit(raw_profiles)[profile_name]
        rendered = self.render(selected)

        # validate before formatting
        self.validate(rendered)
        formatted = self.formatted(rendered)

        # publish the result on the instance
        self._profile = YamlDict(formatted)
        self._info['active'] = profile_name
예제 #2
0
    def info(self):
        """
        Return a YamlDict describing the loaded project.

        The summary holds the package version, user name, session name and
        id, profile, root directory, script path, dotenv path, the notebook,
        python and metadata file lists, and the repository data.

        :return: the project summary, or None (after logging an error) when
                 no project has been loaded yet
        """
        if self.loaded:
            summary = {
                'version': __version__,
                'username': self._username,
                'session_name': self._session_name,
                'session_id': self._session_id,
                'profile': self._profile,
                'rootdir': paths.rootdir(),
                'script_path': self._script_path,
                'dotenv_path': self._dotenv_path,
                'notebooks_files': self._notebook_files,
                'python_files': self._python_files,
                'metadata_files': self._metadata_files,
                'repository': self._repo
            }
            return YamlDict(summary)

        # nothing loaded yet: report it and hand back None
        logging.error("No project profile loaded. " +
                      "Execute datafaucet.project.load(...) first.")
        return None
예제 #3
0
def init_adapter(logger=None, sid=None):
    """
    Wrap `logger` in a LoggerAdapter carrying session context fields.

    The context holds the session id, the repository name and hash, the
    current user name and the script path looked up under the project
    root directory.

    :param logger: the logger to wrap
    :param sid: session id; when falsy a new one is derived from uuid1
    :return: a LoggerAdapter built from `logger` and the context dict
    """
    if not sid:
        # upper 64 bits of a fresh uuid1, as a hex string
        sid = hex(uuid.uuid1().int >> 64)

    current_user = getpass.getuser()
    script_path = files.get_script_path(paths.rootdir())

    repo_info = git.repo_data()
    repo_name = repo_info['name']
    repo_hash = repo_info['hash']

    # context fields attached to the adapter
    context = dict(
        dfc_sid=sid,
        dfc_repohash=repo_hash,
        dfc_reponame=repo_name,
        dfc_username=current_user,
        dfc_filepath=script_path,
    )

    return LoggerAdapter(logger, context)
예제 #4
0
def process_metadata(md):
    """
    Normalize a resource metadata dict in place and return it.

    Fills in or rewrites the 'format', 'service', 'host', 'path',
    'version', 'port', 'url', 'options' and 'hash' entries. For the 'jdbc'
    format it also sets 'database', 'table', 'driver' and 'schema'.

    :param md: the metadata dict; the entries read here must already exist
    :return: the same dict, updated
    """

    # the format is resolved first
    md['format'] = get_format(md)

    # missing service means 'file'; some service names are aliases
    service_aliases = {'minio': 's3a', 'local': 'file'}
    md['service'] = md['service'] or 'file'
    md['service'] = service_aliases.get(md['service'], md['service'])

    # missing host means localhost
    md['host'] = md['host'] or '127.0.0.1'

    # relative paths on local file-like services are anchored at rootdir
    if md['service'] in ('file', 'sqlite') and not os.path.isabs(md['path']):
        md['path'] = os.path.join(rootdir(), md['path'])

    # s3a paths carry no leading '/'
    if md['service'] == 's3a' and md['path']:
        md['path'] = md['path'].lstrip('/')

    if md['format'] == 'jdbc':
        # database, table and path are derived from the metadata
        db_name, table_name, jdbc_path = path_to_jdbc(md)
        md['database'], md['table'], md['path'] = db_name, table_name, jdbc_path

        # keep an explicit driver, otherwise use the service's driver
        md['driver'] = md['driver'] or get_driver(md['service'])

        # per-service fallback when no schema is set
        fallback_schemas = {
            'mysql': md['database'],
            'mssql': 'dbo',
            'postgres': 'public',
            'clickhouse': 'default',
            'oracle': md['user']
        }
        md['schema'] = md['schema'] or fallback_schemas.get(md['service'])

        # a table given as a query is wrapped as an aliased subquery, once
        sql = get_sql_query(md['table'])
        if sql and not sql.endswith('as _query'):
            md['table'] = '( {} ) as _query'.format(sql)

    md['version'] = md['version'] or get_version(md['service'])

    # port: explicit value, else the service's port; stored as int or None
    md['port'] = md['port'] or get_port(md['service'])
    if md['port']:
        md['port'] = int(md['port'])
    else:
        md['port'] = None

    md['url'] = get_url(md)

    # options must be a dict
    if not isinstance(md['options'], dict):
        md['options'] = {}

    # compression detected from the path applies to non-jdbc formats only
    detected_compression = get_compression(md['path'])
    if compression_applies(md, detected_compression) if False else (
            md['format'] != 'jdbc' and detected_compression):
        md['options']['compression'] = detected_compression

    # xor of the crc32 of each identifying field (0 for falsy fields)
    checksums = [
        zlib.crc32(md[key].encode()) if md[key] else 0
        for key in ('url', 'format', 'table', 'database')
    ]
    combined = functools.reduce(lambda left, right: left ^ right, checksums)
    md['hash'] = hex(ctypes.c_size_t(combined).value)

    return md
예제 #5
0
    def load(self, profile='default', rootpath=None):
        """
        Performs the following steps:
            - set rootdir for the given project
            - import variables from  <rootdir>/.env (if present),
            - load the `profile` from the metadata files
            - setup and start the data engine

        :param profile: load the given metadata profile (default: 'default')

        :param rootpath: root directory for loaded project
               default behaviour: search parent dirs to detect rootdir by
               looking for a '__main__.py' or 'main.ipynb' file.
               When such a file is found, the corresponding directory is the
               root path for the project. If nothing is found, the current
               working directory will be the rootpath

        :return: this project object (`self`)

        :raises ValueError: when no metadata profile is available after
               attempting to load the metadata

        Notes about metadata configuration:

        1)  Metadata files are merged up, so you can split the information in
            multiple files as long as they end with `metadata.yml`.

            For example: `metadata.yml`, `abc.metadata.yaml`, `abc_metadata.yml`
            are all valid metadata file names.

        2)  All metadata files in all subdirectories from the project root directory
            are loaded, unless the directory contains a file `metadata.ignore.yml`

        3)  Metadata files can provide multiple profile configurations,
            by separating each profile configuration with a Document Marker
            ( a line with `---`) (see https://yaml.org/spec/1.2/spec.html#YAML)

        4)  Each metadata profile can be broken down in multiple yaml files.
            When loading the files, all configuration belonging to the same
            profile will be merged.

        5)  All metadata profiles inherit the settings from profile 'default'

        Metadata files are composed of 6 sections:
            - profile
            - variables
            - providers
            - resources
            - engine
            - loggers

        For more information about metadata configuration,
        type `help(datafaucet.project.metadata)`
        """

        # skip the whole procedure when reloading is disabled
        if self.loaded and self._no_reload:
            logging.notice(f"Profile {self._profile} already loaded. "
                           "Skipping project.load()")
            return self

        # set rootpath
        paths.set_rootdir(rootpath)

        # set loaded to false
        self.loaded = False

        # set username
        self._username = getpass.getuser()

        # get repo data
        self._repo = repo_data()

        # set session name from the non-empty parts
        # NOTE(review): this method reads self._profile but never assigns it
        # (the active profile name is stored in self._profile_name below);
        # confirm which attribute is meant to feed the session name.
        name_parts = [self._profile, self._repo.get('name')]
        self._session_name = '-'.join([x for x in name_parts if x])

        # set session id
        self._session_id = hex(uuid.uuid1().int >> 64)

        # get currently running script path
        self._script_path = files.get_script_path(paths.rootdir())

        # set dotenv default file, check the file exists
        self._dotenv_path = files.get_dotenv_path(paths.rootdir())

        # get files
        self._metadata_files = files.get_metadata_files(paths.rootdir())
        self._notebook_files = files.get_jupyter_notebook_files(
            paths.rootdir())
        self._python_files = files.get_python_files(paths.rootdir())

        # metadata defaults
        dir_path = os.path.dirname(os.path.realpath(__file__))
        default_md_files = [os.path.join(dir_path, 'schemas/default.yml')]
        project_md_files = abspath(self._metadata_files, paths.rootdir())

        # load metadata; a ValueError is only reported here, the check
        # below decides whether loading can continue
        try:
            md_paths = default_md_files + project_md_files
            dotenv_path = abspath(self._dotenv_path, paths.rootdir())

            metadata.load(profile, md_paths, dotenv_path)
        except ValueError as e:
            print(e)

        # bail if no metadata
        # `metadata.profile` is used as a callable everywhere else in this
        # method, so it must be called here: comparing the callable itself
        # to None is always False and would make this guard unreachable.
        if metadata.profile() is None:
            raise ValueError('No valid metadata to load.')

        # set profile from metadata
        self._profile_name = metadata.info()['active']

        # add rootpath to the list of python sys paths
        if paths.rootdir() not in sys.path:
            sys.path.append(paths.rootdir())

        # stop existing engine
        if self._engine:
            self._engine.stop()

        # services, keyed by service name so each service appears once
        services = {}

        all_aliases = list(metadata.profile()['providers'].keys())

        # get services from aliases
        for alias in all_aliases:
            r = Resource(alias)
            services[r['service']] = r

        # get one service from each type to
        # load drivers, jars etc via the engine init
        services = list(services.values())

        # initialize the engine
        md = metadata.profile()['engine']
        engines.Engine(md['type'],
                       session_name=self._session_name,
                       session_id=self._session_id,
                       master=md['master'],
                       timezone=md['timezone'],
                       jars=md['submit']['jars'],
                       packages=md['submit']['packages'],
                       pyfiles=md['submit']['py-files'],
                       files=md['submit']['files'],
                       repositories=md['submit']['repositories'],
                       conf=md['submit']['conf'],
                       services=services)

        # initialize logging
        logging.init(metadata.profile()['loggers'], self._session_id,
                     self._username, self._script_path, self._repo['name'],
                     self._repo['hash'])

        # set loaded to True
        self.loaded = True

        # return object
        return self