Пример #1
0
    def test_resource_provider(self, tempdir):
        """Build metadata from a provider and a resource with a relative path.

        The expected dict is the empty baseline from ``self.empty`` merged
        with the provider/resource fields, and a url made of the provider
        path followed by the resource path.
        """
        provider_md = dict(
            alias='p',
            service='local',
            format='csv',
            path=tempdir.path,
        )
        resource_md = dict(alias='r', path='abc/def')

        # noinspection PyProtectedMember
        actual = resource._build_resource_metadata(
            tempdir.path, pmd=provider_md, rmd=resource_md)

        baseline = self.empty(tempdir).copy()
        expected = merge(baseline, {
            'provider_path': tempdir.path,
            'provider_alias': 'p',
            'resource_alias': 'r',
            'resource_path': 'abc/def',
            'service': 'local',
            'format': 'csv',
            'url': f'{tempdir.path}/abc/def'
        })

        assert actual == expected
Пример #2
0
def metadata_overrides(md, host=None, service=None, port=None, user=None, password=None,
                driver=None, database=None, schema=None, table=None, format=None,
                version=None, hostname=None, username=None, **options):
    """
    Build a new metadata dict in which explicit arguments win over ``md``.

    For every field the first truthy value is used, looking at the keyword
    argument(s) first and at the matching ``md`` entry afterwards.
    ``hostname``/``username`` are accepted as alternatives for
    ``host``/``user``. Extra keyword arguments are merged over
    ``md['options']``.

    :param md: source metadata, read through ``md.get``
    :return: a new dict with the keys path, provider, host, port, service,
             format, version, user, password, database, schema, table,
             driver and options
    """
    overridden = {
        # 'url' is preferred over 'path' when both are present
        'path': md.get('url') or md.get('path'),
        'provider': md.get('provider'),
        'host': host or hostname or md.get('host') or md.get('hostname'),
        'port': port or md.get('port'),
        'service': service or md.get('service'),
        'format': format or md.get('format'),
        'version': version or md.get('version'),
        'user': user or username or md.get('user') or md.get('username'),
        'password': password or md.get('password'),
        'database': database or md.get('database'),
        'schema': schema or md.get('schema'),
        'table': table or md.get('table'),
        'driver': driver or md.get('driver'),
        'options': merge(md.get('options'), options),
    }

    # an explicitly requested database or table discards the path
    if database or table:
        overridden['path'] = None

    return overridden
Пример #3
0
def Resource(path_or_alias_or_url_or_dict=None, provider_path_or_alias_or_url=None,
        host=None, service=None, port=None, user=None, password=None,
        driver=None, database=None, schema=None, table=None, format=None,
        version=None, hostname=None, username=None, **options):
    """
    Resolve a resource and its provider into one assembled metadata dict.

    The resource reference is resolved with ``to_resource``; when it is a
    dict, explicit keyword arguments take precedence over the dict entries
    of the same name and the dict's ``'url'`` entry becomes the reference.
    The provider is taken from the second argument or, when that is falsy,
    from the ``'provider'`` entry of the resolved resource. Provider and
    resource metadata are merged, the path is cleared when a table is set
    (otherwise provider and resource paths are joined), and the result is
    passed through ``process_metadata`` and ``assemble_metadata``.

    :return: the value returned by ``assemble_metadata``
    """
    provider_ref = provider_path_or_alias_or_url
    resource_ref = path_or_alias_or_url_or_dict

    # keyword overrides, kept in the order they are forwarded to to_resource
    overrides = {
        'host': host,
        'service': service,
        'port': port,
        'user': user,
        'password': password,
        'driver': driver,
        'database': database,
        'schema': schema,
        'table': table,
        'format': format,
        'version': version,
        'hostname': hostname,
        'username': username,
    }

    if isinstance(resource_ref, dict):
        # fall back to the dict entry only where no truthy argument was given
        overrides = {
            key: value or resource_ref.get(key)
            for key, value in overrides.items()
        }
        resource_ref = resource_ref.get('url')

    # resource metadata: by alias or by url
    rmd = to_resource(resource_ref, **overrides, **options)

    # provider metadata: explicit reference first, else the one named by the resource
    pmd = to_resource(provider_ref or rmd.get('provider'))

    # let path_to_jdbc decide database/table/path for the provider
    jdbc_database, jdbc_table, jdbc_path = path_to_jdbc(pmd, True)
    pmd['database'] = jdbc_database
    pmd['table'] = jdbc_table
    pmd['path'] = jdbc_path

    # resource metadata is merged over provider metadata
    md = merge(pmd, rmd)

    # a table makes the path irrelevant; otherwise join provider and resource paths
    md['path'] = None if md['table'] else os.path.join(
        pmd['path'] or '', rmd['path'] or '')

    # todo: verify resource
    # check format and other minimum requirements are met

    # process, then assemble the output
    return assemble_metadata(process_metadata(md))
Пример #4
0
    def inherit(self, profiles):
        """
        Let every non-default profile inherit from the 'default' profile.

        For each top-level key of the default profile, the entry of every
        other profile is replaced by ``merge(default_value, profile_value)``.
        The given dict is updated in place and returned.

        :param profiles: dict of profiles
        :return: dict of profiles
        """
        defaults = profiles.get('default', {})
        heirs = set(profiles.keys()) - {'default'}

        for key in defaults:
            for name in heirs:
                profile = profiles[name]
                profile[key] = merge(defaults[key], profile.get(key))

        return profiles
Пример #5
0
    def test_minimal(self, tempdir):
        """Build metadata from a provider only, without any resource metadata.

        The expected url is just the provider path.
        """
        provider_md = dict(
            service='local',
            format='csv',
            path=tempdir.path,
        )

        # noinspection PyProtectedMember
        actual = resource._build_resource_metadata(tempdir.path, pmd=provider_md)

        baseline = self.empty(tempdir).copy()
        expected = merge(baseline, {
            'provider_path': tempdir.path,
            'service': 'local',
            'format': 'csv',
            'url': f'{tempdir.path}'
        })

        assert actual == expected
Пример #6
0
def to_resource(url_alias=None, *args, **kwargs):
    """
    Turn a dict, a profile alias or a urn/path into a resource metadata dict.

    Resolution order: a dict goes through ``resource_from_dict``; otherwise
    the loaded metadata profile is searched for the alias under 'resources'
    and then under 'providers'; otherwise the value is parsed as a urn/path;
    if nothing matched, ``get_default_md()`` is used. A non-empty 'path' is
    re-parsed as a urn, and the keyword arguments are finally applied through
    ``metadata_overrides``. Positional ``args`` are accepted but unused.

    :return: metadata dict without 'hostname' and 'username' keys
    """
    # a dict is converted directly
    md = resource_from_dict(url_alias) if isinstance(url_alias, dict) else None

    # with a loaded profile, look the alias up: resources first, then providers
    if metadata.profile():
        for section in ('resources', 'providers'):
            if not md and url_alias in metadata.profile().get(section, {}).keys():
                md = metadata.profile()[section][url_alias]

    # still nothing: interpret the value as a urn/path
    if not md and url_alias:
        md = resource_from_urn(urnparse(url_alias))

    # last resort: the empty default
    if not md:
        md = get_default_md()

    # the path may itself be a url or a query: parse it and keep its path part
    if md.get('path', None):
        parsed = resource_from_urn(urnparse(md['path']))
        md = merge(parsed, md)
        md['path'] = parsed['path']

    # keyword arguments take precedence
    md = metadata_overrides(md, **kwargs)

    # drop the alternative spellings, if present
    for legacy_key in ('hostname', 'username'):
        md.pop(legacy_key, None)

    return md
Пример #7
0
    def read(self, file_paths=None):
        """
        Return all profiles, stored in a nested dictionary.

        Files are processed in list order. Each YAML document is stored under
        its 'profile' key ('default' when the key is missing) and combined
        with what was already read for that profile via
        ``merge(existing, doc)``, so the order of the files determines how
        profile properties are overridden. Paths that are not existing files
        are skipped, and so are files that fail to parse (an error is logged).

        Side effects: ``self._info['files']`` is set to the list of files
        successfully parsed and ``self._info['profiles']`` to the sorted
        profile names.

        :param file_paths: list of yaml files paths (None is treated as empty)
        :return: dict of profiles
        """
        profiles = {}
        self._info['files'] = []

        for filename in file_paths or []:
            if not os.path.isfile(filename):
                continue

            try:
                with open(filename, 'r') as f:
                    # NOTE: safe_load_all only builds plain YAML types. The
                    # previous yaml.load_all(f) call had no Loader, which is
                    # unsafe on old PyYAML and a TypeError on PyYAML >= 6.
                    docs = list(yaml.safe_load_all(f))
            except yaml.YAMLError as e:
                # skip the whole file; never reuse documents of a previous file
                mark = getattr(e, 'problem_mark', None)
                if mark is not None:
                    logging.error(
                        "Error loading yml file {} at position: ({}:{}): skipping file"
                        .format(filename, mark.line + 1, mark.column + 1))
                else:
                    logging.error(
                        "Error loading yml file {}: skipping file".format(filename))
                continue

            self._info['files'].append(filename)

            for doc in docs:
                # empty documents load as None: there is nothing to merge
                if not isinstance(doc, dict):
                    continue

                name = doc.get('profile', 'default')
                doc['profile'] = name
                profiles[name] = merge(profiles.get(name, {}), doc)

        self._info['profiles'] = sorted(profiles)

        return profiles
Пример #8
0
    def process(self, msg, kwargs):
        """
        Attach contextual information to a logging call.

        ``self.extra`` is updated in place with 'dfc_funcname' (the value of
        ``get_scope(5)``) and 'dfc_data', and is then installed as
        ``kwargs['extra']``. For a mapping message, 'dfc_data' is the message
        merged with the caller's ``extra`` and the message text becomes
        'data'; for a string message, 'dfc_data' is the caller's ``extra``
        (an empty dict when absent).

        :param msg: the log message, a str or a mutable mapping
        :param kwargs: keyword arguments of the logging call
        :return: tuple of (message, kwargs)
        :raises ValueError: if msg is neither a str nor a mutable mapping
        """
        context = self.extra
        # keep this call in this frame: the scope depth is relative to it
        context['dfc_funcname'] = get_scope(5)

        is_mapping = isinstance(msg, MutableMapping)
        if not is_mapping and not isinstance(msg, str):
            raise ValueError('log message must be a str or a dict')

        payload = kwargs.get('extra', {})
        if is_mapping:
            payload = merge(msg, payload)
            msg = 'data'
        context['dfc_data'] = payload

        kwargs["extra"] = context
        return msg, kwargs
Пример #9
0
    def load(self,
             profile_name='default',
             metadata_files=None,
             dotenv_path=None,
             parameters=None):
        """
        Load a profile from a list of yml files and a .env file.

        When no metadata files are given, the packaged
        'schemas/default.yml' is used followed by the metadata files found
        under the project root directory. When no dotenv path is given, it is
        looked up under the project root directory as well. All profiles are
        read, checked for the requested name, made to inherit from the
        default profile, and the selected one is rendered, validated and
        formatted before it is stored as the active profile.

        :param profile_name: the profile to load (default: 'default')
        :param metadata_files: a list of metadata files to read
        :param dotenv_path: the path of a dotenv file to read
        :param parameters: optional dict, merged with metadata variables
        :return: this object, with the profile loaded
        """
        # no explicit list: packaged defaults first, then the project metadata
        if metadata_files is None:
            package_dir = os.path.dirname(os.path.realpath(__file__))

            metadata_files = []
            metadata_files.extend(abspath(['schemas/default.yml'], package_dir))
            metadata_files.extend(
                abspath(files.get_metadata_files(paths.rootdir()),
                        paths.rootdir()))

        # no explicit dotenv file: look for one under the root directory
        if dotenv_path is None:
            dotenv_path = abspath(files.get_dotenv_path(paths.rootdir()),
                                  paths.rootdir())

        # environment variables must be loaded before the profiles are read
        if dotenv_path and os.path.isfile(dotenv_path):
            load_dotenv(dotenv_path)

        all_profiles = self.read(metadata_files)

        # report a profile name that none of the files defined
        if profile_name not in self._info['profiles']:
            self.raiseException(f'Profile "{profile_name}" not found.')

        # apply inheritance from the default profile, then pick the profile
        all_profiles = self.inherit(all_profiles)
        selected = all_profiles[profile_name]

        # render any jinja templates, validate, then format
        md = self.render(selected)
        self.validate(md)
        md = self.formatted(md)

        # parameters passed by the caller are merged over the profile variables
        if isinstance(parameters, dict):
            md['variables'] = merge(md['variables'], parameters)

        self._profile = YamlDict(md)
        self._info['active'] = profile_name

        return self
Пример #10
0
    def test_resource_provider_2path_absolute(self, tempdir):
        """Build metadata from a provider with an absolute path and a relative resource.

        The expected url is the absolute provider path followed by the
        resource path.
        """
        provider_md = dict(
            alias='p',
            service='local',
            format='csv',
            path='/absolute/path',
        )
        resource_md = dict(alias='r', path='abc/def')

        # noinspection PyProtectedMember
        actual = resource._build_resource_metadata(
            tempdir.path, pmd=provider_md, rmd=resource_md)

        baseline = self.empty(tempdir).copy()
        expected = merge(baseline, {
            'provider_path': '/absolute/path',
            'provider_alias': 'p',
            'resource_alias': 'r',
            'resource_path': 'abc/def',
            'service': 'local',
            'format': 'csv',
            'url': f'/absolute/path/abc/def'
        })

        assert actual == expected

        
# resource('SELECT 0 as result where 1 = 0', 'pagila')
# resource('foo.csv', '/bar')
# resource('foo.csv', 'bar')
# resource('foo.csv', 'hdfs')
# resource('/foo.abc', 'hdfs')
# resource('/foo.abc', 'test')
# resource('hello/foo.abc', 'test')
# resource('foo.abc', 'hdfs://*****:*****@1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL')
# resource('staff', 'jdbc:mysql://1.2.3.4/sakila', useSSL='false', serverTimezone='UTC', zeroDateTimeBehavior='CONVERT_TO_NULL')
# resource('staff', service='mysql', database='sakila', serverTimezone='UTC')
# resource('sakila/staff', service='mysql', serverTimezone='UTC', user='******', password='******')
# resource('foo/bar.tsv', service='s3a')
# resource('/foo/bar.tsv', service='s3a')
# resource('/apples/orange', service='minio')
# resource('SELECT count(*) as cnt from employees;', 'jdbc:mysql://1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL', user='******', password='******')
# resource('ascombe')
# resource('ascombe', 'saywhat')
# resource('ascombe', 'hdfs://*****:*****@//123.123.123:345/schema/database'
# parsed = Urn(scheme=['jdbc', 'oracle', 'thin', 'name/pass@'], user='******', password='******', host='123.123.123', port='345', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:oracle:thin:name@//123.123.123:345/schema/database'
# parsed = Urn(scheme=['jdbc', 'oracle', 'thin', 'name@'], user='******', password='', host='123.123.123', port='345', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:oracle:thin:@//123.123.123/schema/database'
# parsed = Urn(scheme=['jdbc', 'oracle', 'thin', '@'], user='', password='', host='123.123.123', port='', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'hdfs://123.123.123/schema/database'
# parsed = Urn(scheme=['hdfs'], user='', password='', host='123.123.123', port='', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = '/schema/database'
# parsed = Urn(scheme=[], user='', password='', host='', port='', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 's3a://schema/database'
# parsed = Urn(scheme=['s3a'], user='', password='', host='', port='', path='schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = '1.2.34/schema/database'
# parsed = Urn(scheme=[], user='', password='', host='', port='', path='1.2.34/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'file://1.2.34/schema/database'
# parsed = Urn(scheme=['file'], user='', password='', host='1.2.34', port='', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:sqlite://localdir/a/b/c'
# parsed = Urn(scheme=['jdbc', 'sqlite'], user='', password='', host='', port='', path='localdir/a/b/c', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:oracle:thin:@ldap://xyz.acme.com:7777/sales,cn=salesdept,cn=OracleContext,dc=com'
# parsed = Urn(scheme=['jdbc', 'oracle', 'thin', '@ldap'], user='', password='', host='xyz.acme.com', port='7777', path='/sales', params=[('cn', 'salesdept'), ('cn', 'OracleContext'), ('dc', 'com')], query='cn=salesdept&cn=OracleContext&dc=com', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'http://xyz.acme.com:7777/foo/bar?a=1&edf=abc#anchor1'
# parsed = Urn(scheme=['http'], user='', password='', host='xyz.acme.com', port='7777', path='/foo/bar', params=[('a', '1'), ('edf', 'abc')], query='a=1&edf=abc', fragment='anchor1')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:sqlserver://localhost:1433;databaseName=AdventureWorks;integratedSecurity=true;'
# parsed = Urn(scheme=['jdbc', 'sqlserver'], user='', password='', host='localhost', port='1433', path='', params=[('databaseName', 'AdventureWorks'), ('integratedSecurity', 'true')], query='databaseName=AdventureWorks&integratedSecurity=true', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:sqlserver://localhost;databaseName=AdventureWorks;integratedSecurity=true;'
# parsed = Urn(scheme=['jdbc', 'sqlserver'], user='', password='', host='localhost', port='', path='', params=[('databaseName', 'AdventureWorks'), ('integratedSecurity', 'true')], query='databaseName=AdventureWorks&integratedSecurity=true', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:postgresql://localhost/test?user=fred&password=secret&ssl=false'
# parsed = Urn(scheme=['jdbc', 'postgresql'], user='', password='', host='localhost', port='', path='/test', params=[('user', 'fred'), ('password', 'secret'), ('ssl', 'false')], query='user=fred&password=secret&ssl=false', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:mysql://localhost:3306/youdatabase?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL'
# parsed = Urn(scheme=['jdbc', 'mysql'], user='', password='', host='localhost', port='3306', path='/youdatabase', params=[('useSSL', 'false'), ('serverTimezone', 'UTC'), ('zeroDateTimeBehavior', 'CONVERT_TO_NULL')], query='useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL', fragment='')

# assert(parsed == urnparse(urn))
Пример #11
0
    def __init__(self,
                 session_name=None,
                 session_id=0,
                 master='local[*]',
                 timezone=None,
                 repositories=None,
                 jars=None,
                 packages=None,
                 files=None,
                 services=None,
                 conf=None,
                 detect=True):
        """
        Configure and start a spark session.

        The submit parameters are collected in ``self.submit``: a single
        string given for jars, packages, files or repositories is wrapped in
        a list, and so is a single tuple given for conf; falsy values become
        an empty list. With ``detect`` enabled, the result of
        ``detect_submit_params(services)`` is merged with them. A
        ``pyspark.SparkConf`` is then built with the timezone, application
        name, master and the scalar conf entries, the current session is
        stopped and a new one is started.

        :param session_name: passed to the base class and used as app name
        :param session_id: passed to the base class
        :param master: the spark master (default: 'local[*]')
        :param timezone: passed to ``set_conf_timezone``
        :param repositories: a string or a list, stored in the submit params
        :param jars: a string or a list, stored in the submit params
        :param packages: a string or a list, stored in the submit params
        :param files: a string or a list, stored in the submit params
        :param services: passed to ``detect_submit_params``
        :param conf: a tuple or a list of tuples, set on the spark conf
        :param detect: whether to detect submit params from the services
        """

        # call base class
        # stop the previous instance,
        # register self as the new instance
        super().__init__('spark', session_name, session_id)

        def as_list(value, single=str):
            # wrap a lone value of type `single`; falsy values give an empty list
            return [value] if isinstance(value, single) else value or []

        # bundle all submit parameters in a dictionary
        self.submit = {
            'jars': as_list(jars),
            'packages': as_list(packages),
            'files': as_list(files),
            'repositories': as_list(repositories),
            'conf': as_list(conf, tuple),
        }

        # suppress INFO logging for java_gateway
        gateway_logger = python_logging.getLogger('py4j.java_gateway')
        gateway_logger.setLevel(python_logging.ERROR)

        # collect info
        self.set_info()

        # detect packages and configuration from services
        if detect:
            self.submit = merge(self.detect_submit_params(services), self.submit)

        # submit args are set via an env variable,
        # followed by the other spark-related environment variables
        self.set_submit_args()
        self.set_env_variables()

        logging.notice(f"Connecting to spark master: {master}")

        # spark conf object: timezone, session name and master
        spark_conf = pyspark.SparkConf()
        self.set_conf_timezone(spark_conf, timezone)
        spark_conf.setAppName(session_name)
        spark_conf.setMaster(master)

        # conf entries passed through the api call; the padding gives a
        # key-only entry an empty string value
        for entry in self.submit['conf']:
            key, value, *_ = list(entry) + ['']
            if isinstance(value, (bool, int, float, str)):
                spark_conf.set(key, value)

        # stop the current session if running, then start spark
        self.stop()
        self.start_session(spark_conf)
Пример #12
0
def test_merge():
    """Check that merge() adds new keys, overrides shared ones and merges nested dicts."""
    left = {'a': 1, 'b': 4, 'c': {'merge1': 2}}
    right = {'d': 'add', 'b': 'override', 'c': {'merge2': 4}}

    expected = {
        'a': 1,
        'd': 'add',
        'b': 'override',
        'c': {'merge2': 4, 'merge1': 2},
    }

    assert merge(left, right) == expected