Example #1
    def list(self, provider):
        if isinstance(provider, YamlDict):
            md = provider.to_dict()
        elif isinstance(provider, str):
            md = get_metadata(self._rootdir, self._metadata, None, provider)
        elif isinstance(provider, dict):
            md = provider
        else:
            logging.warning(f'{str(provider)} cannot be used to reference a provider')
            return []

        try:
            if md['service'] in ['local', 'file']:
                d = []
                for f in os.listdir(md['provider_path']):
                    d.append(os.path.join(md['provider_path'], f))
                return d
            elif md['service'] == 'hdfs':
                sc = self._ctx._sc
                URI = sc._gateway.jvm.java.net.URI
                Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
                FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
                fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())

                obj = fs.listStatus(Path(md['url']))
                tables = [obj[i].getPath().getName() for i in range(len(obj))]
                return tables

            elif md['format'] == 'jdbc':
                if md['service'] == 'mssql':
                    query = "(SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE') as query"
                elif md['service'] == 'oracle':
                    query = "(SELECT table_name FROM all_tables WHERE owner='schema_name') as query"
                elif md['service'] == 'mysql':
                    query = f"(SELECT table_name FROM information_schema.tables where table_schema='{md['database']}') as query"
                elif md['service'] == 'pgsql':
                    query = "(SELECT table_name FROM information_schema.tables) as query"
                else:
                    # vanilla query ... for other databases
                    query = "(SELECT table_name FROM information_schema.tables) as query"

                obj = self._ctx.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", query) \
                    .option("driver", md['driver']) \
                    .option("user", md['username']) \
                    .option('password', md['password']) \
                    .load()

                # collect the table names from the query result
                # (positional access avoids column-name case mismatches across databases)
                return [x[0] for x in obj.collect()]
            else:
                logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
                return []
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

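
A minimal usage sketch for the list() method above; the engine instance and the provider alias are hypothetical names:

# 'engine' and 'my_database' are hypothetical examples
tables = engine.list('my_database')
for t in tables:
    print(t)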
Example #2
def read(file_paths=None):
    """
    Return all profiles, stored in a nested dictionary
    profiles are merged over the list provided profiles. list order determines override
    each profile name
    :param file_paths: list of yaml files
    :return: dict of profiles
    """
    profiles = {}

    if not file_paths:
        file_paths = []

    for filename in file_paths:
        if os.path.isfile(filename):
            with open(filename, 'r') as f:
                try:
                    # prefer the safe loader; yaml.load_all without an explicit Loader is deprecated
                    docs = list(yaml.safe_load_all(f))
                except yaml.YAMLError as e:
                    docs = []
                    if hasattr(e, 'problem_mark'):
                        mark = e.problem_mark
                        logging.error(
                            "Error loading yml file {} at position ({}:{}): skipping file"
                            .format(filename, mark.line + 1, mark.column + 1))

                for doc in docs:
                    doc['profile'] = doc.get('profile', 'default')
                    profiles[doc['profile']] = merge(
                        profiles.get(doc['profile'], {}), doc)

    return profiles
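
A minimal usage sketch for read(), assuming two yaml metadata files on disk (the file names are hypothetical); per profile, values from later files override earlier ones:

# both file names are hypothetical examples
profiles = read(['metadata.yml', 'metadata.prod.yml'])
print(sorted(profiles.keys()))   # e.g. ['default', 'prod']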
Example #3
    def save(self,
             obj,
             path=None,
             provider=None,
             *args,
             format=None,
             mode=None,
             **kwargs):

        md = Resource(path, provider, format=format, mode=mode, **kwargs)

        if md['format'] == 'csv':
            return self.save_csv(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'tsv':
            kwargs['sep'] = '\t'
            return self.save_csv(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'json':
            return self.save_json(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'jsonl':
            kwargs['lines'] = True
            return self.save_json(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'parquet':
            return self.save_parquet(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'jdbc':
            return self.save_jdbc(obj, path, provider, mode=mode, **kwargs)
        else:
            logging.error(f'Unknown format "{md["format"]}"',
                          extra={'md': md})
            return False
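
A minimal usage sketch for the save() dispatcher above; the engine instance, dataframe, and path are hypothetical:

# 'engine', 'df' and the path are hypothetical examples;
# format='tsv' would route through save_csv() with sep='\t'
engine.save(df, 'data/events.csv', format='csv', mode='overwrite')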
Example #4
def read(file_paths=None):
    """
    Return all profiles, stored in a nested dictionary
    Profiles are merged over the list provided of provided metadata files to read. 
    The order in the list of metadata files determines how profile properties are override
    :param file_paths: list of yaml files paths
    :return: dict of profiles
    """
    global loaded_md_files, profiles
    
    # reset the profiles before reading
    profiles = {}

    if not file_paths:
        file_paths = []
    
    loaded_md_files = []
    for filename in file_paths:
        if os.path.isfile(filename):
            with open(filename, 'r') as f:
                try:
                    # prefer the safe loader; yaml.load_all without an explicit Loader is deprecated
                    docs = list(yaml.safe_load_all(f))
                    loaded_md_files.append(filename)
                except yaml.YAMLError as e:
                    docs = []
                    if hasattr(e, 'problem_mark'):
                        mark = e.problem_mark
                        logging.error(
                            "Error loading yml file {} at position ({}:{}): skipping file"
                            .format(filename, mark.line + 1, mark.column + 1))

                for doc in docs:
                    doc['profile'] = doc.get('profile', 'default')
                    profiles[doc['profile']] = merge(profiles.get(doc['profile'], {}), doc)

    return profiles
Example #5
    def start_context(self, conf):
        try:
            # init the spark session
            session = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
            
            # fix SQLContext for back compatibility
            self.initialize_spark_sql_context(session, session.sparkContext)

            # pyspark set log level method
            # (this will not suppress WARN before starting the context)
            session.sparkContext.setLogLevel("ERROR")
            return session
        except Exception as e:
            logging.error(f'Could not start the engine context: {e}')
            return None
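
A minimal sketch of calling start_context(), assuming pyspark is installed; the engine instance and application name are hypothetical:

import pyspark

# 'engine' and the app name are hypothetical examples
conf = pyspark.SparkConf().setMaster('local[*]').setAppName('demo')
session = engine.start_context(conf)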
Example #6
    def load(self, path=None, provider=None, *args, format=None, **kwargs):

        md = Resource(path, provider, format=format, **kwargs)

        if md['format'] == 'csv':
            return self.load_csv(path, provider, **kwargs)
        elif md['format'] == 'json':
            return self.load_json(path, provider, **kwargs)
        elif md['format'] == 'parquet':
            return self.load_parquet(path, provider, **kwargs)
        elif md['format'] == 'jdbc':
            return self.load_jdbc(path, provider, **kwargs)
        else:
            logging.error(f'Unknown resource format "{md["format"]}"',
                          extra={'md': to_dict(md)})
        return None
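
A minimal usage sketch for the load() dispatcher; the engine instance and resource path are hypothetical:

# 'engine' and the path are hypothetical examples; the format
# selects the matching load_parquet() implementation
df = engine.load('data/events.parquet', format='parquet')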
Example #7
    def save(self, obj, path=None, provider=None, **kargs):

        if isinstance(path, YamlDict):
            md = path.to_dict()
        elif isinstance(path, str):
            md = resource.metadata(self._rootdir, self._metadata, path, provider)
        elif isinstance(path, dict):
            md = path
        else:
            logging.warning(f'{str(path)} cannot be used to reference a resource')
            return False

        prep_start = timer()
        options = md['options'] or {}
        
        if md['date_partition'] and md['date_column']:
            tzone = 'UTC' if self._timestamps == 'naive' else self._timezone
            obj = dataframe.add_datetime_columns(obj, column=md['date_column'], tzone=tzone)
            kargs['partitionBy'] = ['_date'] + kargs.get('partitionBy', options.get('partitionBy', []))

        if md['update_column']:
            obj = dataframe.add_update_column(obj, tzone=self._timezone)

        if md['hash_column']:
            obj = dataframe.add_hash_column(obj, cols=md['hash_column'],
                                            exclude_cols=['_date', '_datetime', '_updated', '_hash', '_state'])

        date_column = '_date' if md['date_partition'] else md['date_column']
        obj = dataframe.filter_by_date(
            obj,
            date_column,
            md['date_start'],
            md['date_end'],
            md['date_window'])

        obj = dataframe.cache(obj, md['cache'])

        num_rows = obj.count()
        num_cols = len(obj.columns)

        # force 1 file per partition, just before saving
        obj = obj.repartition(1, *kargs['partitionBy']) if kargs.get('partitionBy') else obj.repartition(1)
        # obj = obj.coalesce(1)

        prep_end = timer()

        core_start = timer()
        result = self.save_dataframe(obj, md, **kargs)
        core_end = timer()

        log_data = {
            'md': {i: md[i] for i in md if i != 'password'},
            'mode': kargs.get('mode', options.get('mode')),
            'records': num_rows,
            'columns': num_cols,
            'time': core_end - prep_start,
            'time_core': core_end - core_start,
            'time_prep': prep_end - prep_start
        }

        logging.info(log_data) if result else logging.error(log_data)

        return result
Example #8
    def info(self):
        if not self.loaded:
            logging.error("No project profile loaded. " +
                          "Execute datalabframework.project.load(...) first.")
            return None

        return YamlDict({
            'version': __version__,
            'username': self._username,
            'session_name': self._session_name,
            'session_id': self._session_id,
            'profile': self._profile,
            'rootdir': paths.rootdir(),
            'script_path': self._script_path,
            'dotenv_path': self._dotenv_path,
            'notebooks_files': self._notebook_files,
            'python_files': self._python_files,
            'metadata_files': self._metadata_files,
            'repository': self._repo
        })
Example #9
def Engine(engine_type=None, *args, **kwargs):
    global _engines

    if engine_type:
        if engine_type in _engines:
            cls = _engines[engine_type]
            cls(*args, **kwargs)
        else:
            logging.error('Could not create the Engine: '
                          'no matching engine type in ' + ', '.join(_engines.keys()))

    engine = _singleton['instance']

    if not engine:
        logging.error(
            'No Engine running yet. \n'
            'try datalabframework.engine(...) or datalabframework.project.load(...)'
        )

    return engine
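
A minimal usage sketch for the Engine factory, assuming a 'spark' engine type is registered in _engines (the type name and arguments are hypothetical):

# hypothetical example: create and register the singleton instance
engine = Engine('spark', 'my-session')

# later calls without a type return the running singleton
engine = Engine()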
Example #10
    def load_jdbc(self, path=None, provider=None, *args, **kwargs):
        obj = None

        md = Resource(path, provider, format='jdbc', **kwargs)

        options = md['options']

        try:
            if md['service'] in [
                    'sqlite', 'mysql', 'postgres', 'mssql', 'oracle'
            ]:
                obj = self.context.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", md['table']) \
                    .option("driver", md['driver']) \
                    .option("user", md['user']) \
                    .option('password', md['password']) \
                    .options(**options)
                # load the data from jdbc
                obj = obj.load(**kwargs)
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(e, extra={'md': md})

        return obj
Example #11
    def load(self, path=None, provider=None, catch_exception=True, **kargs):
        if isinstance(path, YamlDict):
            md = path.to_dict()
        elif isinstance(path, str):
            md = get_metadata(self._rootdir, self._metadata, path, provider)
        elif isinstance(path, dict):
            md = path
        else:
            logging.warning(f'{str(path)} cannot be used to reference a resource')
            return None

        core_start = timer()
        obj = self.load_dataframe(md, catch_exception, **kargs)
        core_end = timer()
        if obj is None:
            return obj

        prep_start = timer()
        date_column = '_date' if md['date_partition'] else md['date_column']
                
        obj = dataframe.filter_by_date(
            obj,
            date_column,
            md['date_start'],
            md['date_end'],
            md['date_window'])

        # partition and sorting (hmmm, needed?)
        if date_column and date_column in obj.columns:
            obj = obj.repartition(date_column)

        if '_updated' in obj.columns:
            obj = obj.sortWithinPartitions(F.desc('_updated'))

        num_rows = obj.count()
        num_cols = len(obj.columns)

        obj = dataframe.cache(obj, md['cache'])

        prep_end = timer()

        log_data = {
            'md': dict(md),
            'mode': kargs.get('mode', md.get('options', {}).get('mode')),
            'records': num_rows,
            'columns': num_cols,
            'time': prep_end - core_start,
            'time_core': core_end - core_start,
            'time_prep': prep_end - prep_start
        }
        # obj cannot be None here; a failed load returned earlier
        logging.info(log_data)

        return obj
Example #12
    def load_plus(self,
                  path=None,
                  provider=None,
                  catch_exception=True,
                  **kwargs):
        md = Resource(path, provider, **kwargs)

        core_start = timer()
        obj = self.load_dataframe(md, catch_exception, **kwargs)
        core_end = timer()
        if obj is None:
            return obj

        prep_start = timer()
        date_column = '_date' if md['date_partition'] else md['date_column']
        obj = dataframe.filter_by_date(obj, date_column, md['date_start'],
                                       md['date_end'], md['date_window'])

        # partition and sorting (hmmm, needed?)
        if date_column and date_column in obj.columns:
            obj = obj.repartition(date_column)

        if '_updated' in obj.columns:
            obj = obj.sortWithinPartitions(F.desc('_updated'))

        num_rows = obj.count()
        num_cols = len(obj.columns)

        obj = dataframe.cache(obj, md['cache'])

        prep_end = timer()

        log_data = {
            'md': md,
            'mode': kwargs.get('mode',
                               md.get('options', {}).get('mode')),
            'records': num_rows,
            'columns': num_cols,
            'time': prep_end - core_start,
            'time_core': core_end - core_start,
            'time_prep': prep_end - prep_start
        }
        # obj cannot be None here; a failed load returned earlier
        logging.info(log_data)

        obj.__name__ = path
        return obj
Example #13
    def save_parquet(self,
                     obj,
                     path=None,
                     provider=None,
                     *args,
                     mode=None,
                     **kwargs):

        md = Resource(path, provider, format='parquet', mode=mode, **kwargs)
        options = md['options']

        # after collecting from metadata, or method call, define defaults
        options['mode'] = options.get('mode') or 'overwrite'

        local = self.is_spark_local()

        try:
            #three approaches: file-local, local+cluster, and service
            if md['service'] == 'file' and local:
                obj.coalesce(1).write\
                    .format('parquet')\
                    .mode(options['mode'])\
                    .options(**options)\
                    .parquet(md['url'])

            elif md['service'] == 'file':
                if os.path.exists(md['url']) and os.path.isdir(md['url']):
                    shutil.rmtree(md['url'])

                # save with pandas; to_parquet has no mode parameter
                # (the target directory was already removed above)
                obj.toPandas().to_parquet(md['url'])

            elif md['service'] in ['hdfs', 's3a']:
                obj.write\
                     .format('parquet')\
                     .mode(options['mode'])\
                     .options(**options)\
                     .parquet(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
            return False
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return True
Example #14
    def load_csv(self,
                 path=None,
                 provider=None,
                 *args,
                 sep=None,
                 header=None,
                 **kwargs):

        obj = None

        md = Resource(path, provider, sep=sep, header=header, **kwargs)

        options = md['options']

        # after collecting from metadata, or method call, define csv defaults
        # (use get() with a default so an explicit False is not overridden)
        options['header'] = options.get('header', True)
        options['inferSchema'] = options.get('inferSchema', True)
        options['sep'] = options.get('sep', ',')

        local = self.is_spark_local()

        try:
            #three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).csv(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})

                # pandas expects a header row index (or None), not a boolean
                df = pd.read_csv(md['url'],
                                 sep=options['sep'],
                                 header=0 if options['header'] else None)
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).csv(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(e, extra={'md': md})

        return obj
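
A minimal usage sketch; the engine instance and file path are hypothetical, and any option not passed explicitly falls back to the defaults above:

# 'engine' and the path are hypothetical examples
df = engine.load_csv('data/events.tsv', sep='\t', header=True)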
Example #15
    def load_parquet(self,
                     path=None,
                     provider=None,
                     *args,
                     mergeSchema=None,
                     **kwargs):

        obj = None

        md = Resource(path,
                      provider,
                      format='parquet',
                      mergeSchema=mergeSchema,
                      **kwargs)

        options = md['options']

        # after collecting from metadata, or method call, define parquet defaults
        options['mergeSchema'] = options.get('mergeSchema', True)

        local = self.is_spark_local()

        try:
            #three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).parquet(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})
                #fallback to the pandas reader, then convert to spark
                df = pd.read_parquet(md['url'])
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).parquet(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(e, extra={'md': md})

        return obj
Example #16
    def save_jdbc(self,
                  obj,
                  path=None,
                  provider=None,
                  *args,
                  mode=None,
                  **kwargs):
        md = Resource(path, provider, format='jdbc', mode=mode, **kwargs)

        options = md['options']

        # after collecting from metadata, or method call, define jdbc defaults
        options['mode'] = options.get('mode') or 'overwrite'

        try:
            # write through the jdbc connector for the supported database services
            if md['service'] in [
                    'sqlite', 'mysql', 'postgres', 'mssql', 'oracle'
            ]:
                obj.write \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", md['table']) \
                    .option("driver", md['driver']) \
                    .option("user", md['user']) \
                    .option('password', md['password']) \
                    .options(**options) \
                    .mode(options['mode'])\
                    .save()
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
            return False
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return True
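
A minimal usage sketch; the provider alias is hypothetical and is assumed to carry the url, driver, user and password in its metadata:

# 'engine', 'df' and the provider alias are hypothetical examples
ok = engine.save_jdbc(df, 'sales', provider='my_postgres', mode='append')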
Example #17
    def copy(self, md_src, md_trg, mode='append'):

        # timer
        timer_start = timer()

        # src dataframe
        df_src = self.load(md_src)

        # logging
        log_data = {
            'src_hash': md_src['hash'],
            'src_path': md_src['resource_path'],
            'trg_hash': md_trg['hash'],
            'trg_path': md_trg['resource_path'],
            'mode': mode,
            'updated': False,
            'records_read': 0,
            'records_add': 0,
            'records_del': 0,
            'columns': 0,
            'time': timer() - timer_start
        }

        # could not read source, log error and return
        if df_src is None:
            logging.error(log_data)
            return

        num_rows = df_src.count()
        num_cols = len(df_src.columns)

        # empty source, log notice and return
        if num_rows == 0 and mode == 'append':
            log_data['time'] = timer() - timer_start
            logging.notice(log_data)
            return

        # overwrite target, save, log notice/error and return
        if mode == 'overwrite':
            if md_trg['state_column']:
                df_src = df_src.withColumn('_state', F.lit(0))
            if md_trg['version_column']:
                df_src = dataframe.add_version_column(df_src, tzone=self._timezone)
            result = self.save(df_src, md_trg, mode=mode)

            log_data['time'] = timer() - timer_start
            log_data['records_read'] = num_rows
            log_data['records_add'] = num_rows
            log_data['columns'] = num_cols

            logging.notice(log_data) if result else logging.error(log_data)
            return

        # trg dataframe (if exists)
        try:
            if md_trg['version_column']:
                df_trg = self.load_cdc(md_trg, catch_exception=False)
            else:
                df_trg = self.load(md_trg, catch_exception=False)
        except Exception:
            df_trg = dataframe.empty(df_src)

        # if there is schema change, create new version, log notice/error and return
        if not dataframe.compare_schema(df_src, df_trg, ['_date', '_datetime', '_updated', '_hash', '_state', '_version']):
            if md_trg['state_column']:
                df_src = df_src.withColumn('_state', F.lit(0))
            if md_trg['version_column']:
                df_src = dataframe.add_version_column(df_src, tzone=self._timezone)
            result = self.save(df_src, md_trg, mode=mode)

            log_data['time'] = timer() - timer_start
            log_data['records_read'] = num_rows
            log_data['records_add'] = num_rows
            log_data['columns'] = num_cols

            logging.notice(log_data) if result else logging.error(log_data)
            return

        # de-dup (exclude the _updated column)

        # create a view from the extracted log
        df_trg = dataframe.view(df_trg)
                               
        # capture added records
        df_add = dataframe.diff(df_src, df_trg, ['_date', '_datetime', '_updated', '_hash', '_state', '_version'])
        rows_add = df_add.count()

        # capture deleted records
        rows_del = 0
        if md_trg['state_column']:
            df_del = dataframe.diff(df_trg, df_src, ['_date', '_datetime', '_updated', '_hash', '_state', '_version'])
            rows_del = df_del.count()

        updated = (rows_add + rows_del) > 0

        num_cols = len(df_add.columns)
        num_rows = max(df_src.count(), df_trg.count())

        # save diff
        if updated:
            if md_trg['state_column']:
                df_add = df_add.withColumn('_state', F.lit(0))
                df_del = df_del.withColumn('_state', F.lit(1))

                df = df_add.union(df_del)
            else:
                df = df_add
            
            if md_trg['version_column']:
                version = self.find_version(md=md_trg)
                date = datetime.strptime(version, '%Y-%m-%d-%H-%M-%S') if version else None
                df = dataframe.add_version_column(df, version_time=date, tzone=self._timezone)

            result = self.save(df, md_trg, mode=mode)
        else:
            result = True

        log_data.update({
            'updated': updated,
            'records_read': num_rows,
            'records_add': rows_add,
            'records_del': rows_del,
            'columns': num_cols,
            'time': timer() - timer_start
        })

        logging.notice(log_data) if result else logging.error(log_data)
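
A minimal usage sketch of copy() above; the Resource descriptors are hypothetical and would normally come from the project metadata:

# both resource descriptors are hypothetical examples
md_src = Resource('events', provider='my_postgres')
md_trg = Resource('events', provider='my_datalake')

# append only the records added or deleted since the last copy
engine.copy(md_src, md_trg, mode='append')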
Example #18
    def save_dataframe(self, obj, md, **kargs):

        options = md.get('options', {})

        try:
            if md['service'] in ['local', 'file']:
                if md['format'] == 'csv':
                    try:
                        obj.write.options(**options).csv(md['url'], **kargs)
                    except Exception:
                        obj.toPandas().to_csv(md['url'], **kargs)
                elif md['format'] == 'json':
                    try:
                        obj.write.options(**options).json(md['url'], **kargs)
                    except Exception:
                        obj.toPandas().to_json(md['url'], **kargs)
                elif md['format'] == 'jsonl':
                    try:
                        obj.write.options(**options).option('multiLine', True).json(md['url'], **kargs)
                    except Exception:
                        obj.toPandas().to_json(md['url'], orient='records', lines=True, **kargs)
                elif md['format'] == 'parquet':
                    try:
                        obj.write.options(**options).parquet(md['url'], **kargs)
                    except Exception:
                        # pandas to_parquet takes no orient/lines arguments
                        obj.toPandas().to_parquet(md['url'], **kargs)
                else:
                    logging.error({'md': md, 'error_msg': f'Unknown format "{md["format"]}"'})
                    return False

            elif md['service'] in ['hdfs', 'minio']:
                if md['format'] == 'csv':
                    obj.write.options(**options).csv(md['url'], **kargs)
                elif md['format'] == 'json':
                    obj.write.options(**options).json(md['url'], **kargs)
                elif md['format'] == 'jsonl':
                    obj.write.options(**options).option('multiLine', True).json(md['url'], **kargs)
                elif md['format'] == 'parquet':
                    obj.write.options(**options).parquet(md['url'], **kargs)
                else:
                    logging.error({'md': md, 'error_msg': f'Unknown format "{md["format"]}"'})
                    return False

            elif md['service'] in ['sqlite', 'mysql', 'postgres', 'oracle']:
                obj.write \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", md['resource_path']) \
                    .option("driver", md['driver']) \
                    .option("user", md['username']) \
                    .option('password', md['password']) \
                    .options(**options) \
                    .save(**kargs)
                                   
            elif md['service'] == 'mongodb':
                obj.write \
                    .format(md['format']) \
                    .option('spark.mongodb.input.uri', md['url'] + '.' + md['resource_path']) \
                    .options(**options)\
                    .save(**kargs)               

            elif md['service'] == 'elastic':
                mode = kargs.get("mode", None)
                obj = [row.asDict() for row in obj.collect()]
                elastic.write(obj, md['url'], mode, md['resource_path'],
                              options.get('settings'), options.get('mappings'))
            else:
                logging.error({'md': md, 'error_msg': f'Unknown service "{md["service"]}"'})
                return False
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return True
Example #19
    def load_dataframe(self, md, catch_exception=True, **kargs):
        obj = None
        options = md['options']

        try:
            if md['service'] in ['local', 'file']:
                if md['format'] == 'csv':
                    try:
                        obj = self._ctx.read.options(**options).csv(md['url'], **kargs)
                    except Exception:
                        obj = self._ctx.createDataFrame(pd.read_csv(md['url'], **kargs))

                elif md['format'] == 'json':
                    try:
                        obj = self._ctx.read.options(**options).json(md['url'], **kargs)
                    except Exception:
                        obj = self._ctx.createDataFrame(pd.read_json(md['url'], **kargs))
                elif md['format'] == 'jsonl':
                    try:
                        obj = self._ctx.read.option('multiLine', True).options(**options).json(md['url'], **kargs)
                    except Exception:
                        obj = self._ctx.createDataFrame(pd.read_json(md['url'], lines=True, **kargs))
                elif md['format'] == 'parquet':
                    try:
                        obj = self._ctx.read.options(**options).parquet(md['url'], **kargs)
                    except Exception:
                        obj = self._ctx.createDataFrame(pd.read_parquet(md['url'], **kargs))
                else:
                    logging.error({'md': md, 'error_msg': f'Unknown format "{md["format"]}"'})
                    return None

            elif md['service'] in ['hdfs', 'minio']:
                if md['format'] == 'csv':
                    obj = self._ctx.read.options(**options).csv(md['url'], **kargs)
                elif md['format'] == 'json':
                    obj = self._ctx.read.options(**options).json(md['url'], **kargs)
                elif md['format'] == 'jsonl':
                    obj = self._ctx.read.option('multiLine', True).options(**options).json(md['url'], **kargs)
                elif md['format'] == 'parquet':
                    obj = self._ctx.read.options(**options).parquet(md['url'], **kargs)
                else:
                    logging.error({'md': md, 'error_msg': f'Unknown format "{md["format"]}"'})
                    return None

            elif md['service'] in ['sqlite', 'mysql', 'postgres', 'mssql', 'oracle']:

                obj = self._ctx.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", md['resource_path']) \
                    .option("driver", md['driver']) \
                    .option("user", md['username']) \
                    .option('password', md['password']) \
                    .options(**options)

                # load the data from jdbc
                obj = obj.load(**kargs)
                
                                   
            elif md['service'] == 'mongodb':
                obj = self._ctx.read \
                    .format(md['format']) \
                    .option('spark.mongodb.input.uri', md['url'] + '.' + md['resource_path']) \
                    .options(**options)
                                   
                # load the data                
                obj = obj.load(**kargs)
                                   
            elif md['service'] == 'elastic':
                results = elastic.read(md['url'], options.get('query', {}))
                rows = [pyspark.sql.Row(**r) for r in results]
                obj = self._ctx.createDataFrame(rows)
            else:
                logging.error({'md': md, 'error_msg': f'Unknown service "{md["service"]}"'})
        except Exception as e:
            if catch_exception:
                logging.error({'md': md, 'error': str(e)})
                return None
            else:
                raise e

        return obj
Example #20
    def list(self, provider, path=''):

        df_schema = T.StructType([
                T.StructField('name',T.StringType(),True),
                T.StructField('type',T.StringType(),True)])

        df_empty = self._ctx.createDataFrame(data=(), schema=df_schema)
                      
        if isinstance(provider, YamlDict):
            md = provider.to_dict()
        elif isinstance(provider, str):
            md = resource.metadata(self._rootdir, self._metadata, None, provider)
        elif isinstance(provider, dict):
            md = provider
        else:
            logging.warning(f'{str(provider)} cannot be used to reference a provider')
            return df_empty

        try:
            if md['service'] in ['local', 'file']:
                lst = []
                rootpath = os.path.join(md['provider_path'], path)
                for f in os.listdir(rootpath):
                    fullpath = os.path.join(rootpath, f)
                    # check links first: isfile/isdir follow symlinks
                    if os.path.islink(fullpath):
                        obj_type='LINK'
                    elif os.path.ismount(fullpath):
                        obj_type='MOUNT'
                    elif os.path.isdir(fullpath):
                        obj_type='DIRECTORY'
                    elif os.path.isfile(fullpath):
                        obj_type='FILE'
                    else:
                        obj_type='UNDEFINED'
                
                    obj_name = f
                    lst += [(obj_name, obj_type)]
                return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty
            elif md['service'] in ['hdfs', 'minio']:
                sc = self._ctx._sc
                URI = sc._gateway.jvm.java.net.URI
                Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
                FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
                fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())

                provider_path = md['provider_path'] if md['service'] == 'hdfs' else '/'
                obj = fs.listStatus(Path(os.path.join(provider_path, path)))
                
                lst = []
                
                for i in range(len(obj)):
                    if obj[i].isFile():
                        obj_type='FILE'
                    elif obj[i].isDirectory():
                        obj_type='DIRECTORY'
                    else:
                        obj_type='UNDEFINED'
                
                    obj_name = obj[i].getPath().getName()
                    lst += [(obj_name, obj_type)]
                return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty
            elif md['format'] == 'jdbc':
                # remove options from database, if any
                database = md["database"].split('?')[0]
                schema = md['schema']
                if md['service'] == 'mssql':
                    query = f"""
                        ( SELECT table_name, table_type 
                          FROM INFORMATION_SCHEMA.TABLES 
                          WHERE table_schema='{schema}'
                        ) as query
                        """
                elif md['service'] == 'oracle':
                    # all_tables has an owner column (not table_schema) and no table_type
                    query = f"""
                        ( SELECT table_name, 'TABLE' as table_type
                          FROM all_tables
                          WHERE owner='{schema}'
                        ) as query
                        """
                elif md['service'] == 'mysql':
                    query = f"""
                        ( SELECT table_name, table_type 
                          FROM information_schema.tables 
                          WHERE table_schema='{schema}'
                        ) as query
                        """
                elif md['service'] == 'postgres':
                    query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables 
                          WHERE table_schema = '{schema}'
                        ) as query
                        """
                else:
                    # vanilla query ... for other databases
                    query = """
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                            ) as query
                            """

                obj = self._ctx.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", query) \
                    .option("driver", md['driver']) \
                    .option("user", md['username']) \
                    .option('password', md['password']) \
                    .load()

                # collect (name, type) pairs from the query result
                # (positional access avoids column-name case mismatches across databases)
                lst = [(x[0], x[1]) for x in obj.collect()]
                return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty
            else:
                logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
                return df_empty
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

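
A minimal usage sketch; the engine instance and provider alias are hypothetical. The result is a small Spark DataFrame with name and type columns:

# 'engine' and 'my_datalake' are hypothetical examples
listing = engine.list('my_datalake', path='raw/events')
listing.show()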
Example #21
    def save_json(self,
                  obj,
                  path=None,
                  provider=None,
                  *args,
                  mode=None,
                  lines=None,
                  **kwargs):

        md = Resource(path,
                      provider,
                      format='json',
                      mode=mode,
                      lines=lines,
                      **kwargs)

        options = md['options']

        # after collecting from metadata, or method call, define json defaults
        options['mode'] = options.get('mode') or 'overwrite'
        options['lines'] = options.get('lines', True)

        local = self.is_spark_local()

        try:
            #three approaches: local, cluster, and service
            if local and md['service'] == 'file' and options['lines']:
                obj.coalesce(1).write\
                    .format('json')\
                    .mode(options['mode'])\
                    .options(**options)\
                    .json(md['url'])
                self.directory_to_file(md['url'])

            elif md['service'] == 'file':
                # fallback, use pandas
                # save single files, not directories
                if os.path.exists(md['url']) and os.path.isdir(md['url']):
                    shutil.rmtree(md['url'])

                # save with pandas; to_json takes no 'overwrite' mode,
                # and json lines require orient='records'
                obj.toPandas().to_json(md['url'],
                                       orient='records',
                                       lines=options['lines'])

            elif md['service'] in ['hdfs', 's3a']:
                obj.write\
                    .format('json')\
                    .mode(options['mode'])\
                    .options(**options)\
                    .json(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
            return False
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return True
Example #22
    def copy(self, md_src, md_trg, mode='append'):
        # timer
        timer_start = timer()

        # src dataframe
        df_src = self.load(md_src)

        # if not path on target, get it from src
        if not md_trg['resource_path']:
            md_trg = resource.metadata(self._rootdir, self._metadata,
                                       md_src['resource_path'],
                                       md_trg['provider_alias'])

        # logging
        log_data = {
            'src_hash': md_src['hash'],
            'src_path': md_src['resource_path'],
            'trg_hash': md_trg['hash'],
            'trg_path': md_trg['resource_path'],
            'mode': mode,
            'updated': False,
            'records_read': 0,
            'records_add': 0,
            'records_del': 0,
            'columns': 0,
            'time': timer() - timer_start
        }

        # could not read source, log error and return
        if df_src is None:
            logging.error(log_data)
            return

        num_rows = df_src.count()
        num_cols = len(df_src.columns)

        # empty source, log notice and return
        if num_rows == 0 and mode == 'append':
            log_data['time'] = timer() - timer_start
            logging.notice(log_data)
            return

        # overwrite target, save, log notice/error and return
        if mode == 'overwrite':
            if md_trg['state_column']:
                df_src = df_src.withColumn('_state', F.lit(0))

            result = self.save(df_src, md_trg, mode=mode)

            log_data['time'] = timer() - timer_start
            log_data['records_read'] = num_rows
            log_data['records_add'] = num_rows
            log_data['columns'] = num_cols

            logging.notice(log_data) if result else logging.error(log_data)
            return

        # trg dataframe (if exists)
        try:
            df_trg = self.load(md_trg, catch_exception=False)
        except Exception:
            df_trg = dataframe.empty(df_src)

        # de-dup (exclude the _updated column)

        # create a view from the extracted log
        df_trg = dataframe.view(df_trg)

        # capture added records
        df_add = dataframe.diff(
            df_src, df_trg,
            ['_date', '_datetime', '_updated', '_hash', '_state'])
        rows_add = df_add.count()

        # capture deleted records
        rows_del = 0
        if md_trg['state_column']:
            df_del = dataframe.diff(
                df_trg, df_src,
                ['_date', '_datetime', '_updated', '_hash', '_state'])
            rows_del = df_del.count()

        updated = (rows_add + rows_del) > 0

        num_cols = len(df_add.columns)
        num_rows = max(df_src.count(), df_trg.count())

        # save diff
        if updated:
            if md_trg['state_column']:
                df_add = df_add.withColumn('_state', F.lit(0))
                df_del = df_del.withColumn('_state', F.lit(1))

                df = df_add.union(df_del)
            else:
                df = df_add

            result = self.save(df, md_trg, mode=mode)
        else:
            result = True

        log_data.update({
            'updated': updated,
            'records_read': num_rows,
            'records_add': rows_add,
            'records_del': rows_del,
            'columns': num_cols,
            'time': timer() - timer_start
        })

        logging.notice(log_data) if result else logging.error(log_data)