Example #1
    def load_scd(self,
                 path,
                 provider,
                 format=None,
                 merge_on=None,
                 version=None,
                 where=None,
                 **kwargs):
        where = where or []
        where = where if isinstance(where, (list, tuple)) else [where]

        obj = self.load(path, provider, format=format, **kwargs)

        # push filters down as early as possible
        for predicate in where:
            obj = obj.filter(predicate)

        # create view from history log
        return dataframe.view(obj, merge_on=merge_on, version=version)
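Both `load_scd` above and the `copy`/`save_scd` examples below lean on a `dataframe.view` helper that collapses the append-only change log into the current snapshot. That helper is not shown on this page; the following is a minimal, hypothetical sketch of what it could look like in PySpark, assuming the log carries the `_updated` timestamp and `_state` flag (0 = live, 1 = deleted) used throughout these examples:

    import pyspark.sql.functions as F
    from pyspark.sql import DataFrame, Window

    def view(df: DataFrame, merge_on=None, version=None) -> DataFrame:
        # hypothetical sketch: collapse an SCD change log into a snapshot
        if version is not None:
            # time travel: ignore log entries written after `version`
            df = df.filter(F.col('_updated') <= F.lit(version))

        if merge_on is not None:
            keys = merge_on if isinstance(merge_on, (list, tuple)) else [merge_on]
            # keep only the latest log entry per business key
            w = Window.partitionBy(*keys).orderBy(F.col('_updated').desc())
            df = (df.withColumn('_rn', F.row_number().over(w))
                    .filter('_rn = 1')
                    .drop('_rn'))

        # hide records whose latest state is "deleted"
        return df.filter(F.col('_state') == 0)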
Example #2
    def copy(self, md_src, md_trg, mode='append'):
        # timer
        timer_start = timer()

        # src dataframe
        df_src = self.load(md_src)

        # if the target has no resource path, derive it from the source
        if not md_trg['resource_path']:
            md_trg = resource.metadata(self._rootdir, self._metadata,
                                       md_src['resource_path'],
                                       md_trg['provider_alias'])

        # logging
        log_data = {
            'src_hash': md_src['hash'],
            'src_path': md_src['resource_path'],
            'trg_hash': md_trg['hash'],
            'trg_path': md_trg['resource_path'],
            'mode': mode,
            'updated': False,
            'records_read': 0,
            'records_add': 0,
            'records_del': 0,
            'columns': 0,
            'time': timer() - timer_start
        }

        # could not read source, log error and return
        if df_src is None:
            logging.error(log_data)
            return

        num_rows = df_src.count()
        num_cols = len(df_src.columns)

        # empty source, log notice and return
        if num_rows == 0 and mode == 'append':
            log_data['time'] = timer() - timer_start
            logging.notice(log_data)
            return

        # overwrite target, save, log notice/error and return
        if mode == 'overwrite':
            if md_trg['state_column']:
                df_src = df_src.withColumn('_state', F.lit(0))

            result = self.save(df_src, md_trg, mode=mode)

            log_data['time'] = timer() - timer_start
            log_data['records_read'] = num_rows
            log_data['records_add'] = num_rows
            log_data['columns'] = num_cols

            if result:
                logging.notice(log_data)
            else:
                logging.error(log_data)
            return

        # trg dataframe (if exists)
        try:
            df_trg = self.load(md_trg, catch_exception=False)
        except Exception:
            df_trg = dataframe.empty(df_src)

        # create a view from the extracted log
        # (de-duplicates records, excluding the _updated column)
        df_trg = dataframe.view(df_trg)

        # capture added records
        df_add = dataframe.diff(
            df_src, df_trg,
            ['_date', '_datetime', '_updated', '_hash', '_state'])
        rows_add = df_add.count()

        # capture deleted records
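        # (when a state column is tracked, vanished records are re-appended
        #  later with _state = 1 rather than physically removed)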
        rows_del = 0
        if md_trg['state_column']:
            df_del = dataframe.diff(
                df_trg, df_src,
                ['_date', '_datetime', '_updated', '_hash', '_state'])
            rows_del = df_del.count()

        updated = (rows_add + rows_del) > 0

        num_cols = len(df_add.columns)
        num_rows = max(df_src.count(), df_trg.count())

        # save diff
        if updated:
            if md_trg['state_column']:
                df_add = df_add.withColumn('_state', F.lit(0))
                df_del = df_del.withColumn('_state', F.lit(1))

                df = df_add.union(df_del)
            else:
                df = df_add

            result = self.save(df, md_trg, mode=mode)
        else:
            result = True

        log_data.update({
            'updated': updated,
            'records_read': num_rows,
            'records_add': rows_add,
            'records_del': rows_del,
            'columns': num_cols,
            'time': timer() - timer_start
        })

        if result:
            logging.notice(log_data)
        else:
            logging.error(log_data)
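The change capture in `copy` hinges on `dataframe.diff`, which is likewise external to these snippets. Assuming it returns the rows of the first frame that are absent from the second, ignoring the listed housekeeping columns, a minimal sketch could be:

    from pyspark.sql import DataFrame

    def diff(df_a: DataFrame, df_b: DataFrame, exclude=None) -> DataFrame:
        # hypothetical sketch: rows of df_a that do not appear in df_b,
        # compared on all columns except the excluded housekeeping ones
        exclude = set(exclude or [])
        cols = [c for c in df_a.columns if c not in exclude]
        return df_a.select(cols).subtract(df_b.select(cols))

Under this reading, `rows_add` and `rows_del` count the records present on only one side of the comparison, and the union written back re-tags them with `_state`.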
Example #3
    def save_scd(self,
                 obj,
                 path=None,
                 provider=None,
                 *args,
                 format=None,
                 mode=None,
                 merge_on=None,
                 where=None,
                 **kwargs):

        result = True
        md = Resource(path, provider, format=format, mode=mode, **kwargs)

        options = md['options']

        # defaults, applied after collecting options from metadata or the method call
        options['mode'] = options.get('mode', None) or 'append'
        format = md['format'] or 'parquet'

        where = where or []
        where = where if isinstance(where, (list, tuple)) else [where]

        ts_start = timer()

        num_rows = obj.count()
        num_cols = len(obj.columns)

        # empty source in append mode: nothing to write, return early
        if num_rows == 0 and options['mode'] == 'append':
            return True

        # overwrite mode: reset the state column, save, log and return
        if options['mode'] == 'overwrite':
            obj = obj.withColumn('_state', F.lit(0))
            obj = dataframe.add_update_column(obj, '_updated')

            result = self.save(obj, md, mode=options['mode'])
            self.save_log(md, options, ts_start)
            return result

        # append
        df_src = obj

        # trg dataframe (if exists)
        df_trg = self.load(md, format=format) or dataframe.empty(df_src)

        if '_state' not in df_trg.columns:
            df_trg = df_trg.withColumn('_state', F.lit(0))

        if '_updated' not in df_trg.columns:
            df_trg = dataframe.add_update_column(df_trg, '_updated')

        # filter src and trg (mainly for speed: the diff then compares
        # only the selected portion of the records)
        for predicate in where:
            df_src = df_src.filter(predicate)
            df_trg = df_trg.filter(predicate)

        # create a view from the extracted log
        df_trg = dataframe.view(df_trg, merge_on=merge_on)

        # schema change: add new columns
        added_cols = set(df_src.columns) - set(df_trg.columns)
        added_cols = {
            x.name: x.dataType
            for x in list(df_src.schema) if x.name in added_cols
        }
        for c, t in added_cols.items():
            df_trg = df_trg.withColumn(c, F.lit(None).cast(t))

        # schema change: removed columns
        # no need to do anything, diff will take care of that

        # capture added records
        df_add = dataframe.diff(df_src, df_trg, ['_updated', '_state'])

        # capture deleted records
        df_del = dataframe.diff(df_trg, df_src, ['_updated', '_state'])

        # capture updated records
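        # a changed record surfaces in both diffs: its new version in df_add,
        # its old version in df_del; joining on the merge key counts it once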
        cnt_upd = 0
        if merge_on is not None:
            on = merge_on if isinstance(merge_on, (list, tuple)) else [merge_on]
            cnt_upd = df_add.join(df_del, on=on).count()

        cnt_del = df_del.count() - cnt_upd
        cnt_add = df_add.count() - cnt_upd

        logging.notice(
            f'merge on={merge_on}, updated={cnt_upd}, added={cnt_add}, deleted={cnt_del}'
        )

        df_add = df_add.withColumn('_state', F.lit(0))
        df_del = df_del.withColumn('_state', F.lit(1))

        df = df_add.union(df_del)
        df = dataframe.add_update_column(df, '_updated')

        result = self.save(df, md, format=format, **options)
        return result
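Taken together, these methods implement a slowly changing dimension (SCD) log: `save_scd` appends row-level adds and deletes tagged with `_state` and `_updated`, and `load_scd` replays the log into a snapshot. A hypothetical call sequence follows; the engine class, paths, provider alias, and key column are placeholders, not part of the snippets above:

    # hypothetical usage; `engine` exposes the methods shown above
    engine = SparkEngine()  # placeholder for the surrounding class

    # append today's extract to the change log, keyed by customer_id
    engine.save_scd(df_customers,
                    path='data/customers_scd',
                    provider='local',
                    mode='append',
                    merge_on='customer_id')

    # read the current snapshot (latest surviving record per key)
    snapshot = engine.load_scd('data/customers_scd',
                               'local',
                               merge_on='customer_id')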