Пример #1
0
def alert_on_z_score(df, check, alert_type, checked_txt, conf):
    """Raise an alert when the newest ``result`` value is a z-score outlier.

    Rows whose ``result`` is null are ignored.  The function returns without
    doing anything when fewer than two rows remain, when the z-score of the
    last row is NaN, or when its magnitude does not exceed
    ``settings.ACCEPTABLE_Z_SCORE_DIFF``.  Otherwise an ``Alert`` for
    ``check.table_id`` stamped with ``conf.for_time`` is added to
    ``metrics_session`` and committed.
    """
    known = df[df["result"].notnull()]
    if len(known) < 2:
        return

    values = known["result"]
    z_score = stats.zscore(values)[-1]
    newest = values.iloc[-1]

    if math.isnan(z_score):
        return
    if abs(z_score) <= settings.ACCEPTABLE_Z_SCORE_DIFF:
        return

    # The sign of the z-score tells which side of the expected range we left.
    if z_score > 0:
        direction = "above"
    else:
        direction = "below"

    # The literal's internal whitespace is part of the stored alert text.
    message = f"""
                {checked_txt},
                {direction} expected range, value: {newest}, z_score: {z_score:.2f}
            """

    alert = Alert(
        text=message,
        severity=2,
        table_id=check.table_id,
        alert_type=alert_type,
        created_at=conf.for_time,
    )
    metrics_session.add(alert)
    metrics_session.commit()
Пример #2
0
def add_run():
    """Queue a new scheduled run stamped with the current UTC time.

    The run starts in the 'not started' state; it is added to
    ``metrics_session`` and committed immediately.
    """
    pending_run = Run(
        for_date=datetime.utcnow(),
        status='not started',
        run_type='scheduled',
    )

    session = metrics_session
    session.add(pending_run)
    session.commit()
Пример #3
0
def create_column_checks(db, table):
    """Create and persist the ``column_values`` check for ``table``.

    Every column from ``table.schema["columns"]`` is considered unless its
    name is listed in ``settings.SKIP_COLUMNS`` or its type is neither one of
    ``db.numeric_types()`` nor one of ``db.character_types()``.  Numeric
    columns get a copy of ``Metric.FOR_NUMERICAL_COL``, character columns a
    copy of ``Metric.FOR_TEXT_COL``.  The resulting ``Check`` is added to
    ``metrics_session`` and committed.
    """
    metrics_by_column = {}

    for column in table.schema["columns"]:
        if column["name"] in settings.SKIP_COLUMNS:
            continue

        column_type = column["type"]
        supported_types = db.numeric_types() + db.character_types()
        if column_type not in supported_types:
            continue

        if column_type in db.numeric_types():
            selected = list(Metric.FOR_NUMERICAL_COL)
        elif column_type in db.character_types():
            selected = list(Metric.FOR_TEXT_COL)
        else:
            selected = []

        metrics_by_column[column["name"]] = selected

    query = {
        "type": "standard",
        "path": "redata.checks.data_values.check_column_values",
        "params": {"time_interval": "1 day"},
    }
    column_check = Check(
        table_id=table.id,
        name="column_values",
        metrics=metrics_by_column,
        query=query,
    )

    metrics_session.add(column_check)
    metrics_session.commit()
Пример #4
0
def alert_for_schema_change(db, check, conf):
    """Turn schema-change results from the last day into alerts.

    Results are fetched with ``get_last_results`` for the
    ``Metric.SCHEMA_CHANGE`` table metric.  The first value of every row is
    parsed as JSON; entries whose operation is "table detected" are skipped.
    Each remaining entry becomes an ``Alert`` that is added to
    ``metrics_session`` and committed individually.
    """
    recent_changes = get_last_results(
        db, check, Metric.TABLE_METRIC, Metric.SCHEMA_CHANGE, conf, days=1
    )

    for _, row in recent_changes.iterrows():
        change = json.loads(row[0])
        operation = change["operation"]

        if operation == "table detected":
            continue

        # The literal's internal whitespace is part of the stored alert text.
        message = f"""
                schema change detected - {operation}: {change['column_name']}
            """

        alert = Alert(
            text=message,
            severity=2,
            table_id=check.table_id,
            alert_type=check.name,
            created_at=conf.for_time,
        )
        metrics_session.add(alert)
        metrics_session.commit()
Пример #5
0
def create_column_checks(db, table):
    """Register the 'column_values' check covering the columns of ``table``.

    Columns named in ``settings.SKIP_COLUMNS`` and columns whose type is not
    in ``db.numeric_types()`` or ``db.character_types()`` are left out.  The
    metric list for a column is a copy of ``Metric.FOR_NUMERICAL_COL`` for
    numeric types and of ``Metric.FOR_TEXT_COL`` for character types.  The
    new ``Check`` is added to ``metrics_session`` and committed.
    """
    per_column = {}

    for col_def in table.schema['columns']:
        col_name = col_def['name']
        if col_name in settings.SKIP_COLUMNS:
            continue

        col_type = col_def['type']
        if col_type not in db.numeric_types() + db.character_types():
            continue

        chosen = []
        if col_type in db.numeric_types():
            chosen = list(Metric.FOR_NUMERICAL_COL)
        elif col_type in db.character_types():
            chosen = list(Metric.FOR_TEXT_COL)

        per_column[col_name] = chosen

    check_query = {
        'type': 'standard',
        'path': 'redata.checks.data_values.check_column_values',
        'params': {'time_interval': '1 day'},
    }

    metrics_session.add(
        Check(
            table_id=table.id,
            name='column_values',
            metrics=per_column,
            query=check_query,
        )
    )
    metrics_session.commit()
Пример #6
0
def alert_on_z_score(df, table, check_col, alert_type, checked_txt):
    """Raise an alert when the newest ``check_col`` value is a z-score outlier.

    Rows with a null ``check_col`` are ignored.  Nothing happens when fewer
    than two rows remain, when the last z-score is NaN, or when its magnitude
    does not exceed ``settings.ACCEPTABLE_Z_SCORE_DIFF``.  Otherwise a notice
    is printed and an ``Alert`` for ``table.id`` is added to
    ``metrics_session`` and committed.
    """
    usable = df[df[check_col].notnull()]
    if len(usable) < 2:
        return

    series = usable[check_col]
    z_score = stats.zscore(series)[-1]
    latest = series.iloc[-1]

    if math.isnan(z_score):
        return
    if abs(z_score) <= settings.ACCEPTABLE_Z_SCORE_DIFF:
        return

    if z_score > 0:
        side = 'above'
    else:
        side = 'below'

    print(f"Adding alert about table {table.table_name}")

    # The literal's internal whitespace is part of the stored alert text.
    message = f"""
                {checked_txt},
                {side} expected range, value: {latest}, z_score: {z_score:.2f}
            """

    alert = Alert(
        text=message,
        severity=2,
        table_id=table.id,
        alert_type=alert_type,
    )
    metrics_session.add(alert)
    metrics_session.commit()
Пример #7
0
def insert_schema_changed_record(table, operation, column_name, column_type,
                                 column_count):
    """Persist one schema-change record for ``table``.

    A ``MetricsSchemaChanges`` row is built from the arguments (keyed by
    ``table.id``), added to ``metrics_session`` and committed.
    """
    record_fields = {
        'table_id': table.id,
        'operation': operation,
        'column_name': column_name,
        'column_type': column_type,
        'column_count': column_count,
    }
    record = MetricsSchemaChanges(**record_fields)

    metrics_session.add(record)
    metrics_session.commit()
Пример #8
0
def check_data_volume(db, table, time_interval):
    """Store the row count reported by ``db.check_data_volume``.

    The ``count`` attribute of the result is saved in a ``MetricsDataVolume``
    row for ``table.id`` and ``time_interval``; the row is added to
    ``metrics_session`` and committed.
    """
    volume = db.check_data_volume(table, time_interval)

    record = MetricsDataVolume(
        table_id=table.id,
        time_interval=time_interval,
        count=volume.count,
    )

    session = metrics_session
    session.add(record)
    session.commit()
Пример #9
0
    def setup_for_source_table(cls, db, db_table_name):
        """Register ``db_table_name`` for monitoring and pick its time column.

        Candidate columns are those whose type is in ``db.datetime_types()``
        and whose name does not match
        ``settings.REDATA_TIME_COL_BLACKLIST_REGEX``.  Only candidates whose
        maximum timestamp is at or before "now" are kept, ordered by latest
        maximum first; a column whose name contains 'creat'
        (case-insensitive) is preferred.

        Returns the new ``MonitoredTable`` after adding it to
        ``metrics_session`` and committing, or ``None`` when no usable
        column was found (the table is not stored in that case).
        """
        print(f"Running setup for {db_table_name}")

        valid_types = db.datetime_types()
        schema_cols = get_current_table_schema(db, db_table_name)

        table = MonitoredTable(
            table_name=db_table_name,
            schema={'columns': schema_cols},
            source_db=db.name
        )

        # heuristics to find best column to sort by when computing stats about data
        # TODO: could probably look up in a provided table of regex + score, with higher scored matches being preferred

        # list all date/timestamp columns, filtering out anything that's blacklisted in configuration
        blacklist_regex = settings.REDATA_TIME_COL_BLACKLIST_REGEX
        matching_cols = [
            col['name']
            for col in schema_cols
            if col['type'] in valid_types
            and re.search(blacklist_regex, col['name']) is None
        ]

        # from matches, collect time cols that have max values at or before "now"
        cols_by_ts = defaultdict(list)
        now_ts = datetime.datetime.now()
        for col in matching_cols:
            max_ts = db.get_max_timestamp(table, col)
            # NOTE(review): presumably get_max_timestamp yields None for a
            # column with no values - confirm against the db implementation.
            # Comparing None with a datetime raises TypeError, so such
            # columns are treated as non-candidates instead of crashing.
            if max_ts is not None and max_ts <= now_ts:
                cols_by_ts[max_ts].append(col)

        # list of all viable candidates, ordered by latest timestamp first
        candidates = [
            col
            for _, cols in sorted(cols_by_ts.items(), reverse=True)
            for col in cols
        ]

        if not candidates:
            # no columns found? ignore table..
            # TODO: add it, but set to disabled, for screening via web UI when we have one
            print(f"Not found column to sort by for {db_table_name}, skipping it for now")
            return None

        # list of preferred columns out of the viable ones, by name filtering
        preferred = [col for col in candidates if 'creat' in col.lower()]

        # if multiple columns found, primarily select from 'preferred' if exists, then set up the table
        col_name = preferred[0] if preferred else candidates[0]
        col_type = [col['type'] for col in schema_cols if col['name'] == col_name][0]

        if len(candidates) > 1:
            print(f"Found multiple columns to sort by {candidates}, choosing {col_name}, please update in DB if needed")
        else:
            print(f"Found column to sort by {col_name}")

        table.time_column = col_name
        table.time_column_type = col_type

        metrics_session.add(table)
        metrics_session.commit()
        return table
Пример #10
0
def add_run():
    """Queue a new scheduled scan whose start and end are both "now" (UTC).

    ``datetime.utcnow()`` is evaluated separately for the start and the end
    date.  The scan starts in the "not started" state; it is added to
    ``metrics_session`` and committed.
    """
    scan_fields = {
        "start_date": datetime.utcnow(),
        "end_date": datetime.utcnow(),
        "status": "not started",
        "run_type": "scheduled",
    }
    pending_scan = Scan(**scan_fields)

    metrics_session.add(pending_scan)
    metrics_session.commit()
Пример #11
0
def check_data_delayed(db, table, conf):
    """Record how far behind the data in ``table`` is.

    ``db.check_data_delayed(table, conf)`` is expected to return a sequence
    whose first item is the delay (an object with ``total_seconds()``) or
    ``None`` when no delay could be computed.  When a delay is present it is
    stored, in seconds, as a ``MetricsDataDelay`` row stamped with
    ``conf.for_time``; the row is added to ``metrics_session`` and committed.
    """
    result = db.check_data_delayed(table, conf)
    delay = result[0]

    # Test against None explicitly: a zero-length timedelta is falsy, so a
    # plain truthiness check would silently drop a legitimate 0-second delay.
    if delay is not None:
        metric = MetricsDataDelay(table_id=table.id,
                                  value=delay.total_seconds(),
                                  created_at=conf.for_time)

        metrics_session.add(metric)
        metrics_session.commit()
Пример #12
0
def check_data_delayed(db, table):
    """Record how far behind the data in ``table`` is.

    ``db.check_data_delayed(table)`` is expected to return a sequence whose
    first item is the delay (an object with ``total_seconds()``) or ``None``
    when no delay could be computed.  When a delay is present it is stored,
    in seconds, as a ``MetricsDataDelay`` row that is added to
    ``metrics_session`` and committed.
    """
    result = db.check_data_delayed(table)
    delay = result[0]

    # Test against None explicitly: a zero-length timedelta is falsy, so a
    # plain truthiness check would silently drop a legitimate 0-second delay.
    if delay is not None:
        metric = MetricsDataDelay(
            table_id=table.id,
            value=delay.total_seconds()
        )

        metrics_session.add(metric)
        metrics_session.commit()
Пример #13
0
def check_generic(func_name, db, table, checked_column, time_interval):
    """Run the generic check ``func_name`` on a column and store its value.

    The ``value`` attribute of ``db.check_generic(...)`` is saved as a
    ``MetricsDataValues`` row named ``check_<func_name>``; the row is added
    to ``metrics_session`` and committed.
    """
    outcome = db.check_generic(func_name, table, checked_column, time_interval)
    check_name = f'check_{func_name}'

    record = MetricsDataValues(
        table_id=table.id,
        column_name=checked_column,
        check_name=check_name,
        check_value=outcome.value,
        time_interval=time_interval,
    )

    metrics_session.add(record)
    metrics_session.commit()
Пример #14
0
def check_count_nulls(db, table, checked_column, time_interval):
    """Store the value reported by ``db.check_count_nulls`` for a column.

    The ``value`` attribute of the result is saved as a ``MetricsDataValues``
    row named 'check_count_nulls'; the row is added to ``metrics_session``
    and committed.
    """
    outcome = db.check_count_nulls(table, checked_column, time_interval)

    record_fields = {
        'table_id': table.id,
        'column_name': checked_column,
        'check_name': 'check_count_nulls',
        'check_value': outcome.value,
        'time_interval': time_interval,
    }

    metrics_session.add(MetricsDataValues(**record_fields))
    metrics_session.commit()
Пример #15
0
def check_if_schema_changed(db, table, check, conf):
    """Compare the stored schema of ``table`` with the live one.

    The comparison ignores column order.  For every removed, added or
    retyped column a notice is printed and a record built by
    ``schema_changed_record`` is collected.  When the schemas differ, the
    stored schema is replaced with the live one and ``metrics_session`` is
    committed.

    Returns the list of collected records (empty when nothing changed).
    ``check`` is accepted but not used.
    """
    def types_by_name(schema):
        return {column["name"]: column["type"] for column in schema}

    def canonical(schema):
        # Order-independent form so that a mere reordering is not a change.
        return sorted(schema, key=lambda column: sorted(column.items()))

    stored_schema = table.schema["columns"]
    live_schema = db.get_table_schema(table.table_name, table.namespace)
    records = []

    if canonical(stored_schema) == canonical(live_schema):
        return records

    old_types = types_by_name(stored_schema)
    new_types = types_by_name(live_schema)
    column_count = len(new_types)

    for name, old_type in old_types.items():
        if name in new_types:
            continue
        print(f"{name} was removed from schema")
        records.append(
            schema_changed_record("column removed", name, old_type,
                                  column_count, conf))

    for name, new_type in new_types.items():
        if name not in old_types:
            print(f"{name} was added to schema")
            records.append(
                schema_changed_record("column added", name, new_type,
                                      column_count, conf))
            continue

        old_type = old_types[name]
        if new_type != old_type:
            print(
                f"Type of column: {name} changed from {old_type} to {new_type}"
            )
            records.append(
                schema_changed_record("column changed", name, new_type,
                                      column_count, conf))

    table.schema = {"columns": live_schema}
    metrics_session.commit()

    return records
Пример #16
0
def check_count_per_value(db, table, checked_column, time_interval):
    """Store one count metric per distinct value of ``checked_column``.

    Each row returned by ``db.check_count_per_value`` (a falsy result is
    treated as no rows) becomes a ``MetricsDataValues`` entry carrying the
    row's ``value`` and ``count``.  All entries are added to
    ``metrics_session`` and committed together at the end.
    """
    rows = db.check_count_per_value(table, checked_column, time_interval) or []

    for entry in rows:
        record = MetricsDataValues(
            table_id=table.id,
            column_name=checked_column,
            column_value=entry.value,
            check_name='check_count_per_value',
            check_value=entry.count,
            time_interval=time_interval,
        )
        metrics_session.add(record)

    metrics_session.commit()
Пример #17
0
    def create_admin_user_if_not_exist(cls):
        """Create the admin account named by the environment if it is missing.

        Requires the ``REDATA_ADMIN_USER`` and ``REDATA_ADMIN_PASSWORD``
        environment variables (checked with ``assert``).  When no row with
        that login exists, a new one is stored with the password passed
        through ``generate_password_hash``, committed, and a notice is
        printed.
        """
        admin_login = os.environ.get('REDATA_ADMIN_USER')
        assert admin_login, 'please set env variable for admin user'

        admin_password = os.environ.get('REDATA_ADMIN_PASSWORD')
        assert admin_password, 'please set env variable for admin password'

        existing = metrics_session.query(cls).filter(cls.login == admin_login)
        if existing.count():
            return

        admin = cls(
            login=admin_login,
            password=generate_password_hash(admin_password),
        )
        metrics_session.add(admin)
        metrics_session.commit()

        print("Created admin user")
Пример #18
0
def process_run():
    """Execute the next not-started run, if there is one.

    The run is marked 'pending' and committed, then for every source
    database the new-table detection, the checks and the alert computation
    are executed with a ``Conf`` built from ``run.for_date``.  Afterwards
    Grafana is regenerated and the run is marked 'success' and committed.
    """
    run = Run.get_not_started_run()
    if run is None:
        return

    run.status = 'pending'
    metrics_session.commit()

    conf = Conf(run.for_date)

    for source_db in DataSource.source_dbs():
        steps = (run_check_for_new_tables, run_checks, run_compute_alerts)
        for step in steps:
            step(source_db, conf)

    generate_grafana()

    run.status = 'success'
    metrics_session.commit()
Пример #19
0
def check_if_schema_changed(db, table, conf):
    """Detect schema changes of ``table`` and record each of them.

    The stored schema is compared with the live one, ignoring column order.
    For every removed, added or retyped column a notice is printed and
    ``insert_schema_changed_record`` is called.  When the schemas differ,
    the stored schema is replaced with the live one and ``metrics_session``
    is committed.
    """
    def types_by_name(schema):
        return {column['name']: column['type'] for column in schema}

    def canonical(schema):
        # Order-independent form so that a mere reordering is not a change.
        return sorted(schema, key=lambda column: sorted(column.items()))

    stored_schema = table.schema['columns']
    live_schema = db.get_table_schema(table.table_name, table.namespace)

    if canonical(stored_schema) == canonical(live_schema):
        return

    old_types = types_by_name(stored_schema)
    new_types = types_by_name(live_schema)
    column_count = len(new_types)

    for name, old_type in old_types.items():
        if name in new_types:
            continue
        print(f"{name} was removed from schema")
        insert_schema_changed_record(table, 'column removed', name, old_type,
                                     column_count, conf)

    for name, new_type in new_types.items():
        if name not in old_types:
            print(f"{name} was added to schema")
            insert_schema_changed_record(table, 'column added', name,
                                         new_type, column_count, conf)
            continue

        old_type = old_types[name]
        if new_type != old_type:
            print(
                f"Type of column: {name} changed from {old_type} to {new_type}"
            )
            insert_schema_changed_record(table, 'column changed', name,
                                         new_type, column_count, conf)

    table.schema = {'columns': live_schema}
    metrics_session.commit()
Пример #20
0
def create_dashboards():
    """Create the Grafana dashboards for every monitored table.

    Sets up the data source and notification channels, creates one
    dashboard per monitored table of every source database (storing its URL
    on the table), commits ``metrics_session``, then creates the home
    dashboard from all table dashboards and stars it.
    """
    credentials = (
        settings.GF_SECURITY_ADMIN_USER,
        settings.GF_SECURITY_ADMIN_PASSWORD,
    )
    grafana_host = f'{settings.GRAFANA_WEB_HOST}:{settings.GRAFANA_WEB_PORT}'
    grafana_api = GrafanaFace(auth=credentials, host=grafana_host)

    create_source_in_grafana(grafana_api)
    create_notifcation_channels(grafana_api)

    table_dashboards = []
    for source_db in DataSource.source_dbs():
        for table in MonitoredTable.get_monitored_tables(source_db.name):
            created = create_dashboard_for_table(grafana_api, source_db, table)
            table.grafana_url = created['dashboard']['url']
            table_dashboards.append(created)

    metrics_session.commit()

    home = create_home_dashboard(grafana_api, table_dashboards)
    star_home_dashboard(grafana_api, home)
Пример #21
0
    def add_metrics(cls, results, check, conf):
        """Store one ``MetricFromCheck`` per result row, column and metric.

        The value is read from the row under ``<column>_<metric>``, or under
        the bare metric name when the column is ``Metric.TABLE_METRIC``.  A
        falsy column is stored as ``None``.  ``metrics_session`` is committed
        once per result row.
        """
        print(f"Adding results for check: {check}")

        for row in results:
            for column, metric_names in check.metrics.items():
                for metric_name in metric_names:
                    if column != Metric.TABLE_METRIC:
                        select_name = column + '_' + metric_name
                    else:
                        select_name = metric_name

                    record = MetricFromCheck(
                        check_id=check.id,
                        table_id=check.table.id,
                        table_column=column if column else None,
                        params=check.query['params'],
                        metric=metric_name,
                        result={'value': row[select_name]},
                        created_at=conf.for_time,
                    )
                    metrics_session.add(record)

            metrics_session.commit()
Пример #22
0
def create_for_detected_table(db, table):
    """Create the default checks for a newly detected table.

    One ``Check`` is stored per entry of ``table_checks`` (named after the
    entry's metric and pointing at ``redata.checks.<func>``); they are
    committed together.  The column-level checks are then created through
    ``create_column_checks``.
    """
    for definition in table_checks:
        metric_name = definition['metric']
        check_query = {
            'type': 'standard',
            'path': f"redata.checks.{definition['func']}",
            'params': definition['params'],
        }

        metrics_session.add(
            Check(
                table_id=table.id,
                name=metric_name,
                metrics={Metric.TABLE_METRIC: [metric_name]},
                query=check_query,
            )
        )

    metrics_session.commit()

    create_column_checks(db, table)
Пример #23
0
def check_if_schema_changed(db, table):
    """Detect schema changes of ``table`` and record each of them.

    The stored schema (``table.schema['columns']``) is compared with the
    live schema from ``db.get_table_schema``, ignoring column order.  For
    every removed, added or retyped column a notice is printed and
    ``insert_schema_changed_record`` is called.  When the schemas differ,
    the stored schema is replaced with the live one and ``metrics_session``
    is committed.
    """
    def schema_to_dict(schema):
        return {el['name']: el['type'] for el in schema}

    def sorted_to_compare(schema):
        # Order-independent form so that a mere reordering is not a change.
        return sorted(schema, key=lambda x: sorted(x.items()))

    last_schema = table.schema['columns']

    current_schema = db.get_table_schema(table.table_name)
    print(table.table_name, current_schema)

    if sorted_to_compare(last_schema) == sorted_to_compare(current_schema):
        return

    last_dict = schema_to_dict(last_schema)
    current_dict = schema_to_dict(current_schema)

    for el in last_dict:
        if el not in current_dict:
            print(f"{el} was removed from schema")
            insert_schema_changed_record(table, 'column removed', el,
                                         last_dict[el], len(current_dict))

    for el in current_dict:
        if el not in last_dict:
            print(f"{el} was added to schema")
            insert_schema_changed_record(table, 'column added',
                                         el, current_dict[el],
                                         len(current_dict))
        else:
            prev_type = last_dict[el]
            curr_type = current_dict[el]

            if curr_type != prev_type:
                print(
                    f"Type of column: {el} changed from {prev_type} to {curr_type}"
                )
                # A type change is its own operation, not an addition.
                insert_schema_changed_record(table, 'column changed', el,
                                             current_dict[el],
                                             len(current_dict))

    # Keep the {'columns': [...]} shape this function reads on its next run;
    # storing the bare list would break the table.schema['columns'] lookup.
    table.schema = {'columns': current_schema}
    metrics_session.commit()
Пример #24
0
def process_run():
    """Execute the next not-started scan, if there is one.

    The scan is marked "pending" and committed, then for every day from
    ``scan.start_date`` up to and including ``scan.end_date`` the new-table
    detection, the checks and the alert computation run for every source
    database.  Afterwards Grafana is regenerated and the scan is marked
    "success".

    On any exception the failure is printed, the session is rolled back and
    the scan is marked "error".  If the failure happened before a scan was
    obtained there is nothing to mark, so the exception is re-raised.
    """
    # Bound before the try block so the error handler can tell whether a
    # scan was fetched at all.
    scan = None

    try:
        scan = Scan.get_not_started_run()
        if scan is None:
            return

        scan.status = "pending"
        metrics_session.commit()

        for_time = scan.start_date
        while for_time <= scan.end_date:

            conf = Conf(for_time)

            for source_db in DataSource.source_dbs():
                run_check_for_new_tables(source_db, conf)
                run_checks(source_db, conf)
                run_compute_alerts(source_db, conf)

            for_time += timedelta(days=1)

        generate_grafana()

        scan.status = "success"
        metrics_session.commit()

    except Exception as exc:
        print(f"Scan processing failed: {exc!r}")

        # NOTE(review): assumes metrics_session is a SQLAlchemy session; a
        # session that failed during flush/commit must be rolled back before
        # it can be used again - confirm.
        metrics_session.rollback()

        if scan is None:
            raise

        scan.status = "error"
        metrics_session.commit()
Пример #25
0
    def add_metrics(cls, results, check, conf):
        """Store one ``MetricFromCheck`` per result row, column and metric.

        The value is read from the row under the key produced by
        ``name_for(column, metric)``.  The check's query params default to an
        empty dict when absent.  ``metrics_session`` is committed once per
        result row.
        """
        print(f"Adding results for check: {check}")

        for row in results:
            for column, metric_names in check.metrics.items():
                for metric_name in metric_names:
                    value_key = name_for(column, metric_name)

                    record = MetricFromCheck(
                        check_id=check.id,
                        table_id=check.table.id,
                        table_column=column,
                        params=check.query.get("params", {}),
                        metric=metric_name,
                        result={"value": row[value_key]},
                        created_at=conf.for_time,
                    )
                    metrics_session.add(record)

            metrics_session.commit()
Пример #26
0
    def setup_for_source_table(cls, db, db_table_name):
        """Register ``db_table_name`` for monitoring and pick its time column.

        Columns with a date/timestamp type are candidates; the first one
        whose name contains 'creat' wins, otherwise the first candidate.

        Returns the new ``MonitoredTable`` after adding it to
        ``metrics_session`` and committing, or ``None`` when the table has
        no candidate column.
        """
        print(f"Running setup for {db_table_name}")

        time_types = (
            'timestamp without time zone',
            'timestamp with time zone',
            'date',
            'datetime',  # mysql
        )
        schema_cols = get_current_table_schema(db, db_table_name)

        # heuristics to find best column to sort by when computing stats about data
        time_columns = [
            col['name'] for col in schema_cols if col['type'] in time_types
        ]
        created_like = [name for name in time_columns if 'creat' in name]

        if not time_columns:
            print(f"Not found column to sort by for {db_table_name}, skipping it for now")
            return None

        if len(created_like) > 1:
            print(f"Found multiple columns to sort by {created_like}, choosing {created_like[0]}, please update in DB if needed")

        if created_like:
            col_name = created_like[0]
        else:
            col_name = time_columns[0]

        col_type = [col['type'] for col in schema_cols if col['name'] == col_name][0]
        print(f"Found column to sort by {col_name}")

        table = MonitoredTable(
            table_name=db_table_name,
            time_column=col_name,
            time_column_type=col_type,
            schema={'columns': schema_cols},
            source_db=db.name,
        )

        metrics_session.add(table)
        metrics_session.commit()
        return table
Пример #27
0
def create_for_detected_table(db, table):
    """Create the default checks for a newly detected table.

    Every entry of ``table_checks`` yields one ``Check`` named after the
    entry's metric and pointing at ``redata.checks.<func>``; all of them are
    committed together.  Column-level checks are then created through
    ``create_column_checks``.
    """
    for spec in table_checks:
        func = spec["func"]
        metric_name = spec["metric"]

        check_fields = {
            "table_id": table.id,
            "name": metric_name,
            "metrics": {Metric.TABLE_METRIC: [metric_name]},
            "query": {
                "type": "standard",
                "path": f"redata.checks.{func}",
                "params": spec["params"],
            },
        }
        metrics_session.add(Check(**check_fields))

    metrics_session.commit()

    create_column_checks(db, table)
Пример #28
0
def check_data_volume_diff(db, table):
    """Store the volume differences of ``table`` since the last stored diff.

    The starting point is the newest ``created_at`` in
    ``metrics_data_volume_diff`` for this table, or the start of the current
    day when there is none.  Every row returned by
    ``db.check_data_volume_diff`` (a falsy result is treated as no rows)
    becomes a ``MetricsDataVolumeDiff`` entry; all entries are committed
    together.
    """
    latest_query = text("""
        SELECT max(created_at) as created_at
        FROM metrics_data_volume_diff
        WHERE table_id = :table_id
        """)
    latest = metrics_db.execute(latest_query, {'table_id': table.id}).first()

    if latest:
        from_time = latest.created_at
    else:
        from_time = None

    if from_time is None:
        # if no previous diff was computed, compute from start of day,
        # mostly because we show that stat daily
        from_time = datetime.combine(date.today(), time())

    rows = db.check_data_volume_diff(table, from_time=from_time) or []

    for entry in rows:
        record = MetricsDataVolumeDiff(
            table_id=table.id,
            date=entry.date,
            count=entry.count,
        )
        metrics_session.add(record)

    metrics_session.commit()
Пример #29
0
    def update_schema_for_table(cls, table, schema_cols):
        """Replace the stored schema of the table named ``table``.

        The first row whose ``table_name`` equals ``table`` gets its schema
        set to ``{'columns': schema_cols}`` and ``metrics_session`` is
        committed.
        """
        lookup = metrics_session.query(cls).filter(cls.table_name == table)
        monitored = lookup.first()

        monitored.schema = {'columns': schema_cols}
        metrics_session.commit()