def alert_on_z_score(df, check, alert_type, checked_txt, conf):
    """Store an Alert when the newest metric result is a z-score outlier.

    Rows with a null ``result`` are ignored; at least two valid points are
    required for a meaningful z-score.
    """
    present = df[df["result"].notnull()]
    if len(present) <= 1:
        return
    newest_z = stats.zscore(present["result"])[-1]
    newest_val = present["result"].iloc[-1]
    if math.isnan(newest_z):
        return
    if abs(newest_z) <= settings.ACCEPTABLE_Z_SCORE_DIFF:
        # Within the accepted band — nothing to report.
        return
    direction = "above" if newest_z > 0 else "below"
    alert = Alert(
        text=f""" {checked_txt}, {direction} expected range, value: {newest_val}, z_score: {newest_z:.2f} """,
        severity=2,
        table_id=check.table_id,
        alert_type=alert_type,
        created_at=conf.for_time,
    )
    metrics_session.add(alert)
    metrics_session.commit()
def add_run():
    """Enqueue a new scheduled Run, initialized to 'not started'."""
    scheduled = Run(
        for_date=datetime.utcnow(),
        status='not started',
        run_type='scheduled',
    )
    metrics_session.add(scheduled)
    metrics_session.commit()
def create_column_checks(db, table):
    """Create a single 'column_values' Check covering all monitorable columns.

    Columns listed in settings.SKIP_COLUMNS or whose type is neither numeric
    nor character are skipped; the metric set per column depends on its type
    family. Fixes: copy via ``list()`` instead of an identity comprehension,
    and drop the no-op ``f`` prefix on a literal without placeholders.
    """
    metrics = {}
    for col in table.schema["columns"]:
        if col["name"] in settings.SKIP_COLUMNS:
            continue
        if col["type"] not in db.numeric_types() + db.character_types():
            continue
        # The continue above guarantees one of the two branches matches.
        if col["type"] in db.numeric_types():
            checks_for_col = list(Metric.FOR_NUMERICAL_COL)
        elif col["type"] in db.character_types():
            checks_for_col = list(Metric.FOR_TEXT_COL)
        else:
            checks_for_col = []
        metrics[col["name"]] = checks_for_col
    check = Check(
        table_id=table.id,
        name="column_values",
        metrics=metrics,
        query={
            "type": "standard",
            "path": "redata.checks.data_values.check_column_values",
            "params": {"time_interval": "1 day"},
        },
    )
    metrics_session.add(check)
    metrics_session.commit()
def alert_for_schema_change(db, check, conf):
    """Create an Alert for every schema change recorded over the last day."""
    df = get_last_results(db, check, Metric.TABLE_METRIC, Metric.SCHEMA_CHANGE, conf, days=1)
    for _, row in df.iterrows():
        change = json.loads(row[0])
        # "table detected" is the bootstrap event, not a real schema change.
        if change["operation"] == "table detected":
            continue
        metrics_session.add(
            Alert(
                text=f""" schema change detected - {change['operation']}: {change['column_name']} """,
                severity=2,
                table_id=check.table_id,
                alert_type=check.name,
                created_at=conf.for_time,
            )
        )
    metrics_session.commit()
def create_column_checks(db, table):
    """Create one 'column_values' Check listing metrics per monitorable column.

    Skips blacklisted columns and types that are neither numeric nor
    character. Fixes: copy via ``list()`` instead of an identity
    comprehension, and drop the no-op ``f`` prefix on a placeholder-free
    literal.
    """
    metrics = {}
    for col in table.schema['columns']:
        if col['name'] in settings.SKIP_COLUMNS:
            continue
        if col['type'] not in db.numeric_types() + db.character_types():
            continue
        # The continue above guarantees one of the two branches matches.
        if col['type'] in db.numeric_types():
            checks_for_col = list(Metric.FOR_NUMERICAL_COL)
        elif col['type'] in db.character_types():
            checks_for_col = list(Metric.FOR_TEXT_COL)
        else:
            checks_for_col = []
        metrics[col['name']] = checks_for_col
    check = Check(
        table_id=table.id,
        name='column_values',
        metrics=metrics,
        query={
            'type': 'standard',
            'path': 'redata.checks.data_values.check_column_values',
            'params': {'time_interval': '1 day'}
        })
    metrics_session.add(check)
    metrics_session.commit()
def alert_on_z_score(df, table, check_col, alert_type, checked_txt):
    """Add an Alert for *table* when the newest value of *check_col* is a z-score outlier."""
    valid = df[df[check_col].notnull()]
    if len(valid) <= 1:
        # Need at least two data points for a meaningful z-score.
        return
    newest_z = stats.zscore(valid[check_col])[-1]
    newest_val = valid[check_col].iloc[-1]
    if math.isnan(newest_z):
        return
    if abs(newest_z) <= settings.ACCEPTABLE_Z_SCORE_DIFF:
        return
    direction = 'above' if newest_z > 0 else 'below'
    print(f"Adding alert about table {table.table_name}")
    alert = Alert(
        text=f""" {checked_txt}, {direction} expected range, value: {newest_val}, z_score: {newest_z:.2f} """,
        severity=2,
        table_id=table.id,
        alert_type=alert_type,
    )
    metrics_session.add(alert)
    metrics_session.commit()
def insert_schema_changed_record(table, operation, column_name, column_type, column_count):
    """Persist one schema-change event (add/remove/retype) for *table*."""
    record = MetricsSchemaChanges(
        table_id=table.id,
        operation=operation,
        column_name=column_name,
        column_type=column_type,
        column_count=column_count,
    )
    metrics_session.add(record)
    metrics_session.commit()
def check_data_volume(db, table, time_interval):
    """Record the row count of *table* observed over *time_interval*."""
    counted = db.check_data_volume(table, time_interval)
    volume_row = MetricsDataVolume(
        table_id=table.id,
        time_interval=time_interval,
        count=counted.count,
    )
    metrics_session.add(volume_row)
    metrics_session.commit()
def setup_for_source_table(cls, db, db_table_name):
    """Create and persist a MonitoredTable for *db_table_name*.

    Picks the "time column" (used to sort/window data when computing stats)
    heuristically: date/timestamp columns not matching the configured
    blacklist regex are ranked by their max timestamp (newest first), and
    among those a name containing 'creat' is preferred.

    Returns the persisted MonitoredTable, or None when no suitable
    time column exists.
    """
    print (f"Running setup for {db_table_name}")
    valid_types = db.datetime_types()
    schema_cols = get_current_table_schema(db, db_table_name)
    table = MonitoredTable(
        table_name=db_table_name,
        schema={'columns': schema_cols},
        source_db=db.name
    )
    # heuristics to find best column to sort by when computing stats about data
    # TODO: could probably look up in a provided table of regex + score, with higher scored matches being preferred
    # list all date/timestamp columns, filtering out anything that's blacklisted in configuration
    blacklist_regex = settings.REDATA_TIME_COL_BLACKLIST_REGEX
    matching_cols = [
        col['name'] for col in schema_cols
        if col['type'] in valid_types and re.search(blacklist_regex, col['name']) is None
    ]
    # from matches, collect time cols that have max values at or before "now"
    cols_by_ts = defaultdict(list)
    now_ts = datetime.datetime.now()
    for col in matching_cols:
        # NOTE(review): assumes get_max_timestamp returns a comparable datetime,
        # never None (an empty column would raise here) — TODO confirm.
        max_ts = db.get_max_timestamp(table, col)
        if max_ts <= now_ts:
            cols_by_ts[max_ts].append(col)
    # list of all viable candidates, ordered by latest timestamp first
    candidates = list(itertools.chain(
        *[cols for ts, cols in sorted(cols_by_ts.items(), reverse=True)]
    ))
    # list of preferred columns out of the viable ones, by name filtering
    preferred = [col for col in candidates if col.lower().find('creat') != -1]
    if len(candidates) == 0:
        # no columns found? ignore table..
        # TODO: add it, but set to disabled, for screening via web UI when we have one
        print (f"Not found column to sort by for {db_table_name}, skipping it for now")
        return None
    else:
        # if multiple columns found, primarily select from 'preferred' if exists, then set up the table
        col_name = preferred[0] if preferred else candidates[0]
        col_type = [col['type'] for col in schema_cols if col['name'] == col_name][0]
        if len(candidates) > 1:
            print (f"Found multiple columns to sort by {candidates}, choosing {col_name}, please update in DB if needed")
        else:
            print (f"Found column to sort by {col_name}")
        table.time_column=col_name
        table.time_column_type=col_type
        metrics_session.add(table)
        metrics_session.commit()
        return table
def add_run():
    """Enqueue a new scheduled Scan, initialized to 'not started'."""
    # start/end are sampled separately, matching the original call order.
    fresh_scan = Scan(
        start_date=datetime.utcnow(),
        end_date=datetime.utcnow(),
        status="not started",
        run_type="scheduled",
    )
    metrics_session.add(fresh_scan)
    metrics_session.commit()
def check_data_delayed(db, table, conf):
    """Store the measured data delay for *table*, when one is available."""
    delay = db.check_data_delayed(table, conf)
    if not delay[0]:
        return
    metrics_session.add(
        MetricsDataDelay(
            table_id=table.id,
            value=delay[0].total_seconds(),
            created_at=conf.for_time,
        )
    )
    metrics_session.commit()
def check_data_delayed(db, table):
    """Store the measured data delay for *table*, when one is available."""
    delay = db.check_data_delayed(table)
    if not delay[0]:
        return
    metrics_session.add(
        MetricsDataDelay(
            table_id=table.id,
            value=delay[0].total_seconds(),
        )
    )
    metrics_session.commit()
def check_generic(func_name, db, table, checked_column, time_interval):
    """Run the generic check *func_name* on a column and persist its value."""
    outcome = db.check_generic(func_name, table, checked_column, time_interval)
    record = MetricsDataValues(
        table_id=table.id,
        column_name=checked_column,
        check_name=f'check_{func_name}',
        check_value=outcome.value,
        time_interval=time_interval,
    )
    metrics_session.add(record)
    metrics_session.commit()
def check_count_nulls(db, table, checked_column, time_interval):
    """Persist the null-count of *checked_column* over *time_interval*."""
    outcome = db.check_count_nulls(table, checked_column, time_interval)
    record = MetricsDataValues(
        table_id=table.id,
        column_name=checked_column,
        check_name='check_count_nulls',
        check_value=outcome.value,
        time_interval=time_interval,
    )
    metrics_session.add(record)
    metrics_session.commit()
def check_if_schema_changed(db, table, check, conf):
    """Diff the stored schema of *table* against the live database schema.

    Returns a list of records (built by schema_changed_record) describing
    every added, removed, or retyped column, and persists the freshly
    observed schema on the MonitoredTable row so the next run diffs
    against it. Fix: removed the unused local ``table_name``.
    """

    def schema_to_dict(schema):
        # Map column name -> column type for O(1) lookups.
        return dict([(el["name"], el["type"]) for el in schema])

    def sorted_to_compare(schema):
        # Canonical ordering so column order does not trigger false diffs.
        return sorted(schema, key=lambda x: sorted(x.items()))

    last_schema = table.schema["columns"]
    results = []
    current_schema = db.get_table_schema(table.table_name, table.namespace)

    if sorted_to_compare(last_schema) != sorted_to_compare(current_schema):
        last_dict = schema_to_dict(last_schema)
        current_dict = schema_to_dict(current_schema)

        for el in last_dict:
            if el not in current_dict:
                print(f"{el} was removed from schema")
                results.append(
                    schema_changed_record(
                        "column removed", el, last_dict[el], len(current_dict), conf
                    )
                )

        for el in current_dict:
            if el not in last_dict:
                print(f"{el} was added to schema")
                results.append(
                    schema_changed_record(
                        "column added", el, current_dict[el], len(current_dict), conf
                    )
                )
            else:
                prev_type = last_dict[el]
                curr_type = current_dict[el]
                if curr_type != prev_type:
                    print(
                        f"Type of column: {el} changed from {prev_type} to {curr_type}"
                    )
                    results.append(
                        schema_changed_record(
                            "column changed",
                            el,
                            current_dict[el],
                            len(current_dict),
                            conf,
                        )
                    )

    table.schema = {"columns": current_schema}
    metrics_session.commit()
    return results
def check_count_per_value(db, table, checked_column, time_interval):
    """Persist per-value occurrence counts for *checked_column* over *time_interval*."""
    rows = db.check_count_per_value(table, checked_column, time_interval)
    for entry in (rows or []):
        metrics_session.add(
            MetricsDataValues(
                table_id=table.id,
                column_name=checked_column,
                column_value=entry.value,
                check_name='check_count_per_value',
                check_value=entry.count,
                time_interval=time_interval,
            )
        )
    metrics_session.commit()
def create_admin_user_if_not_exist(cls):
    """Ensure the admin user exists, creating it from environment variables.

    Reads REDATA_ADMIN_USER / REDATA_ADMIN_PASSWORD and creates the user
    only when no row with that login exists yet.

    Raises:
        RuntimeError: when either environment variable is unset. (Was an
            ``assert``, which is silently stripped under ``python -O`` —
            explicit raises keep the validation in optimized runs.)
    """
    login = os.environ.get('REDATA_ADMIN_USER')
    password = os.environ.get('REDATA_ADMIN_PASSWORD')
    if not login:
        raise RuntimeError('please set env variable for admin user')
    if not password:
        raise RuntimeError('please set env variable for admin password')
    already_exists = metrics_session.query(cls).filter(cls.login == login).count()
    if not already_exists:
        user = cls(login=login, password=generate_password_hash(password))
        metrics_session.add(user)
        metrics_session.commit()
        print("Created admin user")
def process_run():
    """Execute the next not-started Run end-to-end and mark its outcome."""
    run = Run.get_not_started_run()
    if run is None:
        return
    run.status = 'pending'
    metrics_session.commit()
    conf = Conf(run.for_date)
    # New-table detection, checks, and alerting per configured source DB.
    for source_db in DataSource.source_dbs():
        run_check_for_new_tables(source_db, conf)
        run_checks(source_db, conf)
        run_compute_alerts(source_db, conf)
    generate_grafana()
    run.status = 'success'
    metrics_session.commit()
def check_if_schema_changed(db, table, conf):
    """Diff stored vs. live schema for *table* and record each change.

    Inserts one schema-change record per added / removed / retyped column
    and persists the freshly observed schema so the next run diffs against
    it. Fix: removed the unused local ``table_name``.
    """

    def schema_to_dict(schema):
        # Map column name -> column type for O(1) lookups.
        return dict([(el['name'], el['type']) for el in schema])

    def sorted_to_compare(schema):
        # Canonical ordering so column order does not trigger false diffs.
        return sorted(schema, key=lambda x: sorted(x.items()))

    last_schema = table.schema['columns']
    current_schema = db.get_table_schema(table.table_name, table.namespace)

    if sorted_to_compare(last_schema) != sorted_to_compare(current_schema):
        last_dict = schema_to_dict(last_schema)
        current_dict = schema_to_dict(current_schema)

        for el in last_dict:
            if el not in current_dict:
                print(f"{el} was removed from schema")
                insert_schema_changed_record(
                    table, 'column removed', el, last_dict[el], len(current_dict), conf)

        for el in current_dict:
            if el not in last_dict:
                print(f"{el} was added to schema")
                insert_schema_changed_record(
                    table, 'column added', el, current_dict[el], len(current_dict), conf)
            else:
                prev_type = last_dict[el]
                curr_type = current_dict[el]
                if curr_type != prev_type:
                    print(
                        f"Type of column: {el} changed from {prev_type} to {curr_type}"
                    )
                    insert_schema_changed_record(
                        table, 'column changed', el, current_dict[el], len(current_dict), conf)

    table.schema = {'columns': current_schema}
    metrics_session.commit()
def create_dashboards():
    """Build/refresh Grafana dashboards for all monitored tables plus the home view."""
    grafana_api = GrafanaFace(
        auth=(settings.GF_SECURITY_ADMIN_USER, settings.GF_SECURITY_ADMIN_PASSWORD),
        host=f'{settings.GRAFANA_WEB_HOST}:{settings.GRAFANA_WEB_PORT}',
    )
    create_source_in_grafana(grafana_api)
    # NOTE(review): helper name is misspelled ("notifcation") where it is
    # defined — kept as-is so the call still resolves.
    create_notifcation_channels(grafana_api)
    dashboards = []
    for db in DataSource.source_dbs():
        for table in MonitoredTable.get_monitored_tables(db.name):
            dash_data = create_dashboard_for_table(grafana_api, db, table)
            table.grafana_url = dash_data['dashboard']['url']
            dashboards.append(dash_data)
    metrics_session.commit()
    home_response = create_home_dashboard(grafana_api, dashboards)
    star_home_dashboard(grafana_api, home_response)
def add_metrics(cls, results, check, conf):
    """Persist a MetricFromCheck row per (result row, column, metric).

    Fix: the innermost loop variable ``m`` was shadowed by the ORM
    instance built from it — renamed so the metric name and the record
    are distinct bindings.
    """
    print(f"Adding results for check: {check}")
    for row in results:
        for col, metrics in check.metrics.items():
            for metric_name in metrics:
                # Table-level metrics are selected by bare name;
                # column metrics by "<column>_<metric>".
                select_name = col + '_' + metric_name if col != Metric.TABLE_METRIC else metric_name
                record = MetricFromCheck(
                    check_id=check.id,
                    table_id=check.table.id,
                    table_column=col if col else None,
                    params=check.query['params'],
                    metric=metric_name,
                    result={'value': row[select_name]},
                    created_at=conf.for_time,
                )
                metrics_session.add(record)
    metrics_session.commit()
def create_for_detected_table(db, table):
    """Register the standard table-level checks plus column checks for a new table."""
    for spec in table_checks:
        model_check = Check(
            table_id=table.id,
            name=spec['metric'],
            metrics={Metric.TABLE_METRIC: [spec['metric']]},
            query={
                'type': 'standard',
                'path': f"redata.checks.{spec['func']}",
                'params': spec['params'],
            },
        )
        metrics_session.add(model_check)
    metrics_session.commit()
    create_column_checks(db, table)
def check_if_schema_changed(db, table):
    """Diff stored vs. live schema for *table* and record each change.

    Fixes:
    - The retyped-column branch recorded the operation as 'column added';
      it now records 'column changed' (matching the printed message).
    - The fresh schema is stored as ``{'columns': current_schema}`` — the
      read path above (``table.schema['columns']``) expects that wrapping,
      so storing the bare list broke the next run's diff.
    - Removed the unused local ``table_name``.
    """

    def schema_to_dict(schema):
        # Map column name -> column type for O(1) lookups.
        return dict([(el['name'], el['type']) for el in schema])

    last_schema = table.schema['columns']
    current_schema = db.get_table_schema(table.table_name)
    print(table.table_name, current_schema)

    if last_schema != current_schema:
        last_dict = schema_to_dict(last_schema)
        current_dict = schema_to_dict(current_schema)

        for el in last_dict:
            if el not in current_dict:
                print(f"{el} was removed from schema")
                insert_schema_changed_record(
                    table, 'column removed', el, last_dict[el], len(current_dict))

        for el in current_dict:
            if el not in last_dict:
                print(f"{el} was added to schema")
                insert_schema_changed_record(
                    table, 'column added', el, current_dict[el], len(current_dict))
            else:
                prev_type = last_dict[el]
                curr_type = current_dict[el]
                if curr_type != prev_type:
                    print(
                        f"Type of column: {el} changed from {prev_type} to {curr_type}"
                    )
                    insert_schema_changed_record(
                        table, 'column changed', el, current_dict[el], len(current_dict))

    table.schema = {'columns': current_schema}
    metrics_session.commit()
def process_run():
    """Execute the next not-started Scan day-by-day and mark its outcome.

    Fix: the ``except`` handler dereferenced ``scan`` even when
    ``get_not_started_run()`` itself raised (NameError) or returned None
    (AttributeError). ``scan`` is now pre-bound and guarded.
    """
    scan = None
    try:
        scan = Scan.get_not_started_run()
        if scan is not None:
            scan.status = "pending"
            metrics_session.commit()
            # Re-run checks for every day in the scan's [start, end] window.
            for_time = scan.start_date
            while for_time <= scan.end_date:
                conf = Conf(for_time)
                for source_db in DataSource.source_dbs():
                    run_check_for_new_tables(source_db, conf)
                    run_checks(source_db, conf)
                    run_compute_alerts(source_db, conf)
                for_time += timedelta(days=1)
            generate_grafana()
            scan.status = "success"
            metrics_session.commit()
    except Exception:
        # NOTE(review): the original deliberately swallows the error after
        # marking the scan failed; kept that contract.
        if scan is not None:
            scan.status = "error"
            metrics_session.commit()
def add_metrics(cls, results, check, conf):
    """Persist a MetricFromCheck row per (result row, column, metric).

    Fix: the innermost loop variable ``m`` was shadowed by the ORM
    instance built from it — renamed so the metric name and the record
    are distinct bindings.
    """
    print(f"Adding results for check: {check}")
    for row in results:
        for col, metrics in check.metrics.items():
            for metric_name in metrics:
                select_name = name_for(col, metric_name)
                record = MetricFromCheck(
                    check_id=check.id,
                    table_id=check.table.id,
                    table_column=col,
                    params=check.query.get("params", {}),
                    metric=metric_name,
                    result={"value": row[select_name]},
                    created_at=conf.for_time,
                )
                metrics_session.add(record)
    metrics_session.commit()
def setup_for_source_table(cls, db, db_table_name):
    """Create a MonitoredTable for *db_table_name*, picking a time column heuristically.

    Prefers a date/timestamp column whose name contains 'creat'; otherwise
    takes the first date/timestamp column found. Returns the persisted
    MonitoredTable, or None when no usable column exists.

    Fix: removed the dead (and misspelled) ``colname, col_type = None, None``
    assignment — both names are always rebound before use or the function
    returns early.
    """
    print(f"Running setup for {db_table_name}")
    # Time-column types we can sort by when computing stats about data.
    preference = [
        'timestamp without time zone',
        'timestamp with time zone',
        'date',
        'datetime'  # mysql
    ]
    schema_cols = get_current_table_schema(db, db_table_name)
    # heuristics to find best column to sort by when computing stats about data
    proper_type = [col['name'] for col in schema_cols if col['type'] in preference]
    columns = [c for c in proper_type if c.find('creat') != -1]
    if len(proper_type) == 0:
        print(f"Not found column to sort by for {db_table_name}, skipping it for now")
        return None
    if len(columns) > 1:
        print(f"Found multiple columns to sort by {columns}, choosing {columns[0]}, please update in DB if needed")
    col_name = columns[0] if columns else proper_type[0]
    col_type = [col['type'] for col in schema_cols if col['name'] == col_name][0]
    print(f"Found column to sort by {col_name}")
    table = MonitoredTable(
        table_name=db_table_name,
        time_column=col_name,
        time_column_type=col_type,
        schema={'columns': schema_cols},
        source_db=db.name
    )
    metrics_session.add(table)
    metrics_session.commit()
    return table
def create_for_detected_table(db, table):
    """Set up the standard table-level checks and per-column checks for a new table."""
    for spec in table_checks:
        metrics_session.add(
            Check(
                table_id=table.id,
                name=spec["metric"],
                metrics={Metric.TABLE_METRIC: [spec["metric"]]},
                query={
                    "type": "standard",
                    "path": f"redata.checks.{spec['func']}",
                    "params": spec["params"],
                },
            )
        )
    metrics_session.commit()
    create_column_checks(db, table)
def check_data_volume_diff(db, table):
    """Compute and store data-volume deltas for *table* since the last recorded diff.

    Looks up when the most recent diff row was written; if none exists,
    starts from midnight today (the stat is shown daily). Each returned
    (date, count) pair is persisted as a MetricsDataVolumeDiff row.
    """
    # When was the last diff recorded for this table?
    from_time = metrics_db.execute(
        text("""
            SELECT max(created_at) as created_at
            FROM metrics_data_volume_diff
            WHERE table_id = :table_id
        """), {
            'table_id': table.id
        }).first()
    from_time = from_time.created_at if from_time else None
    if from_time is None:
        # if no previous diff computed, compute from start of day
        # mostly because we show that stat daily
        from_time = datetime.combine(date.today(), time())
    result = db.check_data_volume_diff(table, from_time=from_time)
    for r in (result or []):
        metric = MetricsDataVolumeDiff(table_id=table.id, date=r.date, count=r.count)
        metrics_session.add(metric)
    metrics_session.commit()
def update_schema_for_table(cls, table, schema_cols):
    """Replace the stored schema of the MonitoredTable named *table*.

    *table* is the table NAME (string); the matching row is looked up and
    its schema overwritten. Fix: the row no longer shadows the ``table``
    parameter, which made the name mean two different things mid-function.

    NOTE(review): assumes a row with that name exists — ``.first()``
    returning None would raise AttributeError here; confirm callers.
    """
    monitored = metrics_session.query(cls).filter(
        cls.table_name == table).first()
    monitored.schema = {'columns': schema_cols}
    metrics_session.commit()