def analyze(self, warehouse: Warehouse, table_name: str):
    dialect = warehouse.dialect
    qualified_table_name = dialect.qualify_table_name(table_name)
    analyze_results: List[ColumnAnalysisResult] = []
    sql = dialect.sql_columns_metadata_query(table_name)
    column_tuples = warehouse.sql_fetchall(sql)
    for column_tuple in column_tuples:
        column_name = column_tuple[0]
        source_type = column_tuple[1]
        column_analysis_result = ColumnAnalysisResult(column_name=column_name,
                                                      source_type=source_type)
        analyze_results.append(column_analysis_result)
        qualified_column_name = dialect.qualify_column_name(column_name)
        if dialect.is_text(source_type):
            column_analysis_result.is_text = True
            # Build one COUNT(CASE WHEN ...) field per known validity format
            validity_format_count_fields = []
            validity_counts = []
            for validity_format in Validity.FORMATS:
                format_regex = Validity.FORMATS[validity_format]
                validity_counts.append({'format': validity_format})
                qualified_regex = dialect.qualify_regex(format_regex)
                regexp_like = dialect.sql_expr_regexp_like(qualified_column_name, qualified_regex)
                count_field = f'COUNT(CASE WHEN {regexp_like} THEN 1 END)'
                validity_format_count_fields.append(count_field)
            row = warehouse.sql_fetchone(
                'SELECT \n  ' +
                ',\n  '.join(validity_format_count_fields) + ',\n' +
                f'  COUNT({qualified_column_name}) \n'
                f'FROM {qualified_table_name} \n'
                f'LIMIT 1000')
            # The last field in the row is the total non-null value count
            values_count = row[len(validity_counts)]
            column_analysis_result.values_count = values_count
            if values_count > 0:
                for i in range(len(validity_counts)):
                    validity_counts[i]['count'] = row[i]
                sorted_validity_counts = sorted(validity_counts,
                                                key=lambda c: c['count'],
                                                reverse=True)
                most_frequent_validity_format = sorted_validity_counts[0]
                valid_count = most_frequent_validity_format['count']
                column_analysis_result.valid_count = valid_count
                # Only suggest a format if it matches the majority of the values
                if valid_count > (values_count / 2):
                    column_analysis_result.validity_format = most_frequent_validity_format['format']
    return analyze_results
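# --- A minimal usage sketch for analyze() above. DatasetAnalyzer is the class
# this method belongs to; the 'customers' table name and the already-connected
# `warehouse` are illustrative assumptions, not part of the original code.
analyzer = DatasetAnalyzer()
for result in analyzer.analyze(warehouse, 'customers'):
    if result.validity_format:
        print(f'{result.column_name}: {result.validity_format} '
              f'({result.valid_count}/{result.values_count} non-null values match)')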
def execute_metric(self, warehouse: Warehouse, metric: dict, scan_dict: dict = None):
    dialect = warehouse.dialect
    if not scan_dict:
        scan_dict = {}
    if KEY_TABLE_NAME not in scan_dict:
        scan_dict[KEY_TABLE_NAME] = self.default_test_table_name
    scan_configuration_parser = ScanYmlParser(scan_dict, 'test-scan')
    scan_configuration_parser.assert_no_warnings_or_errors()
    scan = warehouse.create_scan(scan_yml=scan_configuration_parser.scan_yml)
    scan.close_warehouse = False
    scan.execute()

    fields: List[str] = []
    group_by_column_names: List[str] = metric.get('groupBy')
    if group_by_column_names:
        for group_by_column in group_by_column_names:
            fields.append(dialect.qualify_column_name(group_by_column))

    column_name: str = metric.get('columnName')
    qualified_column_name = dialect.qualify_column_name(column_name)
    metric_type = metric['type']
    if metric_type == Metric.ROW_COUNT:
        fields.append('COUNT(*)')
    elif metric_type == Metric.MIN:
        fields.append(f'MIN({qualified_column_name})')
    elif metric_type == Metric.MAX:
        fields.append(f'MAX({qualified_column_name})')
    elif metric_type == Metric.SUM:
        fields.append(f'SUM({qualified_column_name})')

    sql = 'SELECT \n  ' + ',\n  '.join(fields) + ' \n' \
          'FROM ' + scan.qualified_table_name

    where_clauses = []
    metric_filter = metric.get('filter')
    if metric_filter:
        where_clauses.append(dialect.sql_expression(metric_filter))
    scan_column: ScanColumn = scan.scan_columns.get(column_name)
    if scan_column and scan_column.non_missing_and_valid_condition:
        where_clauses.append(scan_column.non_missing_and_valid_condition)
    if where_clauses:
        sql += '\nWHERE ' + '\n  AND '.join(where_clauses)

    if group_by_column_names:
        sql += '\nGROUP BY ' + ', '.join(group_by_column_names)

    return warehouse.sql_fetchall(sql)
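# --- Hypothetical test invocation of execute_metric() above. The metric dict
# keys ('type', 'columnName', 'groupBy', 'filter') mirror the ones the method
# reads; the 'size' and 'country' columns are assumptions for illustration.
rows = self.execute_metric(warehouse, {
    'type': Metric.MAX,
    'columnName': 'size',
    'groupBy': ['country'],
})
for row in rows:
    print(row)  # one tuple per group, e.g. (country, MAX(size))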
def __init__(self, target: str) -> None:
    super().__init__()
    self.target: str = target
    self.dialect = self.create_dialect(self.target)
    self.warehouse_yml = WarehouseYml(dialect=self.dialect, name='test_warehouse')
    self.warehouse: Optional[Warehouse] = Warehouse(self.warehouse_yml)
    self.database: Optional[str] = None
    self.create_database()
def build(self):
    self._build_warehouse_yml()
    self._build_scan_yml()
    for parser in self.parsers:
        parser.assert_no_warnings_or_errors()
    if not self.scan_yml or not self.warehouse_yml:
        return
    from sodasql.scan.warehouse import Warehouse
    warehouse = Warehouse(self.warehouse_yml)
    self._create_soda_server_client()
    return Scan(warehouse=warehouse,
                scan_yml=self.scan_yml,
                variables=self.variables,
                soda_server_client=self.soda_server_client,
                time=self.time)
def setup_get_warehouse_fixture(self):
    """self.target may be initialized by a test suite"""
    if self.target is None:
        self.target = os.getenv('SODA_TEST_TARGET', TARGET_POSTGRES)
    warehouse_fixture = SqlTestCase.warehouse_fixtures_by_target.get(self.target)
    if warehouse_fixture is None:
        logging.debug(f'Creating warehouse {self.target}')
        warehouse_fixture = WarehouseFixture.create(self.target)
        dialect = self.create_dialect(self.target)
        warehouse_yml = WarehouseYml(dialect=dialect)
        warehouse_fixture.warehouse = Warehouse(warehouse_yml)
        warehouse_fixture.create_database()
        SqlTestCase.warehouse_fixtures_by_target[self.target] = warehouse_fixture
    return warehouse_fixture
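# --- Sketch of how a test obtains the shared fixture. Because fixtures are
# cached in SqlTestCase.warehouse_fixtures_by_target, the warehouse is created
# once per target and reused across tests; the call site here is illustrative.
fixture = self.setup_get_warehouse_fixture()
warehouse = fixture.warehouse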
def init(warehouse_file: str):
    """
    Finds tables in the warehouse and creates scan YAML files based on the data in each table.

    By default it creates the files in a subdirectory called "tables" on the same level as the warehouse file.

    WAREHOUSE_FILE contains the connection details to the warehouse. This file can be created using
    the `soda create` command. The warehouse file argument is optional and defaults to 'warehouse.yml'.
    """
    logging.info(SODA_SQL_VERSION)
    file_system = FileSystemSingleton.INSTANCE
    warehouse = None
    try:
        logging.info(f'Initializing {warehouse_file} ...')
        warehouse_yml_dict = read_warehouse_yml_file(warehouse_file)
        warehouse_yml_parser = WarehouseYmlParser(warehouse_yml_dict, warehouse_file)
        warehouse = Warehouse(warehouse_yml_parser.warehouse_yml)
        logging.info('Querying warehouse for tables')
        warehouse_dir = file_system.dirname(warehouse_file)
        scan_initializer = ScanInitializer(warehouse, warehouse_dir)
        scan_initializer.initialize_scan_ymls()
        logging.info(f"Next run 'soda scan {warehouse_file} "
                     f"{scan_initializer.first_table_scan_yml_file}' to calculate measurements and run tests")
    except Exception as e:
        logging.exception(f'Exception: {str(e)}')
        return 1
    finally:
        # Always close the warehouse connection, even if initialization failed
        if warehouse and warehouse.connection:
            try:
                warehouse.connection.close()
            except Exception as e:
                logging.debug(f'Closing connection failed: {str(e)}')
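# --- init() is exposed through the CLI; assuming the standard click wiring this
# project uses elsewhere, a typical invocation would be `soda init warehouse.yml`,
# after which the generated scan files appear in a 'tables/' directory next to
# the warehouse file. The exact command name is an assumption.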
def build(self, offline: bool = False):
    self._build_warehouse_yml()
    self._build_scan_yml()
    for parser in self.parsers:
        parser.assert_no_warnings_or_errors()
    if not self.scan_yml or not self.warehouse_yml:
        return
    from sodasql.scan.warehouse import Warehouse
    warehouse = Warehouse(self.warehouse_yml)
    if not offline:
        self._create_soda_server_client()
    return Scan(warehouse=warehouse,
                scan_yml=self.scan_yml,
                variables=self.variables,
                soda_server_client=self.soda_server_client,
                time=self.time,
                scan_results_file=self.scan_results_json_path,
                failed_rows_processor=self.failed_rows_processor)
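# --- Hypothetical programmatic use of the builder above: point it at a warehouse
# file and a scan file, then build and execute without contacting Soda Cloud.
# The file paths and the *_yml_file attribute names are assumptions for illustration.
scan_builder = ScanBuilder()
scan_builder.warehouse_yml_file = 'warehouse.yml'
scan_builder.scan_yml_file = 'tables/customers.yml'
scan = scan_builder.build(offline=True)
scan_result = scan.execute()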
            KEY_METRICS: ['distinct', 'uniqueness'],
            'tests': {
                'nomissing': 'missing_percentage < 3.0',
                'noinvalid': 'invalid_count == 0'
            }
        }
    }
}
scan_configuration_parser = ScanYmlParser(scan_configuration_dict, 'demodata-scan')
scan_configuration_parser.assert_no_warnings_or_errors()
dialect = SqlTestCase.create_dialect('postgres')
warehouse_yml = WarehouseYml(dialect=dialect)
warehouse = Warehouse(warehouse_yml)
row = warehouse.sql_fetchone('SELECT MIN(date), MAX(date) FROM demodata')
min_date = row[0]
max_date = row[1]
scan_results = []
date = min_date
while date != max_date:
    timeslice = datetime(year=date.year, month=date.month, day=date.day).isoformat()
    variables = {'date': date.strftime("%Y-%m-%d")}
    scan = Scan(warehouse=warehouse,
                scan_yml=scan_configuration_parser.scan_yml,
                variables=variables,
def analyze(warehouse_file: str, include: str, exclude: str):
    """
    Analyzes tables in the warehouse and creates scan YAML files based on the data in each table.

    By default it creates the files in a subdirectory called "tables" on the same level as the warehouse file.

    WAREHOUSE_FILE contains the connection details to the warehouse. This file can be created using
    the `soda create` command. The warehouse file argument is optional and defaults to 'warehouse.yml'.
    """
    logging.info(SODA_SQL_VERSION)
    file_system = FileSystemSingleton.INSTANCE
    warehouse = None
    try:
        logging.info(f'Analyzing {warehouse_file} ...')
        warehouse_yml_dict = read_warehouse_yml_file(warehouse_file)
        warehouse_yml_parser = WarehouseYmlParser(warehouse_yml_dict, warehouse_file)
        warehouse = Warehouse(warehouse_yml_parser.warehouse_yml)
        logging.info('Querying warehouse for tables')
        warehouse_dir = file_system.dirname(warehouse_file)

        def fileify(name: str):
            return re.sub(r'[^A-Za-z0-9_.]+', '_', name).lower()

        table_dir = file_system.join(warehouse_dir, 'tables')
        if not file_system.file_exists(table_dir):
            logging.info(f'Creating tables directory {table_dir}')
            file_system.mkdirs(table_dir)
        else:
            logging.info(f'Directory {table_dir} already exists')

        first_table_scan_yml_file = None
        dialect = warehouse.dialect
        tables_metadata_query = dialect.sql_tables_metadata_query()
        rows = warehouse.sql_fetchall(tables_metadata_query)
        table_include_regex = create_table_filter_regex(include)
        table_exclude_regex = create_table_filter_regex(exclude)
        for row in rows:
            table_name = row[0]
            if (matches_table_include(table_name, table_include_regex)
                    and matches_table_exclude(table_name, table_exclude_regex)):
                dataset_analyzer = DatasetAnalyzer()
                dataset_analyze_results = dataset_analyzer.analyze(warehouse, table_name)

                table_scan_yaml_file = file_system.join(table_dir, f'{fileify(table_name)}.yml')
                if not first_table_scan_yml_file:
                    first_table_scan_yml_file = table_scan_yaml_file

                if file_system.file_exists(table_scan_yaml_file):
                    logging.info(f"Scan file {table_scan_yaml_file} already exists")
                else:
                    logging.info(f"Creating {table_scan_yaml_file} ...")
                    from sodasql.scan.scan_yml_parser import (KEY_METRICS, KEY_TABLE_NAME, KEY_TESTS,
                                                              KEY_COLUMNS, COLUMN_KEY_VALID_FORMAT,
                                                              COLUMN_KEY_TESTS)
                    scan_yaml_dict = {
                        KEY_TABLE_NAME: table_name,
                        KEY_METRICS:
                            [Metric.ROW_COUNT] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_MISSING] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_VALIDITY] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_LENGTH] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_STATISTICS],
                        KEY_TESTS: ['row_count > 0']
                    }
                    columns = {}
                    for column_analysis_result in dataset_analyze_results:
                        if column_analysis_result.validity_format:
                            column_yml = {
                                COLUMN_KEY_VALID_FORMAT: column_analysis_result.validity_format
                            }
                            values_count = column_analysis_result.values_count
                            valid_count = column_analysis_result.valid_count
                            if valid_count > (values_count * .8):
                                # Suggest an invalid_percentage test with ~10% headroom
                                # over the observed invalid rate, rounded up
                                valid_percentage = valid_count * 100 / values_count
                                invalid_threshold = (100 - valid_percentage) * 1.1
                                invalid_threshold_rounded = ceil(invalid_threshold)
                                invalid_comparator = '==' if invalid_threshold_rounded == 0 else '<='
                                column_yml[COLUMN_KEY_TESTS] = [
                                    f'invalid_percentage {invalid_comparator} {str(invalid_threshold_rounded)}'
                                ]
                            columns[column_analysis_result.column_name] = column_yml
                    if columns:
                        scan_yaml_dict[KEY_COLUMNS] = columns
                    scan_yml_str = yaml.dump(scan_yaml_dict,
                                             sort_keys=False,
                                             Dumper=IndentingDumper,
                                             default_flow_style=False)
                    file_system.file_write_from_str(table_scan_yaml_file, scan_yml_str)
            else:
                logging.info(f"Skipping table {table_name}")
        logging.info(f"Next run 'soda scan {warehouse_file} {first_table_scan_yml_file}' "
                     f"to calculate measurements and run tests")
    except Exception as e:
        logging.exception(f'Exception: {str(e)}')
        sys.exit(1)
    finally:
        # Always close the warehouse connection, even if analysis failed
        if warehouse and warehouse.connection:
            try:
                warehouse.connection.close()
            except Exception as e:
                logging.debug(f'Closing connection failed: {str(e)}')
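# --- For illustration, a scan YAML emitted by the loop above could look roughly
# like this (table, column, and threshold values are hypothetical):
#
#   table_name: customers
#   metrics:
#     - row_count
#     - ...        # missing, validity, length and statistics metric groups
#   tests:
#     - row_count > 0
#   columns:
#     email:
#       valid_format: email
#       tests:
#         - invalid_percentage <= 3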