def validate_params(cls, values):
    """Make sure that all params for given aggregation type are set and valid"""
    agg_type = EnumHelper.from_value_safe(AggregationType, values.get('type'))
    if not agg_type:
        return values

    # simple aggregations carry no extra parameters
    if agg_type == AggregationType.not_set or agg_type in cls._SIMPLE_AGGS:
        values['params'] = None
        return values

    if 'params' not in values:
        raise ValueError('Missing "params" field')

    if agg_type in cls._WITH_SORT_DIMENSION_AGGS:
        values['params'] = AggregationParamsSortDimension(**values['params'])
    elif agg_type == AggregationType.count_distinct:
        values['params'] = AggregationParamsCountDistinct(**values['params'])
    else:
        raise ValueError(f'Unsupported aggregation type - {values["type"]}')

    return values
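
# `EnumHelper.from_value_safe` is used throughout this section to tolerate unknown
# enum values. A minimal sketch of the assumed semantics (the real helper lives
# elsewhere in the codebase): return the matching member, or None instead of raising.
from enum import Enum
from typing import Any, Optional, Type, TypeVar

E = TypeVar('E', bound=Enum)

def _from_value_safe_sketch(enum_cls: Type[E], value: Any) -> Optional[E]:
    try:
        return enum_cls(value)
    except ValueError:
        return None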
def scaffold_missing_fields(target_dataset: Optional[str] = None, yes: bool = False, no_remote: bool = True):
    """Scaffold missing field files."""
    echo_info('Loading local state...')
    state = get_local_state(target_dataset=target_dataset)

    errors = []
    for dataset, (fields, models) in state.get_objects_by_package().items():
        for idx, error in enumerate(validate_missing_files(fields, models, package_name=dataset)):
            if idx == 0:
                echo_info(f'\nFields referenced in models without definition in dataset {dataset}:')
            echo_info(f' {error.field_slug}')
            errors.append(error)

    if len(errors) == 0:
        echo_info('No issues found')
        return

    echo_info('')
    if not yes and not click.confirm(
        'You will not be able to query these fields until you define them. Do you want to do that now?'
    ):
        # User decided not to fix issues
        return

    loaded_models: Dict[str, PanoModel] = {}
    if not no_remote:
        connection = Connection.get()
        dialect_name = Connection.get_dialect_name(connection)
        query_runtime = EnumHelper.from_value_safe(HuskyQueryRuntime, dialect_name)
        scanner_cls = Scanner.get_scanner(query_runtime)
        scanner = scanner_cls()
        echo_info('Scanning remote storage...')
        scanner.scan()
        echo_info('Finished scanning remote storage...')
        loaded_models = scanner.models

    echo_info('Scanning fields...')
    fields = scan_fields_for_errors(errors, loaded_models)
    action_list = ActionList(actions=[Action(desired=field) for field in fields])

    echo_info('Updating local state...')
    executor = LocalExecutor()
    for action in action_list.actions:
        try:
            executor.execute(action)
        except Exception:
            echo_error(f'Error: Failed to execute action {action.description}')

    echo_info(f'Updated {executor.success_count}/{executor.total_count} fields')
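
# Direct invocation sketch; in the CLI this function is typically exposed as a
# click command. The dataset name is hypothetical:
scaffold_missing_fields(target_dataset='my_dataset', yes=True, no_remote=False)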
def to_husky(origin: PanoField) -> Taxon:
    """Maps external field definitions to internal taxon representation"""
    slug = origin.slug if origin.data_source is None else f'{origin.data_source}{NAMESPACE_DELIMITER}{origin.slug}'

    aggregation = None
    if origin.aggregation:
        aggregation = AggregationDefinition.parse_obj(origin.aggregation.to_dict())

    validation = EnumHelper.from_value(ValidationType, origin.data_type)
    assert validation

    return Taxon(
        slug=slug,
        taxon_group=origin.group,
        display_name=origin.display_name,
        taxon_type=origin.field_type,
        validation_type=validation,
        taxon_description=origin.description,
        data_source=origin.data_source,
        calculation=origin.calculation,
        aggregation=aggregation,
        display_state=DisplayState.visible,
        company_id=get_company_id(),
    )
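
# Illustrative slug derivation, assuming NAMESPACE_DELIMITER is '|' (the delimiter
# is an assumption; the field and data source names are hypothetical):
#   data_source='facebook_ads', slug='spend'  ->  taxon slug 'facebook_ads|spend'
#   data_source=None,           slug='spend'  ->  taxon slug 'spend'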
def _detect_filter_clause_type(
    data: Dict[str, Any], expected_type: FilterClauseType, filter_clause: Type['FilterClause']
) -> Optional[Type['FilterClause']]:
    """Return `filter_clause` when the payload's 'type' matches `expected_type`, otherwise None."""
    if 'type' not in data:
        return None

    if EnumHelper.from_value_safe(FilterClauseType, data['type']) == expected_type:
        return filter_clause

    return None
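
# Hypothetical dispatch over a raw filter payload; `TaxonValueFilterClause` and
# `FilterClauseType.taxon_value` are assumed names used for illustration only:
data = {'type': 'taxon_value', 'taxon': 'spend', 'operator': '=', 'value': 100}
clause_cls = _detect_filter_clause_type(data, FilterClauseType.taxon_value, TaxonValueFilterClause)
if clause_cls is not None:
    clause = clause_cls(**data)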
def compile_transformation_request(cls, req: TransformRequest, company_id: str) -> Tuple[str, HuskyQueryRuntime]:
    """
    Compiles Transform request to its SQL representation

    :param req: Input request
    :param company_id: Company ID

    :return: SQL and type of dialect
    """
    sorted_fields = sorted(req.requested_fields)

    # prepare origin description
    origin = DataRequestOrigin(
        {
            'system': 'FDQ',
            'extra': {
                'purpose': 'taxonomy.transform.compile',
            },
        }
    )

    # get all used taxons in the request
    used_taxons_map = fetch_all_used_taxons_map(company_id, sorted_fields)

    # figure out set of all virtual data sources covered by the taxons in the request
    used_vds = {taxon.data_source for taxon in used_taxons_map.values() if taxon.data_source}

    # generate subrequest for each virtual data source
    # this will allow Husky to push the taxons into relevant subrequests
    subrequests = []
    for vds in sorted(used_vds):
        subrequest = ApiDataRequest({'scope': {'company_id': company_id}, 'properties': {'data_sources': [vds]}})
        subrequests.append(subrequest)

    # finalize the blending husky request
    husky_request_dict = {'data_subrequests': subrequests, 'taxons': req.requested_fields, 'origin': origin}
    husky_request = BlendingDataRequest(husky_request_dict)

    connection = Connection.get()
    query_runtime_name = Connection.get_dialect_name(connection)
    query_runtime = EnumHelper.from_value_safe(HuskyQueryRuntime, query_runtime_name)
    context = HuskyQueryContext(query_runtime)

    husky_dataframe = QueryBuilder.validate_data_request(context, husky_request)

    # add another layer of query to use correct names
    final_query = cls._correct_column_aliases(context, husky_dataframe)

    return compile_query(final_query, context.dialect), context.query_runtime
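
# Hypothetical invocation; `TransformService` stands in for whatever class hosts
# this classmethod, and the TransformRequest constructor fields are assumptions:
req = TransformRequest(requested_fields=['facebook_ads|spend', 'facebook_ads|impressions'])
sql, runtime = TransformService.compile_transformation_request(req, company_id='company-1')
# `sql` is a dialect-specific SELECT with corrected column aliases;
# `runtime` identifies the dialect it was compiled for, e.g. HuskyQueryRuntime.snowflake.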
def from_request(cls, data_request: Union[BlendingDataRequest, InternalDataRequest]):
    """Create a query context for the runtime implied by the request's physical data sources."""
    if data_request.physical_data_sources:
        if len(data_request.physical_data_sources) == 1:
            request_pds = data_request.physical_data_sources[0]
            connection = Connections.get_by_name(request_pds, True)
            query_runtime_name = Connections.get_connection_engine(connection).dialect.name
            query_runtime = EnumHelper.from_value_safe(HuskyQueryRuntime, query_runtime_name)
            if query_runtime is None:
                raise UnsupportedSQLOutputException(query_runtime_name)

            return cls(query_runtime)
        elif len(data_request.physical_data_sources) > 1:
            raise TooManyPhysicalDataSourcesException(data_request.physical_data_sources)
    else:
        # no physical data source on the request; fall back to the default runtime
        return cls(HuskyQueryRuntime.snowflake)
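
# The three branches, summarized (the class name HuskyQueryContext is assumed,
# matching its use elsewhere in this section):
#   no physical data sources -> default HuskyQueryRuntime.snowflake context
#   exactly one              -> runtime derived from that connection's SQLAlchemy dialect
#   more than one            -> TooManyPhysicalDataSourcesException
ctx = HuskyQueryContext.from_request(data_request)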
def scan(filter_reg_ex: Optional[str] = None):
    """Scan all metadata for given source and filter."""
    connection_info = Connection.get()
    dialect_name = Connection.get_dialect_name(connection_info)
    query_runtime = EnumHelper.from_value_safe(HuskyQueryRuntime, dialect_name)
    if not query_runtime:
        raise UnsupportedDialectError(dialect_name)

    scanner_cls = Scanner.get_scanner(query_runtime)
    scanner = scanner_cls()

    echo_info('Started scanning the data source')
    scanner.scan(force_reset=True)
    echo_info('Finished scanning the data source')

    # apply regular expression as a filter on model names
    if filter_reg_ex:
        re_compiled = re.compile(filter_reg_ex)
        models = [model for model in scanner.models.values() if re_compiled.match(model.model_name)]
    else:
        models = list(scanner.models.values())

    if len(models) == 0:
        echo_info('No tables have been found')
        return

    # track progress against the (possibly filtered) model list, not the full scan
    progress_bar = tqdm(total=len(models))
    writer = FileWriter()
    for model in models:
        writer.write_scanned_model(model)
        progress_bar.write(f'Discovered model {model.model_name}')
        progress_bar.update()

    progress_bar.write(f'Scanned {progress_bar.total} tables')
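
# Usage sketch: restrict the scan to models whose names start with 'fb_' (the
# pattern is illustrative; `filter_reg_ex` is applied with re.match, i.e. it is
# anchored at the start of the model name):
scan(filter_reg_ex=r'fb_.*')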
def _parse_taxon_expr(
    ctx: HuskyQueryContext,
    taxon: Taxon,
    tel_prefix: str,
    data_sources: Iterable[str],
    all_taxons: TaxonMap,
    subrequest_only=False,
):
    """Render the taxon's TEL calculation into a SQL expression for the given query context."""
    taxon_type = EnumHelper.from_value(TaxonTypeEnum, taxon.taxon_type)
    try:
        return TaxonTelDialect().render(
            expr=cast(str, taxon.calculation),
            ctx=ctx,
            taxon_map=all_taxons,
            taxon_slug=tel_prefix,
            comparison=taxon.is_comparison_taxon,
            data_sources=data_sources,
            taxon_type=taxon_type,
            aggregation=taxon.aggregation,
            subrequest_only=subrequest_only,
        )
    except TelExpressionException as error:
        raise HuskyInvalidTelException(error, taxon.slug)
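
# Illustrative only; the taxon, context, and taxon map would come from the
# surrounding Husky request (all concrete values here are hypothetical):
sql_expr = _parse_taxon_expr(
    ctx=HuskyQueryContext(HuskyQueryRuntime.snowflake),
    taxon=cpm_taxon,            # e.g. a computed metric with calculation 'spend / impressions'
    tel_prefix=cpm_taxon.slug,
    data_sources=['facebook_ads'],
    all_taxons=used_taxons_map,
)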
def map_error_to_field(error: MissingFieldFileError, loaded_models: Dict[str, PanoModel]) -> PanoField:
    """Scaffold a field definition for a missing-field error, using scanned models when available."""
    # try to find the field in scanned state
    model = loaded_models.get(error.model_name)

    data_type = ValidationType.text
    if model:
        # model with this field was scanned so let's try to find this field
        matching_fields = [
            model_field for model_field in model.fields if error.field_slug in model_field.field_map
        ]
        if len(matching_fields) == 1:
            # exactly this field was scanned so let's determine its correct validation type
            field_data_type = EnumHelper.from_value_safe(ValidationType, matching_fields[0].data_type)
            if field_data_type:
                data_type = field_data_type

    field_type = TaxonTypeEnum.metric if data_type in METRIC_VALIDATION_TYPES else TaxonTypeEnum.dimension
    if field_type is TaxonTypeEnum.dimension:
        aggregation = Aggregation(type='group_by', params=None)
    else:
        aggregation = Aggregation(type='sum', params=None)

    return PanoField(
        slug=error.field_slug,
        field_type=field_type.value,
        display_name=error.field_slug,
        data_source=error.dataset_slug,
        group='CLI',
        data_type=data_type.value,
        aggregation=aggregation,
    )
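
# Outcome sketch, assuming numeric types belong to METRIC_VALIDATION_TYPES:
#   field never scanned             -> data_type text    -> dimension + group_by
#   scanned column typed as numeric -> data_type numeric -> metric + sum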
def deep_construct(cls: Type[BaseModel], _fields_set: Optional[Set[str]] = None, **values: Any):
    """
    Copied from pydantic BaseModel and modified to be able to construct models recursively
    from primitive data types and enum values.
    It can deserialize models inheriting from PydanticModel, including lists of models.

    WARNING:
    - Dictionaries and sets are copied without any changes, even if they contain Pydantic models
    - Invalid enum values are ignored and replaced with None (no exception is thrown)

    Creates a new model setting __dict__ and __fields_set__ from trusted or pre-validated data.
    Default values are respected, but no other validation is performed.
    """
    m = cls.__new__(cls)

    for field_name, field in m.__fields__.items():
        field_type = field.type_
        if field.shape == SHAPE_LIST:
            # Lists can have their actual types kinda hidden
            # Would need change if we have List[Union[TypeA,TypeB]].. but that is quite an edge case and
            # not sure pydantic even supports that
            list_field_type = field.sub_fields[0].type_
            if (
                inspect.isclass(list_field_type)
                and issubclass(list_field_type, PydanticModel)
                and values.get(field_name) is not None
            ):
                deserialized_list = []
                for model in values[field_name]:
                    if model is not None:
                        deserialized_list.append(list_field_type.deep_construct(**model))
                    else:
                        deserialized_list.append(None)

                values[field_name] = deserialized_list

            if issubclass(type(list_field_type), EnumMeta) and values.get(field_name) is not None:
                deserialized_list = []
                for enum in values[field_name]:
                    if enum is not None:
                        # Deserialize enum and replace invalid values with None, do not throw exception
                        deserialized_list.append(EnumHelper.from_value_safe(list_field_type, enum))
                    else:
                        deserialized_list.append(None)

                values[field_name] = deserialized_list
        elif (
            inspect.isclass(field_type)
            and issubclass(field_type, PydanticModel)
            and values.get(field_name) is not None
        ):
            values[field_name] = field_type.deep_construct(**values[field_name])
        elif issubclass(type(field_type), EnumMeta) and values.get(field_name) is not None:
            # Deserialize enum and replace invalid values with None, do not throw exception
            values[field_name] = EnumHelper.from_value_safe(field_type, values[field_name])

    object.__setattr__(m, '__dict__', {**deepcopy(cls.__field_defaults__), **values})
    if _fields_set is None:
        _fields_set = set(values.keys())

    object.__setattr__(m, '__fields_set__', _fields_set)
    return m
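
# Usage sketch, assuming `PydanticModel` is the pydantic v1 base class in this
# codebase that exposes `deep_construct` (model and enum names are hypothetical):
from enum import Enum
from typing import List, Optional

class Color(Enum):
    red = 'red'
    blue = 'blue'

class Child(PydanticModel):
    color: Optional[Color]

class Parent(PydanticModel):
    children: List[Child]

parent = Parent.deep_construct(children=[{'color': 'red'}, {'color': 'nope'}])
assert parent.children[0].color is Color.red
assert parent.children[1].color is None  # invalid enum value replaced with None, no exception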