def test_blending_2_2(self):
    """
    Blend two dataframes selecting overlapping columns and check the resulting SQL.

    The first dataframe exposes ``ad_id`` and ``impressions``; the second one additionally exposes
    ``campaign_id``. The compiled blended query is handed to ``write_test_expectations`` and then compared
    with what ``read_test_expectations`` returns for ``query.sql``. The blended dataframe must expose the
    union of both slug sets.
    """
    first_select = Select(
        columns=[
            column('ad_id'),
            column('impressions'),
            column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME),
        ],
        from_obj=table('table1'),
    )
    first_df = Dataframe(
        first_select,
        get_mocked_dataframe_columns_map(['ad_id', 'impressions']),
        set(),
        {'SF'},
    )

    second_select = Select(
        columns=[
            column('ad_id'),
            column('campaign_id'),
            column('impressions'),
            column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME),
        ],
        from_obj=table('table2'),
    )
    second_df = Dataframe(
        second_select,
        get_mocked_dataframe_columns_map(['ad_id', 'impressions', 'campaign_id']),
        set(),
        {'SF'},
    )

    blended_df = blend_dataframes(SNOWFLAKE_HUSKY_CONTEXT, [first_df, second_df])

    self.write_test_expectations('query.sql', compile_query(blended_df.query))
    expected_query = self.read_test_expectations('query.sql')

    self.assertEqual(expected_query, compile_query(blended_df.query))
    self.assertEqual(
        {'ad_id', 'impressions', 'campaign_id'},
        set(blended_df.slug_to_column.keys()),
    )
def test_namespaced_taxons_build_query(self):
    """
    Build a select query mixing plain taxons with taxons namespaced by ``MOCK_DATA_SOURCE_NAME``.

    The same slug list is used for both the selected and the projected taxons. The compiled SQL is compared
    with the ``query.sql`` expectation, and the effectively used models are compared in primitive form.
    """
    taxon_slugs = [
        'impressions',
        'ad_id',
        f'{MOCK_DATA_SOURCE_NAME}|dimension',
        f'{MOCK_DATA_SOURCE_NAME}|metric',
    ]
    builder = SelectBuilder(
        SNOWFLAKE_HUSKY_CONTEXT,
        self.scope,
        get_specific_select_mocked_taxons(taxon_slugs),
        get_specific_select_mocked_taxons(taxon_slugs),
        self.graph,
        'data-source',
    )
    result, _, effectively_used_models = builder.get_query()

    self.write_test_expectations('query.sql', compile_query(result))
    expected = self.read_test_expectations('query.sql')
    self.assertEqual(expected, compile_query(result))

    expected_models = {
        'noncached_models': [
            'mock_data_source.metric_model',
            'mock_data_source.entity_model',
        ],
    }
    self.assertDictEqual(expected_models, effectively_used_models.to_primitive())
def _run_basic_test(self, projection_taxons, selected_taxons=None):
    """
    Build select + projection queries for the given taxon slugs and compare the SQL with ``query.sql``.

    :param projection_taxons: Slugs passed as projection taxons to both builders
    :param selected_taxons: Slugs passed as selected taxons to the select builder; when ``None``,
        ``projection_taxons`` is used instead
    """
    slugs_to_select = projection_taxons if selected_taxons is None else selected_taxons

    select_builder = SelectBuilder(
        SNOWFLAKE_HUSKY_CONTEXT,
        self._scope,
        get_specific_select_mocked_taxons(slugs_to_select),
        get_specific_select_mocked_taxons(projection_taxons),
        self._graph,
        'data-source',
    )
    query, taxon_model_info_map, _ = select_builder.get_query()

    final_dataframe = ProjectionBuilder.query(
        query,
        taxon_model_info_map,
        get_specific_select_mocked_taxons(projection_taxons),
        'data-source',
        None,
        None,
        None,
    )

    self.write_test_expectations('query.sql', compile_query(final_dataframe.query))
    expected = self.read_test_expectations('query.sql')
    assert compile_query(final_dataframe.query) == expected
def item_to_primitive(self, item):
    """
    Convert ``item`` to a primitive structure, with special handling for TEL result types.

    - ``ExprResult`` becomes a dict of its fields, each converted recursively (``override_mappings`` sorted).
    - ``PostFormula`` becomes a dict with the compiled ``sql``; ``template`` is included only when it is set
      and compiles to something different from ``sql``; ``exclude_slugs`` only when non-empty (sorted).
    - ``TelExpressionException`` becomes a dict with its class and message.
    - Anything else is delegated to the parent implementation.

    :param item: Object to convert
    :return: Primitive representation of ``item``
    """
    if isinstance(item, ExprResult):
        fields = {
            'data_source_formula_templates': item.data_source_formula_templates,
            'dimension_formulas': item.dimension_formulas,
            'pre_formulas': item.pre_formulas,
            'post_formula': item.post_formula,
            'phase': item.phase,
            'override_mappings': sorted(item.override_mappings),
            'invalid_value': item.invalid_value,
        }
        return {name: self.item_to_primitive(value) for name, value in fields.items()}

    if isinstance(item, PostFormula):
        compiled_sql = compile_query(item._sql)
        formula_primitive = {'sql': compiled_sql}

        if item.template is not None:
            compiled_template = compile_query(item.template)
            # the template is reported only when it actually differs from the plain SQL
            if compiled_template != compiled_sql:
                formula_primitive['template'] = compiled_template

        if item.exclude_slugs:
            formula_primitive['exclude_slugs'] = sorted(
                [self.item_to_primitive(slug) for slug in item.exclude_slugs]
            )
        return formula_primitive

    if isinstance(item, TelExpressionException):
        return {'exception_class': str(item.__class__), 'api_response_message': str(item)}

    return super().item_to_primitive(item)
def test_query_with_pre_filter_without_filter_taxons(self):
    """
    Build a select query with a LIKE filter on a taxon that is selected but not projected.

    ``ad_name`` (the filter taxon) is part of the selected taxons only. The compiled SQL, stripped of
    surrounding whitespace, is compared with the ``query.sql`` expectation.
    """
    filter_clause = TaxonValueFilterClause(
        {
            'type': FilterClauseType.TAXON_VALUE.value,
            'taxon': 'ad_name',
            'operator': SimpleFilterOperator.LIKE.value,
            'value': '%abc%',
        }
    )

    selected = get_specific_select_mocked_taxons(['impressions', 'ad_id', filter_clause.taxon])
    projected = get_specific_select_mocked_taxons(['impressions', 'ad_id'])
    builder = SelectBuilder(
        SNOWFLAKE_HUSKY_CONTEXT,
        self.scope,
        selected,
        projected,
        self.graph,
        'data-source',
        filter_clause=filter_clause,
    )
    result, _, _ = builder.get_query()

    self.write_test_expectations('query.sql', compile_query(result))
    expected = self.read_test_expectations('query.sql')
    self.assertEqual(expected, compile_query(result).strip())
def test_query_with_pre_taxon_taxon_pre_filter(self):
    """
    Build a select query with a filter comparing two taxons (``spend`` = ``impressions``).

    The slugs reported by the filter clause are added to the selected taxons while only ``ad_id`` is
    projected. The compiled SQL, stripped of surrounding whitespace, is compared with ``query.sql``.
    """
    # NOTE(review): the clause is a TaxonTaxonFilterClause but declares the TAXON_VALUE type - confirm
    # whether this is intentional.
    filter_clause = TaxonTaxonFilterClause(
        {
            'type': FilterClauseType.TAXON_VALUE.value,
            'taxon': 'spend',
            'right_taxon': 'impressions',
            'operator': SimpleFilterOperator.EQ.value,
        }
    )

    selected = get_specific_select_mocked_taxons(['ad_id', *filter_clause.get_taxon_slugs()])
    projected = get_specific_select_mocked_taxons(['ad_id'])
    builder = SelectBuilder(
        SNOWFLAKE_HUSKY_CONTEXT,
        self.scope,
        selected,
        projected,
        self.graph,
        'data-source',
        filter_clause=filter_clause,
    )
    result, _, _ = builder.get_query()

    self.write_test_expectations('query.sql', compile_query(result))
    expected = self.read_test_expectations('query.sql')
    self.assertEqual(expected, compile_query(result).strip())
def test_basic_build_join_query(self):
    """
    Build a select query for metric, dimension and ``week_of_year`` taxons and check the compiled SQL.

    The same taxons are used as both selected and projected taxons; the result is compared with the
    ``query.sql`` expectation.
    """
    slugs = ['spend', 'impressions', 'ad_id', 'ad_name', 'week_of_year']
    taxons = get_specific_select_mocked_taxons(slugs)

    builder = SelectBuilder(
        SNOWFLAKE_HUSKY_CONTEXT,
        self.scope,
        taxons,
        taxons,
        self.graph,
        'data-source',
    )
    result, _, _ = builder.get_query()

    self.write_test_expectations('query.sql', compile_query(result))
    expected = self.read_test_expectations('query.sql')
    self.assertEqual(expected, compile_query(result))
def test_render_visitor(inp, expectation):
    """
    Render ``inp`` with a ``ModelTelDialect`` built for ``_TMP_MODEL`` and compare the compiled SQL.

    :param inp: Expression passed to the dialect's ``render``
    :param expectation: Expected compiled SQL string
    """
    dialect = ModelTelDialect(
        unique_object_name=_TMP_MODEL.unique_object_name(SNOWFLAKE_HUSKY_CONTEXT),
        virtual_data_source=_TMP_MODEL.data_sources[0],
        model=_TMP_MODEL,
    )
    result = dialect.render(inp, SNOWFLAKE_HUSKY_CONTEXT, {})

    rendered_sql = result.sql(SNOWFLAKE_HUSKY_CONTEXT.dialect)
    assert compile_query(rendered_sql) == expectation
def _build_taxon_model_info_map(self, taxons: Dict[TaxonSlugExpression, Taxon],
                                taxon_to_model: Dict[TaxonSlugExpression, HuskyModel]):
    """
    Collect per-taxon information (SQL accessor, model name, quantity type) and store it on the builder.

    For every taxon, the SQL accessor is compiled from the column accessor on its model, and the quantity
    type is read from the model attribute mapped to the taxon. Filter templates whose slug is not covered
    by a taxon get an accessor rendered from the accessors of the taxons they use.

    The result is stored in ``self.taxon_model_info_map``.

    :param taxons: Taxons keyed by their slug expression
    :param taxon_to_model: Model to use for each taxon slug expression
    """
    info_by_slug = {}

    for slug_expression in taxons:
        owning_model = taxon_to_model[slug_expression]
        column_accessor = self._get_column_accessor_for_taxon_and_model(owning_model, slug_expression)
        info_by_slug[slug_expression.slug] = TaxonModelInfo(
            compile_query(column_accessor, self.ctx.dialect),
            owning_model.name,
            owning_model.get_attribute_by_taxon(slug_expression.slug).quantity_type,
        )

    for filter_slug, template in self.filter_templates.items():
        if filter_slug in info_by_slug:
            # Already present in the map: it is a raw slug, so no sql accessor needs to be created
            continue

        render_params = {
            used_slug: info_by_slug[used_slug].taxon_sql_accessor for used_slug in template.used_taxons
        }
        rendered_accessor = template.render_formula(**render_params)
        info_by_slug[filter_slug] = TaxonModelInfo(rendered_accessor, None, None)

    self.taxon_model_info_map = info_by_slug
def assert_result_formulas(actual_result: ExprResult, expected_result: ExprResult):
    """
    Assert that two expression results match field by field.

    Data source formula templates, phase, override mappings and invalid value are compared directly.
    Dimension formulas and pre formulas are compared through ``repr``; the SQL is compared after compiling
    with ``_DIALECT``.

    :param actual_result: Result produced by the code under test
    :param expected_result: Result to compare against
    """

    def _reprs(formulas):
        # formulas are compared via their repr rather than via ==
        return [repr(formula) for formula in formulas]

    def _compiled_sql(expr_result):
        return compile_query(expr_result.sql(_DIALECT), _DIALECT)

    assert (
        actual_result.data_source_formula_templates == expected_result.data_source_formula_templates
    ), 'data_source_formulas dont match'
    assert _reprs(actual_result.dimension_formulas) == _reprs(
        expected_result.dimension_formulas
    ), 'dimension_formulas dont match'
    assert _reprs(actual_result.pre_formulas) == _reprs(expected_result.pre_formulas), 'pre_formulas dont match'
    assert _compiled_sql(actual_result) == _compiled_sql(expected_result), 'sql dont match'
    assert actual_result.phase == expected_result.phase, 'phase dont match'
    assert actual_result.override_mappings == expected_result.override_mappings, 'override mappings dont match'
    assert actual_result.invalid_value == expected_result.invalid_value, "invalid_value doesn't match"
def result(self, context: TelRootContext) -> TelQueryResult:
    """
    Produce the query result of the wrapped value, moved to this node's phase.

    - When the wrapped value is already in the target phase, its result is returned unchanged.
    - When the wrapped value is in the ``dimension_data_source`` or ``any`` phase and uses some taxons,
      its SQL is turned into a data source formula template under a cached label, and the returned SQL
      becomes a reference to that label.
    - When it uses no taxons, the SQL and template of the wrapped result are passed through.

    :param context: TEL root context
    :return: Query result for this node
    :raises RuntimeError: When the wrapped value is in a phase that cannot be moved to the target phase
    """
    value_result = self._value.result(context)
    new_templates = []
    value_phase = self._value.phase(context)

    if value_phase == self._phase:
        return value_result

    if value_phase not in (TelPhase.dimension_data_source, TelPhase.any):
        raise RuntimeError(f'Cannot move to {self._phase} phase from {value_phase}')

    if self.used_taxons(context).has_some():
        # the label is created once and reused by later calls
        if not self._cached_label:
            self._cached_label = context.new_label

        assert 1 == len(self._value.return_data_sources(context))
        data_source = cast(
            str,
            next(ds for ds in self._value.return_data_sources(context) if ds is not None),
        )

        sql = literal_column(safe_quote_identifier(self._cached_label, context.husky_dialect))
        formula_template = SqlFormulaTemplate(
            SqlTemplate(compile_query(value_result.sql, context.husky_dialect)),
            cast(str, self._cached_label),
            data_source,
            cast(Set[str], self._value.template_slugs(context)),
        )
        new_templates.append(formula_template)
        template = sql
    else:
        sql = value_result.sql
        template = value_result.template

    label = self._cached_label or value_result.label

    if self._value.invalid_value(context):
        # invalid values carry only the SQL, the newly created templates and the label
        return TelQueryResult(
            sql,
            dialect=context.husky_dialect,
            data_source_formula_templates=new_templates,
            label=label,
        )

    return TelQueryResult(
        sql=sql,
        dialect=context.husky_dialect,
        aggregations=value_result.aggregations,
        dimension_formulas=value_result.dimension_formulas,
        data_source_formula_templates=new_templates + value_result.data_source_formula_templates,
        label=label,
        exclude_slugs=value_result.exclude_slugs,
        template=template,
    )
def test_basic_build_query_with_pre_filter(self):
    """
    Build select + projection queries with a LIKE pre-filter on ``ad_name`` and check the compiled SQL.

    ``ad_name`` is added to the selected taxons only; ``impressions`` and ``spend`` are projected. The
    compiled projection query is compared with the ``query.sql`` expectation.
    """
    projected_slugs = ['impressions', 'spend']
    # NOTE(review): the operator is passed as the enum member, not its ``.value`` - confirm this is intended.
    pre_filter = TaxonValueFilterClause(
        {
            'type': FilterClauseType.TAXON_VALUE.value,
            'taxon': 'ad_name',
            'operator': SimpleFilterOperator.LIKE,
            'value': 'zombies!',
        }
    )
    selected_slugs = projected_slugs + ['ad_name']

    select_builder = SelectBuilder(
        SNOWFLAKE_HUSKY_CONTEXT,
        self._scope,
        get_specific_select_mocked_taxons(selected_slugs),
        get_specific_select_mocked_taxons(projected_slugs),
        self._graph,
        'data-source',
        pre_filter,
    )
    query, taxon_model_info_map, _ = select_builder.get_query()

    final_dataframe = ProjectionBuilder.query(
        query,
        taxon_model_info_map,
        get_specific_select_mocked_taxons(projected_slugs),
        'data-source',
        None,
        None,
        None,
        {'context'},
    )

    self.write_test_expectations('query.sql', compile_query(final_dataframe.query))
    expected = self.read_test_expectations('query.sql')
    self.assertEqual(expected, compile_query(final_dataframe.query))
def test_basic_build_join_query(self):
    """
    Build a select query including the ``gender`` taxon and check both the SQL and the used models.

    The same taxons are used as selected and projected taxons. The compiled SQL is compared with the
    ``query.sql`` expectation, and the effectively used models are compared in primitive form.
    """
    slugs = ['spend', 'gender', 'impressions', 'ad_id', 'ad_name']
    taxons = get_specific_select_mocked_taxons(slugs)

    builder = SelectBuilder(
        SNOWFLAKE_HUSKY_CONTEXT,
        self.scope,
        taxons,
        taxons,
        self.graph,
        'data-source',
    )
    result, _, effectively_used_models = builder.get_query()

    self.write_test_expectations('query.sql', compile_query(result))
    expected = self.read_test_expectations('query.sql')
    self.assertEqual(expected, compile_query(result))

    expected_models = {
        'noncached_models': [
            'mock_data_source.metric_gender_model',
            'mock_data_source.entity_model',
        ],
    }
    self.assertDictEqual(expected_models, effectively_used_models.to_primitive())
def compile_transformation_request(cls, req: TransformRequest, company_id: str) -> Tuple[str, HuskyQueryRuntime]:
    """
    Compile a Transform request into SQL.

    A blending request is assembled with one subrequest per virtual data source referenced by the used
    taxons, validated through ``QueryBuilder.validate_data_request``, and wrapped in one more query layer
    that corrects the column aliases.

    :param req: Input request
    :param company_id: Company ID
    :return: Compiled SQL and the query runtime of the context it was compiled for
    """
    sorted_fields = sorted(req.requested_fields)

    # describe where the request comes from
    origin = DataRequestOrigin(
        {
            'system': 'FDQ',
            'extra': {
                'purpose': 'taxonomy.transform.compile',
            },
        }
    )

    # all taxons used by the request, and the virtual data sources they cover
    used_taxons_map = fetch_all_used_taxons_map(company_id, sorted_fields)
    used_vds = {taxon.data_source for taxon in used_taxons_map.values() if taxon.data_source}

    # one subrequest per virtual data source (in sorted order),
    # this will allow Husky to push the taxons into relevant subrequests
    subrequests = [
        ApiDataRequest({'scope': {'company_id': company_id}, 'properties': {'data_sources': [vds]}})
        for vds in sorted(used_vds)
    ]

    # finalize the blending husky request
    husky_request = BlendingDataRequest(
        {'data_subrequests': subrequests, 'taxons': req.requested_fields, 'origin': origin}
    )

    # the query runtime is derived from the dialect name of the current connection
    connection = Connection.get()
    runtime_name = Connection.get_dialect_name(connection)
    context = HuskyQueryContext(EnumHelper.from_value_safe(HuskyQueryRuntime, runtime_name))

    husky_dataframe = QueryBuilder.validate_data_request(context, husky_request)

    # add another layer of query to use correct names
    final_query = cls._correct_column_aliases(context, husky_dataframe)

    return compile_query(final_query, context.dialect), context.query_runtime
def test_basic_build_query_with_order_by(self):
    """
    Build select + projection queries with ordering and check the compiled SQL.

    The projection orders by ``impressions`` descending, then ``ad_id`` ascending; ``1`` and ``2`` are
    passed in the two positions after the ordering. The compiled projection query is compared with the
    ``query.sql`` expectation.
    """
    slugs = ['impressions', 'ad_id']

    select_builder = SelectBuilder(
        SNOWFLAKE_HUSKY_CONTEXT,
        self._scope,
        get_specific_select_mocked_taxons(slugs),
        get_specific_select_mocked_taxons(slugs),
        self._graph,
        'data-source',
    )
    query, taxon_model_info_map, _ = select_builder.get_query()

    order_by = [
        TaxonDataOrder({'taxon': 'impressions', 'type': TaxonOrderType.desc.value}),
        TaxonDataOrder({'taxon': 'ad_id', 'type': TaxonOrderType.asc.value}),
    ]

    final_dataframe = ProjectionBuilder.query(
        query,
        taxon_model_info_map,
        get_specific_select_mocked_taxons(slugs),
        'data-source',
        order_by,
        1,
        2,
        {'context'},
    )

    self.write_test_expectations('query.sql', compile_query(final_dataframe.query))
    expected = self.read_test_expectations('query.sql')
    self.assertEqual(expected, compile_query(final_dataframe.query))
def test_scope_filters(self, mock__get_taxons, mock__load_models):
    """
    Build a query whose scope carries a grouped preaggregation filter on ``account_id``.

    Model loading is mocked to return the mock entity and metric models. The compiled SQL is compared
    with the ``query.sql`` expectation, and only the entity model is expected to be used.

    :param mock__get_taxons: Injected mock (not used directly in the test body)
    :param mock__load_models: Injected mock whose return value provides the available models
    """
    mock__load_models.return_value = [
        get_mock_entity_model(),
        get_mock_metric_model(),
    ]

    scope_filter = {
        "type": "group",
        "logical_operator": "AND",
        "clauses": [
            {
                "type": "taxon_value",
                "taxon": "account_id",
                "operator": "=",
                "value": "595126134331606",
            },
        ],
        "negate": False,
    }
    request = InternalDataRequest(
        {
            'scope': {
                'project_id': 'project',
                'company_id': 'company',
                "preaggregation_filters": scope_filter,
            },
            'properties': {'data_sources': ['mock_data_source']},
            'taxons': ['account_id', 'ad_name'],
        }
    )

    dataframe = QueryBuilder.build_query(
        SNOWFLAKE_HUSKY_CONTEXT,
        request,
        QueryInfo.create(request),
        preloaded_taxons=TAXON_MAP,
    )

    actual = compile_query(dataframe.query)
    self.write_test_expectations('query.sql', actual)
    expected = self.read_test_expectations('query.sql')

    assert expected == actual
    self.assertEqual({'mock_data_source.entity_model'}, dataframe.used_model_names)
def taxon_sql_accessor(
    self,
    ctx: HuskyQueryContext,
    taxon_slug: str,
    cast_array: bool = False,
    model_tel_dialect: Optional[ModelTelDialect] = None,
) -> str:
    """
    Return the full SQL accessor to the given taxon on this model.

    The accessor is obtained by rendering the TEL transformation of the attribute mapped to the taxon and
    compiling the rendered SQL with the context's dialect.

    :param ctx: Husky query context
    :param taxon_slug: Original taxon slug
    :param cast_array: When True, accessors of array attributes are wrapped in a cast to VARCHAR
        (default is False)
    :param model_tel_dialect: Initialized model TEL dialect, if there is one (we use it to check for cyclic
        reference). When omitted, a generic one is created for this model.
    :return: SQL accessor string
    """
    attribute = self.get_attribute_by_taxon(taxon_slug)

    # The SQL is rendered by the TEL grammar from the attribute's transformation;
    # the column_name attribute is deliberately not relied upon here.
    if model_tel_dialect is not None:
        tel_dialect = model_tel_dialect
    else:
        # no initialized tel visitor is provided so create a generic one
        tel_dialect = ModelTelDialect(
            unique_object_name=self.unique_object_name(ctx),
            virtual_data_source=self.data_sources[0],
            model=self,
        )

    rendered = tel_dialect.render(attribute.tel_transformation, ctx, {})
    sql_accessor = compile_query(rendered.sql(ctx.dialect), ctx.dialect)

    # arrays are cast to varchar only on request
    if cast_array and attribute.quantity_type is ValueQuantityType.array:
        return f'CAST({sql_accessor} AS VARCHAR)'

    return sql_accessor
def render_formula(
    self, dialect: default.DefaultDialect, dimension_slugs: Optional[Set[TaxonExpressionStr]] = None
) -> ClauseElement:
    """
    Render the final SQL formula.

    The template is compiled with ``dialect`` and DIMENSION_SLUGS_TEMPLATE_PARAM is substituted with the
    sorted, comma separated list of the provided dimension slugs, leaving out any slug present in
    ``_exclude_slugs``.

    When there is no template, or no dimension slug remains after the exclusion, the unchanged ``_sql``
    formula is returned instead.

    :param dialect: Dialect used to compile the template
    :param dimension_slugs: Dimension slugs to substitute into the template (optional)
    :return: Rendered formula
    """
    usable_slugs = (dimension_slugs or set()).difference(self._exclude_slugs)

    if self._template is None or not usable_slugs:
        return self._sql

    compiled_template = SqlTemplate(compile_query(self._template, dialect))
    substitutions = {self.DIMENSION_SLUGS_TEMPLATE_PARAM: ', '.join(sorted(usable_slugs))}
    return literal_column(compiled_template.substitute(substitutions))
def test_simple_concat(self, mock__get_taxons, mock__load_models):
    """
    Build a query with a dimension template concatenating ``ad_name`` with a constant.

    Model loading is mocked to return the mock entity and metric models, and the scope carries a
    preaggregation filter on ``account_id``. The compiled SQL is compared with the ``query.sql``
    expectation, and only the entity model is expected to be used.

    :param mock__get_taxons: Injected mock (not used directly in the test body)
    :param mock__load_models: Injected mock whose return value provides the available models
    """
    mock__load_models.return_value = [
        get_mock_entity_model(),
        get_mock_metric_model(),
    ]

    scope_filter = {
        "type": "taxon_value",
        "taxon": "account_id",
        "operator": "=",
        "value": "abc",
    }
    request = InternalDataRequest(
        {
            'scope': {
                'project_id': 'project-id',
                'company_id': 'company-id',
                "preaggregation_filters": scope_filter,
            },
            'properties': {'data_sources': ['mock_data_source']},
            'taxons': ['account_id', 'ad_name'],
        }
    )

    concat_template = SqlFormulaTemplate(
        SqlTemplate('''concat(${ad_name},'xx')'''),
        '''__1''',
        MOCK_DATA_SOURCE_NAME,
        {'ad_name'},
    )
    dimension_templates = [concat_template]

    df = QueryBuilder.build_query(
        SNOWFLAKE_HUSKY_CONTEXT,
        request,
        QueryInfo.create(request),
        preloaded_taxons=TAXON_MAP,
        dimension_templates=dimension_templates,
    )

    actual = compile_query(df.query)
    self.write_test_expectations('query.sql', actual)
    expected = self.read_test_expectations('query.sql')

    assert expected == actual
    self.assertEqual({'mock_data_source.entity_model'}, df.used_model_names)
def test_scope_filters(self):
    """
    Check that ``ScopeGuard.add_scope_row_filters`` applies the scope's preaggregation filter.

    The scope filters on ``account_id = '10'``; ``account_id`` is mapped to the ``acc_id_column``
    accessor on the mock metric model.
    """
    filter_clause = TaxonValueFilterClause(
        {
            'type': FilterClauseType.TAXON_VALUE.value,
            'taxon': 'account_id',
            'operator': SimpleFilterOperator.EQ.value,
            'value': '10',
        }
    )
    scope_filter = filter_clause.to_native()
    scope = Scope({'company_id': '10', 'project_id': '10', 'preaggregation_filters': scope_filter})

    model = get_mock_metric_model()
    query = select([literal_column('test')])
    model_info = TaxonModelInfo('acc_id_column', model.name, ValueQuantityType.scalar)

    new_query = ScopeGuard.add_scope_row_filters(
        SNOWFLAKE_HUSKY_CONTEXT,
        scope,
        query,
        {'account_id': model_info},
    )

    # The scope filter is expected to show up as a WHERE clause on the mapped column accessor
    assert compile_query(new_query) == "SELECT test \nWHERE acc_id_column = '10'"
def build_query(
    ctx: HuskyQueryContext,
    subrequest: InternalDataRequest,
    query_info: QueryInfo,
    preloaded_taxons: TaxonMap,
    dimension_templates: Optional[List[SqlFormulaTemplate]] = None,
    filter_templates: Optional[TaxonToTemplate] = None,
) -> Dataframe:
    """
    Build the select and projection queries for a single-data-source subrequest.

    As a side effect, ``query_info.definition`` is set to a definition carrying the effectively used models.

    :param ctx: Husky query context
    :param subrequest: Request to build the query for; it must reference exactly one data source
    :param query_info: Query info object updated with the query definition
    :param preloaded_taxons: Taxons handed to the taxon manager
    :param dimension_templates: Sql column templates to select
    :param filter_templates: Filter templates keyed by taxon slug, referenced from scope or preagg filters.
    :return: Dataframe produced by the projection builder
    :raises MultipleDataSources: When the subrequest does not reference exactly one data source
    """
    if not dimension_templates:
        dimension_templates = []
    if not filter_templates:
        filter_templates = {}

    # Fetch Taxons
    simple_taxon_manager = SimpleTaxonManager.initialize(
        subrequest, dimension_templates, filter_templates, preloaded_taxons
    )

    requested_sources = subrequest.properties.data_sources
    data_sources = set(requested_sources)
    if len(requested_sources) != 1:
        # Joining across data sources is more complex and not implemented yet.
        raise MultipleDataSources(data_sources)
    data_source = requested_sources[0]

    models = ModelRetriever.load_models(data_sources, subrequest.scope, subrequest.properties.model_name)

    # Build Graph
    graph = GraphBuilder.create_with_models(models)

    # Create Select Query
    select_builder = SelectBuilder(
        ctx,
        subrequest.scope,
        simple_taxon_manager.graph_select_taxons,
        simple_taxon_manager.projection_taxons,
        graph,
        data_source,
        subrequest.preaggregation_filters,
        dimension_templates,
        filter_templates,
    )
    select_query, taxon_model_info_map, effectively_used_models = select_builder.get_query()

    query_info.definition = QueryDefinition({'effectively_used_models': effectively_used_models})
    logger.debug('Select Query: %s', compile_query(select_query, ctx.dialect))

    # Create Projection Query
    # NOTE(review): this passes ``properties.data_source`` while the select builder gets
    # ``properties.data_sources[0]`` - presumably equivalent, confirm.
    final_dataframe = ProjectionBuilder.query(
        select_query,
        taxon_model_info_map,
        simple_taxon_manager.projection_taxons,
        subrequest.properties.data_source,
        subrequest.order_by,
        subrequest.limit,
        subrequest.offset,
        dimension_templates,
    )
    logger.debug('Projection Query: %s', compile_query(final_dataframe.query, ctx.dialect))

    return final_dataframe