def detect_joins_task(detect_joins_job):
    """Run join detection for a job's virtual data source and record the outcome.

    Loads the models for ``detect_joins_job.virtual_data_source`` scoped to the
    job's company, maps them to FDQ models, runs :func:`detect_joins` over them,
    and stores the detected joins on the job.

    Side effects on ``detect_joins_job``: sets ``joins`` and sets ``status`` to
    ``'COMPLETED'`` on success or ``'FAILED'`` on any error. Errors are logged
    and re-raised so the celery failure handler can report them.
    """
    try:
        logger.info(
            f'job_id={detect_joins_job.job_id} Fetching models for vds {detect_joins_job.virtual_data_source} '
            f'under company {detect_joins_job.company_id}'
        )
        husky_models = ModelRetriever.load_models(
            {detect_joins_job.virtual_data_source}, Scope(company_id=detect_joins_job.company_id)
        )
        models = [FdqModelMapper.from_internal(husky_model) for husky_model in husky_models]
        logger.info(
            f'job_id={detect_joins_job.job_id} Running join detection for {detect_joins_job.virtual_data_source} '
            f'under company {detect_joins_job.company_id}'
        )
        detected_joins = detect_joins(models=models)
        detect_joins_job.joins = detected_joins
        detect_joins_job.status = 'COMPLETED'
        logger.info(
            # Fixed typo in the log message: "sucessfully" -> "successfully".
            f'Joins for {detect_joins_job.virtual_data_source} '
            f'under company {detect_joins_job.company_id} detected successfully job_id={detect_joins_job.job_id} '
        )
    except Exception:
        detect_joins_job.status = 'FAILED'
        # logger.exception (instead of logger.error) attaches the traceback,
        # so the failure cause is visible in the logs as well as in celery.
        logger.exception(
            f'Failed detecting joins for {detect_joins_job.virtual_data_source} '
            f'under company {detect_joins_job.company_id} job_id={detect_joins_job.job_id} '
        )
        raise  # Let the celery handler report the failure
def test_filters_data_sources(self, _retriever_mock):
    """Requesting a specific data source returns only that source's model."""
    scope = Scope(dict(company_id='company_2', project_id='project_2'))
    loaded = ModelRetriever.load_models({'some-other-source'}, scope)
    expected = next(m for m in mock_response if m.name == 'some_other_model')
    assert len(loaded) == 1
    assert loaded[0] == expected
def test_includes_generally_available_models(self, _retriever_mock):
    """Without a visibility filter only generally-available models come back."""
    scope = Scope(dict(company_id='company_2', project_id='project_2'))
    loaded = ModelRetriever.load_models(set(), scope)
    names = {m.name for m in loaded}
    assert all(m.visibility is ModelVisibility.available for m in loaded)
    assert 'an-invalid-model-that-shouldnt-be-displayed' not in names
    assert len(loaded) == 5
def test_specific_model_name(self, _retriever_mock):
    """Passing a model name narrows the result to exactly that model."""
    scope = Scope(dict(company_id='company_2', project_id='project_2'))
    result = ModelRetriever.load_models(set(), scope, 'specific_snap_model')
    assert len(result) == 1
    assert result[0].name == 'specific_snap_model'
def test_filters_on_scope(self, _retriever_mock):
    """Models are filtered down to the requesting company/project scope."""
    scope = Scope(dict(company_id='company_1', project_id='project_1'))
    loaded = ModelRetriever.load_models(set(), scope)
    assert len(loaded) == 2
    assert mock_response[0] in loaded
    assert mock_response[2] in loaded
    # NOTE(review): this only verifies the fixture contains a
    # 'company_wide_model'; it never checks membership in `loaded`.
    # Possibly intended as `... in loaded` — confirm with the author.
    assert next(m for m in mock_response if m.name == 'company_wide_model')
def test_includes_experimental_if_scope_asks_for_them(self, _retriever_mock):
    """Scope with experimental visibility surfaces experimental models too."""
    scope = Scope(
        dict(
            company_id='company_2',
            project_id='project_2',
            model_visibility=ModelVisibility.experimental,
        )
    )
    loaded = ModelRetriever.load_models(set(), scope)
    names = {m.name for m in loaded}
    assert len(loaded) == 6, 'All available and experimental models are visible'
    assert 'an-experimental-model' in names
def test_model_augment(self, _retriever_mock):
    """A loaded model exposes its taxon attributes via get_attribute_by_taxon."""
    scope = Scope(dict(company_id='company_2', project_id='project_2'))
    loaded = ModelRetriever.load_models(set(), scope, 'specific_snap_model')
    assert len(loaded) == 1
    model = loaded[0]
    assert model.name == 'specific_snap_model'
    for taxon in ('data_source', 'date_hour'):
        assert model.get_attribute_by_taxon(taxon).taxon == taxon
def build_query(
    ctx: HuskyQueryContext,
    subrequest: InternalDataRequest,
    query_info: QueryInfo,
    preloaded_taxons: TaxonMap,
    dimension_templates: Optional[List[SqlFormulaTemplate]] = None,
    filter_templates: Optional[TaxonToTemplate] = None,
) -> Dataframe:
    """
    Builds the full query (select + projection) for a single-data-source
    subrequest and returns the resulting dataframe with the taxons obtained in it.

    :param ctx: Husky query context (provides the SQL dialect used for compiling)
    :param subrequest: Internal data request; must reference exactly one data source
    :param query_info: Mutated in place — its ``definition`` is set to the
        effectively used models
    :param preloaded_taxons: Map of already-loaded taxons
    :param dimension_templates: Sql column templates to select
    :param filter_templates: Filter templates keyed by taxon slug, referenced
        from scope or preagg filters
    :raises MultipleDataSources: if the subrequest spans more than one data source
    """
    dimension_templates = dimension_templates or []
    filter_templates = filter_templates or dict()

    # Fetch Taxons
    simple_taxon_manager = SimpleTaxonManager.initialize(
        subrequest, dimension_templates, filter_templates, preloaded_taxons
    )

    data_sources = set(subrequest.properties.data_sources)
    if len(subrequest.properties.data_sources) != 1:
        # Joining across data sources is more complex and not implemented yet.
        raise MultipleDataSources(data_sources)
    data_source = subrequest.properties.data_sources[0]

    models = ModelRetriever.load_models(data_sources, subrequest.scope, subrequest.properties.model_name)

    # Build Graph
    graph = GraphBuilder.create_with_models(models)

    # Create Select Query
    select_query, taxon_model_info_map, effectively_used_models = SelectBuilder(
        ctx,
        subrequest.scope,
        simple_taxon_manager.graph_select_taxons,
        simple_taxon_manager.projection_taxons,
        graph,
        data_source,
        subrequest.preaggregation_filters,
        dimension_templates,
        filter_templates,
    ).get_query()
    # Record which models the select builder actually ended up using.
    query_info.definition = QueryDefinition({'effectively_used_models': effectively_used_models})
    logger.debug('Select Query: %s', compile_query(select_query, ctx.dialect))

    # Create Projection Query
    final_dataframe = ProjectionBuilder.query(
        select_query,
        taxon_model_info_map,
        simple_taxon_manager.projection_taxons,
        subrequest.properties.data_source,
        subrequest.order_by,
        subrequest.limit,
        subrequest.offset,
        dimension_templates,
    )
    logger.debug('Projection Query: %s', compile_query(final_dataframe.query, ctx.dialect))
    return final_dataframe
def test_visibility(self, _retriever_mock):
    """Loading a data source the scope may not see raises ModelNotFoundException."""
    from panoramic.cli.husky.service.utils.exceptions import ModelNotFoundException

    with self.assertRaises(ModelNotFoundException):
        lookup_scope = Scope(dict(company_id='company_2', project_id='project_2'))
        ModelRetriever.load_models({'another-special-data-source'}, lookup_scope)