def test_dag_two_views_same_materialized_address(self) -> None:
     view_1 = BigQueryView(
         dataset_id="dataset_1",
         view_id="table_1",
         description="table_1 description",
         should_materialize=True,
         materialized_address_override=BigQueryAddress(
             dataset_id="other_dataset", table_id="other_table"
         ),
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
     )
     view_2 = BigQueryView(
         dataset_id="dataset_2",
         view_id="table_2",
         description="table_2 description",
         should_materialize=True,
         materialized_address_override=BigQueryAddress(
             dataset_id="other_dataset", table_id="other_table"
         ),
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
     )
     with self.assertRaises(ValueError) as e:
         _ = BigQueryViewDagWalker([view_1, view_2])
     self.assertTrue(
         str(e.exception).startswith(
             "Found materialized view address for view [('dataset_2', 'table_2')] "
             "that matches materialized_address of another view: "
             "[('dataset_1', 'table_1')]."
         )
     )
 def test_parse_view_multiple_parents(self) -> None:
     view = BigQueryView(
         dataset_id="my_dataset",
         view_id="my_view_id",
         description="my view description",
         view_query_template="""SELECT * FROM `{project_id}.some_dataset.some_table`
         LEFT OUTER JOIN `{project_id}.some_dataset.other_table`
         USING (some_col);
         """,
     )
     node = BigQueryViewDagNode(view)
     node.set_materialized_addresss({})
     self.assertEqual(
         node.parent_keys,
         {
             DagKey(
                 view_address=BigQueryAddress(
                     dataset_id="some_dataset", table_id="some_table"
                 )
             ),
             DagKey(
                 view_address=BigQueryAddress(
                     dataset_id="some_dataset", table_id="other_table"
                 )
             ),
         },
     )
    def test_union_dags_same_view_different_object(self) -> None:
        view = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset_1", table_id="other_table_1"
            ),
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )

        unioned_dag = BigQueryViewDagWalker.union_dags(
            BigQueryViewDagWalker([view]),
            BigQueryViewDagWalker(
                [
                    BigQueryView(
                        dataset_id="dataset_1",
                        view_id="table_1",
                        description="table_1 description",
                        should_materialize=True,
                        materialized_address_override=BigQueryAddress(
                            dataset_id="other_dataset_1", table_id="other_table_1"
                        ),
                        view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
                    )
                ]
            ),
        )

        self.assertCountEqual([view], unioned_dag.views)
    def test_dag_parents_materialized_non_default(self) -> None:
        self.maxDiff = None
        view_1 = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset_1", table_id="other_table_1"
            ),
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )
        view_2 = BigQueryView(
            dataset_id="dataset_2",
            view_id="table_2",
            description="table_2 description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset_2", table_id="other_table_2"
            ),
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
        )
        view_3 = BigQueryView(
            dataset_id="dataset_3",
            view_id="table_3",
            description="table_3 description",
            view_query_template="""
                SELECT * FROM `{project_id}.dataset_1.table_1`
                JOIN `{project_id}.other_dataset_2.other_table_2`
                USING (col)""",
        )
        walker = BigQueryViewDagWalker([view_1, view_2, view_3])

        def process_simple(
            view: BigQueryView, parent_results: Dict[BigQueryView, str]
        ) -> str:
            if view == view_3:
                # View 3 should have two parents
                self.assertEqual(
                    {view_1: view_1.view_id, view_2: view_2.view_id}, parent_results
                )

            return view.view_id

        result = walker.process_dag(process_simple)
        self.assertEqual(
            {view_1: view_1.view_id, view_2: view_2.view_id, view_3: view_3.view_id},
            result,
        )
def delete_unmanaged_views_and_tables_from_dataset(
    bq_client: BigQueryClient,
    dataset_id: str,
    managed_tables: Set[BigQueryAddress],
    dry_run: bool,
) -> Set[BigQueryAddress]:
    """This function takes in a set of managed views/tables and compares it to the list of
    tables BigQuery has. The function then deletes any views/tables that are in BigQuery but not
    in the set of managed views/tables. It then returns a set of the BigQueryAddress's
    from these unmanaged views/tables that are to be deleted."""
    unmanaged_views_and_tables: Set[BigQueryAddress] = set()
    dataset_ref = bq_client.dataset_ref_for_id(dataset_id)
    if not bq_client.dataset_exists(dataset_ref):
        raise ValueError("Dataset %s does not exist in BigQuery" % dataset_id)
    for table in list(bq_client.list_tables(dataset_id)):
        table_bq_address = BigQueryAddress.from_list_item(table)
        if table_bq_address not in managed_tables:
            unmanaged_views_and_tables.add(table_bq_address)
    for view in unmanaged_views_and_tables:
        if dry_run:
            logging.info(
                "[DRY RUN] Regular run would delete unmanaged table/view %s from dataset %s.",
                view.table_id,
                view.dataset_id,
            )

        else:
            logging.info(
                "Deleting unmanaged table/view %s from dataset %s.",
                view.table_id,
                view.dataset_id,
            )

            bq_client.delete_table(view.dataset_id, view.table_id)
    return unmanaged_views_and_tables
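A minimal usage sketch for the helper above; the client construction is an assumption (any concrete BigQueryClient would do) and the dataset/table names are placeholders. With dry_run=True nothing is deleted, the candidates are only logged:

# Assumes BigQueryAddress, a concrete BigQueryClient (e.g. BigQueryClientImpl), and the
# helper above are importable; all identifiers below are illustrative.
managed = {
    BigQueryAddress(dataset_id="dataset_1", table_id="table_1"),
    BigQueryAddress(dataset_id="dataset_1", table_id="table_2"),
}
would_be_deleted = delete_unmanaged_views_and_tables_from_dataset(
    bq_client=BigQueryClientImpl(),  # assumed concrete client implementation
    dataset_id="dataset_1",
    managed_tables=managed,
    dry_run=True,  # log only; delete_table is never called
)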
 def test_get_managed_views_for_dataset_map_all_views_same_dataset(self) -> None:
     # Arrange
     walker = BigQueryViewDagWalker(self.all_views_same_dataset)
     # Act
     result_dict = get_managed_view_and_materialized_table_addresses_by_dataset(
         walker
     )
     # Assert
     expected_result: Dict[str, Set[BigQueryAddress]] = {
         "dataset_1": {
             BigQueryAddress(dataset_id="dataset_1", table_id="table_1"),
             BigQueryAddress(dataset_id="dataset_1", table_id="table_2"),
             BigQueryAddress(dataset_id="dataset_1", table_id="table_3"),
         },
     }
     self.assertEqual(expected_result, result_dict)
 def test_get_managed_views_for_dataset_map_x_shaped_dag(self) -> None:
     # Arrange
     walker = BigQueryViewDagWalker(self.x_shaped_dag_views_list)
     # Act
     result_dict = get_managed_view_and_materialized_table_addresses_by_dataset(
         walker
     )
     # Assert
     expected_result: Dict[str, Set[BigQueryAddress]] = {
         "dataset_1": {BigQueryAddress(dataset_id="dataset_1", table_id="table_1")},
         "dataset_2": {BigQueryAddress(dataset_id="dataset_2", table_id="table_2")},
         "dataset_3": {BigQueryAddress(dataset_id="dataset_3", table_id="table_3")},
         "dataset_4": {BigQueryAddress(dataset_id="dataset_4", table_id="table_4")},
         "dataset_5": {BigQueryAddress(dataset_id="dataset_5", table_id="table_5")},
     }
     self.assertEqual(expected_result, result_dict)
Example #8
    def test_dataset_overrides_for_view_builders(self) -> None:
        view_builders = [
            SimpleBigQueryViewBuilder(
                dataset_id="dataset_1",
                view_id="my_fake_view",
                description="my_fake_view description",
                view_query_template="SELECT NULL LIMIT 0",
                should_materialize=True,
            ),
            SimpleBigQueryViewBuilder(
                dataset_id="dataset_2",
                view_id="my_fake_view_2",
                description="my_fake_view_2 description",
                view_query_template="SELECT NULL LIMIT 0",
                should_materialize=True,
                materialized_address_override=BigQueryAddress(
                    dataset_id="materialized_dataset", table_id="table_materialized"
                ),
            ),
        ]

        prefix = "my_prefix"
        overrides = dataset_overrides_for_view_builders(prefix, view_builders)

        expected_overrides = {
            "dataset_1": "my_prefix_dataset_1",
            "dataset_2": "my_prefix_dataset_2",
            "materialized_dataset": "my_prefix_materialized_dataset",
        }

        self.assertEqual(expected_overrides, overrides)
 def test_build_unioned_segments_view_with_dataset_overrides(self) -> None:
     config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)
     view_builder = UnionedStateSegmentsViewBuilder(
         config=config,
         table=StateBase.metadata.tables["state_person_external_id"],
         state_codes=[StateCode.US_XX, StateCode.US_WW],
     )
     view = view_builder.build(
         dataset_overrides={
             "state_regional": "my_prefix_state_regional",
             "us_xx_state_regional": "my_prefix_us_xx_state_regional",
             "us_ww_state_regional": "my_prefix_us_ww_state_regional",
         }
     )
     expected_query = (
         "SELECT state_person_external_id.external_id,state_person_external_id.state_code,"
         "state_person_external_id.id_type,state_person_external_id.person_external_id_id,"
         "state_person_external_id.person_id FROM "
         "`recidiviz-staging.my_prefix_us_xx_state_regional.state_person_external_id` state_person_external_id\n"
         "UNION ALL\n"
         "SELECT state_person_external_id.external_id,state_person_external_id.state_code,"
         "state_person_external_id.id_type,state_person_external_id.person_external_id_id,"
         "state_person_external_id.person_id FROM "
         "`recidiviz-staging.my_prefix_us_ww_state_regional.state_person_external_id` state_person_external_id"
     )
     self.assertEqual(expected_query, view.view_query)
     self.assertEqual(
         BigQueryAddress(
             dataset_id="my_prefix_state_regional",
             table_id="state_person_external_id",
         ),
         view.materialized_address,
     )
 def test_parse_view_materialized_parent(self) -> None:
     view = BigQueryView(
         dataset_id="my_dataset",
         view_id="my_view_id",
         description="my view description",
         view_query_template="SELECT * FROM `{project_id}.some_dataset.some_table_materialized`",
     )
     parent_view = BigQueryView(
         dataset_id="some_dataset",
         view_id="some_table",
         description="my parent view description",
         view_query_template="SELECT * FROM UNNEST([])",
         should_materialize=True,
     )
     node = BigQueryViewDagNode(view)
     if not parent_view.materialized_address:
         raise ValueError("Null materialized_address for view [{parent_view}]")
     node.set_materialized_addresss(
         {parent_view.materialized_address: DagKey.for_view(parent_view)}
     )
     self.assertEqual(
         node.parent_keys,
         {
             DagKey(
                 view_address=BigQueryAddress(
                     dataset_id="some_dataset", table_id="some_table"
                 )
             )
         },
     )
 def parent_tables(self) -> Set[BigQueryAddress]:
     """The set of actual tables/views referenced by this view."""
     parents = re.findall(r"`[\w-]*\.([\w-]*)\.([\w-]*)`",
                          self.view.view_query)
     return {
         BigQueryAddress(dataset_id=candidate[0], table_id=candidate[1])
         for candidate in parents
     }
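As a standalone illustration of the regex used in parent_tables above (project, dataset, and table names here are invented), the same pattern pulls out every (dataset, table) pair referenced in backticked addresses:

import re

sample_query = (
    "SELECT * FROM `my-project.some_dataset.some_table` "
    "JOIN `my-project.other_dataset.other_table` USING (col)"
)
pairs = re.findall(r"`[\w-]*\.([\w-]*)\.([\w-]*)`", sample_query)
# pairs == [('some_dataset', 'some_table'), ('other_dataset', 'other_table')]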
Example #12
    def test_build_view_state_legacy(self) -> None:
        table = one(t for t in StateBase.metadata.sorted_tables
                    if t.name == "state_person")
        view_builder = FederatedCloudSQLTableBigQueryViewBuilder(
            connection_region="us-east2",
            table=table,
            view_id=table.name,
            cloud_sql_query="SELECT * FROM state_person;",
            database_key=SQLAlchemyDatabaseKey.for_state_code(
                StateCode.US_XX,
                db_version=SQLAlchemyStateDatabaseVersion.LEGACY),
            materialized_address_override=BigQueryAddress(
                dataset_id="materialized_dataset",
                table_id="materialized_table"),
        )
        expected_description = """View providing a connection to the [state_person]
table in the [postgres] database in the [STATE] schema. This view is 
managed outside of regular view update operations and the results can be found in the 
schema-specific datasets (`state`, `jails`, `justice_counts`, etc)."""

        expected_view_query = f"""/*{expected_description}*/
SELECT
    *
FROM EXTERNAL_QUERY(
    "test-project.us-east2.state_cloudsql",
    "SELECT * FROM state_person;"
)"""

        # Build without dataset overrides
        view = view_builder.build()

        self.assertIsInstance(view, FederatedCloudSQLTableBigQueryView)
        self.assertEqual(expected_view_query, view.view_query)
        self.assertEqual(expected_description, view.description)
        self.assertEqual(
            BigQueryAddress(
                dataset_id="state_cloudsql_connection",
                table_id="state_person",
            ),
            view.address,
        )
        self.assertEqual(
            BigQueryAddress(dataset_id="materialized_dataset",
                            table_id="materialized_table"),
            view.materialized_address,
        )
Example #13
def print_dfs_tree(dataset_id: str,
                   view_id: str,
                   print_downstream_tree: bool = False) -> None:
    dag_walker = build_dag_walker(dataset_id, view_id)
    node = dag_walker.nodes_by_key[DagKey(
        view_address=BigQueryAddress(dataset_id=dataset_id, table_id=view_id))]
    dag_walker.populate_node_family_for_node(node=node)
    print(node.node_family.child_dfs_tree_str
          if print_downstream_tree else node.node_family.parent_dfs_tree_str)
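A hedged usage sketch for print_dfs_tree; the dataset and view names below are placeholders and assume the view exists in the deployed DAG:

print_dfs_tree("dataset_1", "table_1")  # print the tree of upstream (parent) views
print_dfs_tree("dataset_1", "table_1", print_downstream_tree=True)  # downstream (child) views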
 def test_delete_unmanaged_views_and_tables_dry_run(self) -> None:
     managed_tables = {BigQueryAddress(dataset_id="dataset_1", table_id="table_1")}
     self.mock_client.list_tables.return_value = [
         bigquery.table.TableListItem(self.mock_table_resource_ds_1_table_1),
         bigquery.table.TableListItem(self.mock_table_resource_ds_1_table_2),
     ]
     self.mock_client.dataset_ref_for_id.return_value = self.mock_dataset_ref_ds_1
     self.mock_client.dataset_exists.return_value = True
     expected_deleted_views: Set[BigQueryAddress] = {
         BigQueryAddress(dataset_id="dataset_1", table_id="table_2")
     }
     deleted_views = delete_unmanaged_views_and_tables_from_dataset(
         self.mock_client, "dataset_1", managed_tables, dry_run=True
     )
     self.mock_client.dataset_ref_for_id.assert_called()
     self.mock_client.dataset_exists.assert_called()
     self.mock_client.list_tables.assert_called()
     self.mock_client.delete_table.assert_not_called()
     self.assertEqual(deleted_views, expected_deleted_views)
Example #15
    def materialized_address_for_unsegmented_table(
            self, table: Table) -> BigQueryAddress:
        """Returns the dataset that data for a given table in an unsegmented schema is
        materialized into. Throws for state-segmented schemas.
        """
        if self.is_state_segmented_refresh_schema():
            raise ValueError(f"Unexpected schema type [{self.schema_type}]")

        dataset = self.unioned_regional_dataset(dataset_override_prefix=None)
        if self.schema_type == SchemaType.JUSTICE_COUNTS:
            # TODO(#7285): JUSTICE_COUNTS has a custom materialized location for
            #  backwards compatibility. Once we delete the legacy views at
            #  `justice_counts.{table_name}` etc, we will be able to write materialized
            #  tables to that location.
            return BigQueryAddress(
                dataset_id=dataset,
                table_id=f"{table.name}_materialized",
            )
        return BigQueryAddress(dataset_id=dataset, table_id=table.name)
    def test_parse_simple_view(self) -> None:
        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="my_view_id",
            description="my view description",
            view_query_template="SELECT * FROM `{project_id}.some_dataset.some_table`",
        )
        node = BigQueryViewDagNode(view)
        self.assertIsNone(view.materialized_address)
        node.set_materialized_addresss({})
        self.assertEqual(node.is_root, False)
        self.assertEqual(
            node.dag_key,
            DagKey(
                view_address=BigQueryAddress(
                    dataset_id="my_dataset", table_id="my_view_id"
                )
            ),
        )
        self.assertEqual(
            node.parent_keys,
            {
                DagKey(
                    view_address=BigQueryAddress(
                        dataset_id="some_dataset", table_id="some_table"
                    )
                )
            },
        )
        self.assertEqual(node.child_keys, set())

        node.is_root = True
        child_key = DagKey(
            view_address=BigQueryAddress(
                dataset_id="other_dataset", table_id="other_table"
            )
        )
        node.add_child_key(child_key)

        self.assertEqual(node.is_root, True)
        self.assertEqual(node.child_keys, {child_key})
    def test_populate_node_family_full_parentage_complex_dependencies(self) -> None:
        view_1 = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )
        view_2 = BigQueryView(
            dataset_id="dataset_2",
            view_id="table_2",
            description="table_2 description",
            view_query_template="SELECT * FROM `{project_id}.dataset_1.table_1`",
        )
        view_3 = BigQueryView(
            dataset_id="dataset_3",
            view_id="table_3",
            description="table_3 description",
            view_query_template="""
                           SELECT * FROM `{project_id}.dataset_1.table_1`
                           JOIN `{project_id}.dataset_2.table_2`
                           USING (col)""",
        )
        view_4 = BigQueryView(
            dataset_id="dataset_4",
            view_id="table_4",
            description="table_4 description",
            view_query_template="""
                           SELECT * FROM `{project_id}.dataset_2.table_2`
                           JOIN `{project_id}.dataset_3.table_3`
                           USING (col)""",
        )

        dag_walker = BigQueryViewDagWalker([view_1, view_2, view_3, view_4])
        start_node = dag_walker.node_for_view(view_4)

        dag_walker.populate_node_family_for_node(
            node=start_node, view_source_table_datasets={"source_dataset"}
        )
        expected_parent_nodes = {
            DagKey(
                view_address=BigQueryAddress(
                    dataset_id="source_dataset", table_id="source_table"
                )
            ),
            DagKey.for_view(view_1),
            DagKey.for_view(view_2),
            DagKey.for_view(view_3),
        }
        self.assertEqual(expected_parent_nodes, start_node.node_family.full_parentage)
        def _get_one_way_dependencies(
            descendants: bool = False, ) -> Tuple[Set[DagKey], str]:
            """Returns a set of all dependent DagKeys in one direction, and a string
             representation of that tree.

            If |descendants| is True, returns info about the tree of views that are
            dependent on the view. If |descendants| is False, returns info about the
            tree of all views that this view depends on."""
            stack = [(
                DagKey(view_address=BigQueryAddress(
                    dataset_id=node.view.dataset_id,
                    table_id=node.view.table_id)),
                0,
            )]
            tree = ""
            full_dependencies: Set[DagKey] = set()
            while len(stack) > 0:
                dag_key, tabs = stack.pop()
                if not datasets_to_skip or dag_key.dataset_id not in datasets_to_skip:
                    table_name = (custom_node_formatter(dag_key)
                                  if custom_node_formatter else
                                  f"{dag_key.dataset_id}.{dag_key.table_id}")
                    tree += ("|" if tabs else "") + ("--" *
                                                     tabs) + table_name + "\n"

                # Stop if we reached a source view
                if (view_source_table_datasets and not descendants
                        and dag_key.dataset_id in view_source_table_datasets):
                    continue

                curr_node = self.nodes_by_key.get(dag_key)
                if curr_node:
                    next_related_keys = (curr_node.child_keys if descendants
                                         else curr_node.parent_keys)

                    for related_key in sorted(
                            next_related_keys,
                            key=lambda key: (key.dataset_id, key.table_id),
                            reverse=descendants,
                    ):
                        full_dependencies.add(related_key)
                        stack.append((
                            related_key,
                            # We don't add a tab if we are skipping a view
                            tabs if datasets_to_skip
                            and dag_key.dataset_id in datasets_to_skip else
                            tabs + 1,
                        ))
            return full_dependencies, tree
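For readers unfamiliar with the pattern, here is a simplified, self-contained sketch of the same explicit-stack traversal used by _get_one_way_dependencies, reduced to a plain adjacency dict; the graph, names, and rendering are illustrative and not taken from the original code:

from typing import Dict, List, Set, Tuple

def dfs_tree(graph: Dict[str, List[str]], start: str) -> Tuple[Set[str], str]:
    """Walks `graph` from `start` with an explicit stack, returning the set of nodes
    reachable from `start` (excluding `start` itself) and an indented text rendering
    of the traversal."""
    stack: List[Tuple[str, int]] = [(start, 0)]
    reached: Set[str] = set()
    tree = ""
    while stack:
        node, tabs = stack.pop()
        # Same rendering convention as above: a leading "|" plus "--" per level.
        tree += ("|" if tabs else "") + ("--" * tabs) + node + "\n"
        for neighbor in sorted(graph.get(node, []), reverse=True):
            reached.add(neighbor)
            stack.append((neighbor, tabs + 1))
    return reached, tree

# Tiny fake dependency graph: view_a depends on view_b and view_c, view_b on view_c.
deps = {"view_a": ["view_b", "view_c"], "view_b": ["view_c"], "view_c": []}
reachable, rendering = dfs_tree(deps, "view_a")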
Example #19
 def create_mock_bq_table(
     self,
     dataset_id: str,
     table_id: str,
     mock_schema: MockTableSchema,
     mock_data: pd.DataFrame,
 ) -> None:
     postgres_table_name = self.register_bq_address(
         address=BigQueryAddress(dataset_id=dataset_id, table_id=table_id))
     mock_data.to_sql(
         name=postgres_table_name,
         con=self.postgres_engine,
         dtype=mock_schema.data_types,
         index=False,
     )
Example #20
    def materialized_address_for_segment_table(
        self,
        table: Table,
        state_code: StateCode,
    ) -> BigQueryAddress:
        """Returns the dataset that data for a given table in a state segment is
        materialized into. Throws for unsegmented schemas.
        """
        if not self.is_state_segmented_refresh_schema():
            raise ValueError(f"Unexpected schema type [{self.schema_type}]")

        return BigQueryAddress(
            dataset_id=self.materialized_dataset_for_segment(state_code),
            table_id=table.name,
        )
 def test_delete_unmanaged_views_and_tables_dataset_doesnt_exist(self) -> None:
     managed_tables = {BigQueryAddress(dataset_id="dataset_1", table_id="table_1")}
     self.mock_client.list_tables.return_value = [
         bigquery.table.TableListItem(self.mock_table_resource_ds_1_table_1),
         bigquery.table.TableListItem(self.mock_table_resource_ds_1_table_2),
     ]
     self.mock_client.dataset_ref_for_id.return_value = self.mock_dataset_ref_ds_1
     self.mock_client.dataset_exists.return_value = False
     with self.assertRaises(ValueError):
         delete_unmanaged_views_and_tables_from_dataset(
             self.mock_client, "dataset_bogus", managed_tables, dry_run=False
         )
     self.mock_client.dataset_ref_for_id.assert_called()
     self.mock_client.dataset_exists.assert_called()
     self.mock_client.list_tables.assert_not_called()
     self.mock_client.delete_table.assert_not_called()
    def assertIsValidEmptyParentsView(self, node: BigQueryViewDagNode) -> None:
        """Fails the test if a view that has no parents is an expected view with no
        parents. Failures could be indicative of poorly formed view queries.
        """
        known_empty_parent_view_addresses = {
            # These views unnest data from a static list
            BigQueryAddress(
                dataset_id="census_managed_views",
                table_id="charge_class_severity_ranks",
            ),
            BigQueryAddress(
                dataset_id="analyst_data",
                table_id="admission_start_reason_dedup_priority",
            ),
            BigQueryAddress(
                dataset_id="analyst_data",
                table_id="release_termination_reason_dedup_priority",
            ),
            BigQueryAddress(
                dataset_id="analyst_data", table_id="violation_type_dedup_priority"
            ),
            BigQueryAddress(
                dataset_id="analyst_data", table_id="supervision_level_dedup_priority"
            ),
            BigQueryAddress(
                dataset_id="analyst_data", table_id="compartment_level_2_dedup_priority"
            ),
            # Generate data using pure date functions
            BigQueryAddress(
                dataset_id="reference_views", table_id="covid_report_weeks"
            ),
            BigQueryAddress(
                dataset_id="population_projection_data", table_id="simulation_run_dates"
            ),
        }
        if node.dag_key.view_address in known_empty_parent_view_addresses:
            return

        if "FROM EXTERNAL_QUERY" in node.view.view_query:
            return

        self.fail(node.dag_key)
    def _get_all_config_view_addresses_for_product(
        product: ProductConfig, ) -> Set[BigQueryAddress]:
        """Returns a set containing a BQ address for each view listed by each export
        necessary for the given product."""
        all_config_view_addresses: Set[BigQueryAddress] = set()
        for export in product.exports:
            collection_config = VIEW_COLLECTION_EXPORT_INDEX[export]
            view_builders = collection_config.view_builders_to_export

            all_config_view_addresses = all_config_view_addresses.union({
                BigQueryAddress(
                    dataset_id=view_builder.dataset_id,
                    table_id=view_builder.view_id,
                )
                for view_builder in view_builders
            })

        return all_config_view_addresses
Example #24
 def test_materialized_address_override_same_as_view_throws(self) -> None:
     with self.assertRaises(ValueError) as e:
         _ = BigQueryView(
             dataset_id="view_dataset",
             view_id="my_view",
             description="my_view description",
             should_materialize=True,
             materialized_address_override=BigQueryAddress(
                 dataset_id="view_dataset", table_id="my_view"),
             view_query_template=
             "SELECT * FROM `{project_id}.{some_dataset}.table`",
             some_dataset="a_dataset",
         )
     self.assertEqual(
         str(e.exception),
         "Materialized address override "
         "[BigQueryAddress(dataset_id='view_dataset', table_id='my_view')] cannot be "
         "same as view itself.",
     )
Example #25
 def test_materialized_address_override_no_should_materialize_throws(
         self) -> None:
     with self.assertRaises(ValueError) as e:
         _ = BigQueryView(
             dataset_id="view_dataset",
             view_id="my_view",
             description="my_view description",
             materialized_address_override=BigQueryAddress(
                 dataset_id="view_dataset_materialized",
                 table_id="my_view_table",
             ),
             view_query_template=
             "SELECT * FROM `{project_id}.{some_dataset}.table`",
             some_dataset="a_dataset",
         )
     self.assertTrue(
         str(e.exception).startswith(
             "Found nonnull materialized_address_override ["
             "BigQueryAddress(dataset_id='view_dataset_materialized', table_id='my_view_table')] "
             "when `should_materialize` is not True"))
Example #26
    def _rewrite_table_references(self, query: str) -> str:
        """Maps BQ table references to the underlying Postgres tables"""
        table_reference_regex = re.compile(
            r"`[\w-]+\.(?P<dataset_id>[\w-]+)\.(?P<table_id>[\w-]+)`")
        for match in re.finditer(table_reference_regex, query):
            table_reference = match.group()
            dataset_id, table_id = match.groups()
            dataset_match = re.match(r"(us_[a-z]{2})_raw_data", dataset_id)
            if dataset_match:
                dataset_id = dataset_match.group(1)  # region_code
                table_id = table_id.lower()

            location = BigQueryAddress(dataset_id=dataset_id,
                                       table_id=table_id)
            if location not in self.mock_bq_to_postgres_tables:
                raise KeyError(
                    f"BigQuery location [{location}] not properly registered - must be "
                    f"created via create_mock_bq_table.")
            query = query.replace(table_reference,
                                  self.mock_bq_to_postgres_tables[location])
        return query
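A small, self-contained sketch of the raw-data special case handled in _rewrite_table_references (the dataset and table names are invented): references to a us_xx_raw_data dataset collapse to the region code, and the table name is lowercased before lookup:

import re

dataset_id, table_id = "us_xx_raw_data", "MyRawTable"
dataset_match = re.match(r"(us_[a-z]{2})_raw_data", dataset_id)
if dataset_match:
    dataset_id = dataset_match.group(1)  # -> "us_xx" (the region code)
    table_id = table_id.lower()          # -> "myrawtable"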
Example #27
    def __init__(
        self,
        *,
        config: CloudSqlToBQConfig,
        table: Table,
        state_codes: List[StateCode],
    ):
        if not config.is_state_segmented_refresh_schema():
            raise ValueError(
                f"Unexpected schema type [{config.schema_type.name}]")

        self.config = config
        self.table = table
        self.state_codes = state_codes
        # Dataset prefixing will be handled automatically by view building logic
        self.dataset_id = config.unioned_regional_dataset(
            dataset_override_prefix=None)
        self.view_id = f"{table.name}_view"
        self.materialized_address_override = BigQueryAddress(
            dataset_id=self.dataset_id,
            table_id=table.name,
        )
Example #28
    # second, iterate over Sessions tables
    for view_builder in SESSIONS_SOURCE_VIEW_BUILDERS:

        # ensure view_builder has a defined materialized address
        if view_builder.materialized_address is None:
            raise ValueError(
                f"Materialized address not defined for view: {view_builder.view_id}"
            )

        # append each unique view to the builder
        PARTNER_SHARED_SESSIONS_VIEW_BUILDERS.append(
            SimpleBigQueryViewBuilder(
                dataset_id=EXTERNALLY_SHARED_VIEWS_DATASET,
                view_id=view_prefix + view_builder.view_id,
                view_query_template=SESSIONS_QUERY_TEMPLATE,
                description=view_builder.description,
                origin_dataset_id=view_builder.dataset_id,
                origin_table_id=view_builder.materialized_address.table_id,
                allowed_states=str(allowed_states),
                should_materialize=True,
                materialized_address_override=BigQueryAddress(
                    dataset_id=destination_dataset_id,
                    table_id=view_builder.materialized_address.table_id,
                ),
            ))

if __name__ == "__main__":
    with local_project_id_override(GCP_PROJECT_STAGING):
        for view_builder in PARTNER_SHARED_SESSIONS_VIEW_BUILDERS:
            view_builder.build_and_print()
Example #29
    
    SELECT
      *,
      ROUND(IEEE_DIVIDE(recidivated_releases, releases), 2) as recidivism_rate
    FROM
      recidivism_numbers
    UNION ALL
    SELECT
      *
    FROM
      pa_recidivism
    ORDER BY state_code, release_cohort, followup_years, gender, age_bucket, race_or_ethnicity
    """

# TODO(#7373): Manage this table automatically.
PA_RECIDIVISM_ADDRESS = BigQueryAddress(dataset_id="us_pa_supplemental",
                                        table_id="recidivism")

RECIDIVISM_RATES_BY_COHORT_BY_YEAR_VIEW_BUILDER = MetricBigQueryViewBuilder(
    dataset_id=dataset_config.PUBLIC_DASHBOARD_VIEWS_DATASET,
    view_id=RECIDIVISM_RATES_BY_COHORT_BY_YEAR_VIEW_NAME,
    view_query_template=RECIDIVISM_RATES_BY_COHORT_BY_YEAR_VIEW_QUERY_TEMPLATE,
    dimensions=(
        "state_code",
        "release_cohort",
        "followup_years",
        "gender",
        "age_bucket",
        "race_or_ethnicity",
    ),
    description=RECIDIVISM_RATES_BY_COHORT_BY_YEAR_VIEW_DESCRIPTION,
    materialized_metrics_dataset=dataset_config.
Example #30
    def test_materialized_address_dataset_overrides(self) -> None:
        dataset_overrides = {
            "view_dataset": "my_override_view_dataset",
            "other_dataset": "my_override_other_dataset",
        }

        view_materialized_no_override = BigQueryView(
            dataset_id="view_dataset",
            view_id="my_view",
            description="my_view description",
            should_materialize=True,
            dataset_overrides=dataset_overrides,
            view_query_template=
            "SELECT * FROM `{project_id}.{some_dataset}.table`",
            some_dataset="a_dataset",
        )

        self.assertEqual(
            BigQueryAddress(dataset_id="my_override_view_dataset",
                            table_id="my_view_materialized"),
            view_materialized_no_override.materialized_address,
        )
        self.assertEqual(
            BigQueryAddress(dataset_id="my_override_view_dataset",
                            table_id="my_view_materialized"),
            view_materialized_no_override.table_for_query,
        )

        view_with_override = BigQueryView(
            dataset_id="view_dataset",
            view_id="my_view",
            description="my_view description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset",
                table_id="my_view_table",
            ),
            dataset_overrides=dataset_overrides,
            view_query_template=
            "SELECT * FROM `{project_id}.{some_dataset}.table`",
            some_dataset="a_dataset",
        )

        self.assertEqual(
            BigQueryAddress(dataset_id="my_override_other_dataset",
                            table_id="my_view_table"),
            view_with_override.materialized_address,
        )
        self.assertEqual(
            BigQueryAddress(dataset_id="my_override_other_dataset",
                            table_id="my_view_table"),
            view_with_override.table_for_query,
        )

        view_not_materialized = BigQueryView(
            dataset_id="view_dataset",
            view_id="my_view",
            description="my_view description",
            dataset_overrides=dataset_overrides,
            view_query_template=
            "SELECT * FROM `{project_id}.{some_dataset}.table`",
            some_dataset="a_dataset",
        )

        self.assertIsNone(view_not_materialized.materialized_address)
        self.assertEqual(
            BigQueryAddress(dataset_id="my_override_view_dataset",
                            table_id="my_view"),
            view_not_materialized.table_for_query,
        )