Example #1
def __init__(self,
             *,
             project_id: Optional[str] = None,
             region_code: str,
             view_id: str,
             view_query_template: str,
             raw_file_config: DirectIngestRawFileConfig):
    # Fail fast: a raw data view cannot be built for a table with no primary key columns.
    if not raw_file_config.primary_key_cols:
        raise ValueError(
            f'Empty primary key list in raw file config with tag [{raw_file_config.file_tag}] during '
            f'construction of DirectIngestRawDataTableBigQueryView')

    # Derive the view dataset and the backing raw table dataset from the region code.
    view_dataset_id = f'{region_code.lower()}_raw_data_up_to_date_views'
    raw_table_dataset_id = DirectIngestRawFileImportManager.raw_tables_dataset_for_region(
        region_code)

    # Build the SQL fragments that depend on this file's raw data configuration.
    except_clause = self._except_clause_for_config(raw_file_config)
    datetime_cols_clause = self._datetime_cols_clause_for_config(
        raw_file_config)
    supplemental_order_by_clause = self._supplemental_order_by_clause_for_config(
        raw_file_config)

    super().__init__(
        project_id=project_id,
        dataset_id=view_dataset_id,
        view_id=view_id,
        view_query_template=view_query_template,
        raw_table_dataset_id=raw_table_dataset_id,
        raw_table_name=raw_file_config.file_tag,
        raw_table_primary_key_str=raw_file_config.primary_key_str,
        except_clause=except_clause,
        datetime_cols_clause=datetime_cols_clause,
        supplemental_order_by_clause=supplemental_order_by_clause)
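
For orientation, a minimal self-contained sketch of the same construction pattern (keyword-only arguments, fail-fast validation, then delegation to the parent constructor). RawFileConfig, BaseView, RawDataView, and the 'US_XX' region code are simplified stand-ins invented for this sketch, not the real Recidiviz types.

from dataclasses import dataclass, field
from typing import List


@dataclass(frozen=True)
class RawFileConfig:
    """Simplified stand-in for DirectIngestRawFileConfig."""
    file_tag: str
    primary_key_cols: List[str] = field(default_factory=list)

    @property
    def primary_key_str(self) -> str:
        return ", ".join(self.primary_key_cols)


class BaseView:
    """Simplified stand-in for the BigQuery view base class."""

    def __init__(self, *, dataset_id: str, view_id: str, **query_kwargs: str) -> None:
        self.dataset_id = dataset_id
        self.view_id = view_id
        self.query_kwargs = query_kwargs


class RawDataView(BaseView):
    def __init__(self, *, region_code: str, view_id: str,
                 raw_file_config: RawFileConfig) -> None:
        # Fail fast before any other work, mirroring the check in Example #1.
        if not raw_file_config.primary_key_cols:
            raise ValueError(
                f'Empty primary key list in raw file config with tag [{raw_file_config.file_tag}]')
        super().__init__(
            dataset_id=f'{region_code.lower()}_raw_data_up_to_date_views',
            view_id=view_id,
            raw_table_name=raw_file_config.file_tag,
            raw_table_primary_key_str=raw_file_config.primary_key_str)


# Constructing a view for a config with primary keys succeeds; an empty
# primary_key_cols list raises ValueError, as in the original snippet.
view = RawDataView(
    region_code='US_XX',
    view_id='my_table_latest',
    raw_file_config=RawFileConfig(file_tag='my_table', primary_key_cols=['id']))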
Example #2
def __init__(self,
             *,
             project_id: Optional[str] = None,
             region_code: str,
             view_id: str,
             view_query_template: str,
             raw_file_config: DirectIngestRawFileConfig):
    # Same construction as Example #1, but without the primary-key validation step.
    view_dataset_id = f'{region_code.lower()}_raw_data_up_to_date_views'
    raw_table_dataset_id = DirectIngestRawFileImportManager.raw_tables_dataset_for_region(
        region_code)

    # Build the SQL fragments that depend on this file's raw data configuration.
    except_clause = self._except_clause_for_config(raw_file_config)
    datetime_cols_clause = self._datetime_cols_clause_for_config(
        raw_file_config)
    supplemental_order_by_clause = self._supplemental_order_by_clause_for_config(
        raw_file_config)

    super().__init__(
        project_id=project_id,
        dataset_id=view_dataset_id,
        view_id=view_id,
        view_query_template=view_query_template,
        raw_table_dataset_id=raw_table_dataset_id,
        raw_table_name=raw_file_config.file_tag,
        raw_table_primary_key_str=raw_file_config.primary_key_str,
        except_clause=except_clause,
        datetime_cols_clause=datetime_cols_clause,
        supplemental_order_by_clause=supplemental_order_by_clause)
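
Both constructors delegate SQL-fragment construction to helpers such as _except_clause_for_config, whose implementations are not shown here. The function below is a hypothetical sketch of the kind of fragment such a helper could produce, using BigQuery's SELECT * EXCEPT (...) syntax; the name build_except_clause and the columns_to_exclude parameter are assumptions made for this illustration only.

from typing import Sequence


def build_except_clause(columns_to_exclude: Sequence[str]) -> str:
    """Builds a BigQuery `EXCEPT (...)` fragment for use directly after `SELECT *`.

    Returns an empty string when nothing is excluded, so the caller can always
    interpolate the result into its view query template.
    """
    if not columns_to_exclude:
        return ''
    return f"EXCEPT ({', '.join(columns_to_exclude)})"


# Produces: EXCEPT (file_id, update_datetime)
print(build_except_clause(['file_id', 'update_datetime']))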
Example #3
def compare_raw_data_between_projects(
    region_code: str,
    source_project_id: str = environment.GCP_PROJECT_STAGING,
    comparison_project_id: str = environment.GCP_PROJECT_PRODUCTION,
) -> List[str]:
    """Compares the raw data between staging and production for a given region."""
    logging.info(
        "**** Ensuring all raw data for [%s] in [%s] also exists in [%s] ****",
        region_code.upper(),
        source_project_id,
        comparison_project_id,
    )

    raw_file_config = DirectIngestRegionRawFileConfig(region_code)

    bq_client = BigQueryClientImpl(project_id=source_project_id)
    dataset_id = DirectIngestRawFileImportManager.raw_tables_dataset_for_region(
        region_code)
    source_dataset = bq_client.dataset_ref_for_id(dataset_id)

    # Kick off one asynchronous comparison query per raw file that exists in the
    # source project, is documented, and has primary key columns defined.
    query_jobs: Dict[str, bigquery.QueryJob] = {}
    for file_tag, file_config in raw_file_config.raw_file_configs.items():
        if (not bq_client.table_exists(source_dataset, file_tag)
                or file_config.is_undocumented
                or not file_config.primary_key_cols):
            continue

        columns = ", ".join(
            [column.name for column in file_config.available_columns])

        query_job = bq_client.run_query_async(
            query_str=COMPARISON_TEMPLATE.format(
                source_project_id=source_project_id,
                comparison_project_id=comparison_project_id,
                raw_data_dataset_id=dataset_id,
                raw_data_table_id=file_tag,
                columns=columns,
            ))
        query_jobs[file_tag] = query_job

    # Pad file tags so the per-table log lines below stay aligned, capped at 30 characters.
    table_column_width = min(
        max(len(tag) for tag in raw_file_config.raw_file_configs), 30)

    failed_tables: List[str] = []
    for file_tag in sorted(raw_file_config.raw_file_tags):
        justified_name = file_tag.ljust(table_column_width)

        if file_tag not in query_jobs:
            # This file did not exist in the project that is the source of truth.
            continue

        query_job = query_jobs[file_tag]
        try:
            rows = query_job.result()
        except exceptions.NotFound:
            logging.warning(
                "%s | Missing table %s.%s.%s",
                justified_name,
                comparison_project_id,
                dataset_id,
                file_tag,
            )
            failed_tables.append(file_tag)
            continue

        # Each result row is an (update_datetime, number of missing rows) pair.
        counts: List[Tuple[datetime.datetime, int]] = [row.values() for row in rows]

        if counts:
            logging.warning(
                "%s | Missing data in the %s table",
                justified_name,
                comparison_project_id,
            )
            for update_datetime, num_missing in counts:
                logging.warning("\t%ss: %d", update_datetime.isoformat(),
                                num_missing)
            failed_tables.append(file_tag)
        else:
            logging.info(
                "%s | %s contains all of the data from %s",
                justified_name,
                comparison_project_id,
                source_project_id,
            )

    return failed_tables
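
A minimal calling sketch for the comparison function above, assuming it is importable from a module named compare_raw_data (the module name and the 'US_XX' region code are placeholders). The script exits non-zero when any table is missing data, which makes it easy to wire into an automated check.

import logging
import sys

from compare_raw_data import compare_raw_data_between_projects

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    failed_tables = compare_raw_data_between_projects(region_code='US_XX')
    if failed_tables:
        logging.error('Found %d table(s) with missing data: %s',
                      len(failed_tables), ', '.join(sorted(failed_tables)))
        sys.exit(1)
    logging.info('All raw data tables match between projects.')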