    def test_import_wrong_separator_cols_do_not_parse(self) -> None:
        file_config = self.import_manager.region_raw_file_config.raw_file_configs[
            "tagC"]
        updated_file_config = attr.evolve(file_config, separator="#")
        self.import_manager.region_raw_file_config.raw_file_configs[
            "tagC"] = updated_file_config

        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        with self.assertRaises(ValueError) as e:
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestRawFileMetadata))
        self.assertTrue(
            str(e.exception).startswith(
                "Found only one column: [COL1__COL2_COL3]. "
                "Columns likely did not parse properly."))
    def test_unexpected_file(self):
        # Only file is out of order
        path = self._normalized_path_for_filename(
            'tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_1)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            path,
                                            has_fixture=False)

        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        self.assertEqual(next_job_args.file_path, path)
        self.assertFalse(
            self.prioritizer.are_next_args_expected(next_job_args))

        # ... job runs eventually even though unexpected...

        self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagA
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
    def test_import_bq_file_with_raw_file_invalid_column_chars(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagInvalidCharacters.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagInvalidCharacters',
            destination_table_schema=[
                bigquery.SchemaField('COL_1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('_COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('_3COL', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('_4_COL', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
            ])
        self.assertEqual(1, self.num_lines_uploaded)
        self._check_no_temp_files_remain()
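# A sketch of the column normalization this test implies. BigQuery column
# names may only contain letters, digits, and underscores and may not begin
# with a digit; the input headers below are plausible reconstructions of the
# tagInvalidCharacters.csv fixture, not copied from it:
import re

def normalize_column(column: str) -> str:
    # Replace every disallowed character with an underscore, then prefix a
    # leading digit with an underscore.
    normalized = re.sub(r"[^a-zA-Z0-9_]", "_", column)
    if normalized[0].isdigit():
        normalized = f"_{normalized}"
    return normalized

assert [normalize_column(c) for c in ["COL 1", " COL2", "3COL", "4 COL"]] == [
    "COL_1", "_COL2", "_3COL", "_4_COL"]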
    def test_files_on_multiple_days(self):
        paths = [
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_1),
            self._normalized_path_for_filename(
                'tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_2_TIME_1),

            # This file shouldn't get picked up
            self._normalized_path_for_filename(
                'tagC.csv', GcsfsDirectIngestFileType.RAW_DATA,
                self._DAY_1_TIME_3)
        ]
        for path in paths:
            fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                                path,
                                                has_fixture=False)

        # Exclude last raw file
        expected_processed_paths = paths[0:-1]

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(
            expected_processed_paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))
    def test_files_on_multiple_days_mixed_file_types(self) -> None:
        paths = [
            self._normalized_path_for_filename(
                "tagA.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_1),
            self._normalized_path_for_filename(
                "tagB.csv", GcsfsDirectIngestFileType.RAW_DATA,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                "tagA.csv", GcsfsDirectIngestFileType.UNSPECIFIED,
                self._DAY_2_TIME_1),
        ]
        for path in paths:
            fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                                path,
                                                has_fixture=False)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))
Example #6
    def test_run_full_ingest_all_files(self):
        file_tags = sorted(self.controller.get_file_tag_rank_list())
        file_path = path_for_fixture_file(
            self.controller, 'VERABrazosJailData_01012019_115703.csv', False)
        fixture_util.add_direct_ingest_path(self.controller.fs.gcs_file_system,
                                            file_path)
        process_task_queues(self, self.controller, file_tags)
    def test_get_unprocessed_raw_files_to_import(self) -> None:
        self.assertEqual(
            [], self.import_manager.get_unprocessed_raw_files_to_import())

        raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_second.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            raw_unprocessed,
                                            has_fixture=False)
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            ingest_view_unprocessed,
                                            has_fixture=False)

        self.assertEqual(
            [raw_unprocessed],
            self.import_manager.get_unprocessed_raw_files_to_import())
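# The assertion above hinges on filtering by parsed file type: the ingest view
# file is skipped even though it is also unprocessed. A minimal sketch of that
# filter, assuming the filesystem helper used elsewhere in these tests:
def unprocessed_raw_files(fs, bucket_path):
    return fs.get_unprocessed_file_paths(
        bucket_path, file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)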
    def test_import_bq_file_with_raw_file_invalid_column_chars(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="tagInvalidCharacters.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagInvalidCharacters",
            destination_table_schema=[
                bigquery.SchemaField("COL_1", "STRING", "NULLABLE"),
                bigquery.SchemaField("_COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("_3COL", "STRING", "NULLABLE"),
                bigquery.SchemaField("_4_COL", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(1, self.num_lines_uploaded)
        self._check_no_temp_files_remain()
Example #9
    def export_query_results_to_cloud_storage(
            self, export_configs: List[ExportQueryConfig]) -> None:
        for export_config in export_configs:
            export_path = GcsfsFilePath.from_absolute_path(
                export_config.output_uri)
            fixture_util.add_direct_ingest_path(self.fs, export_path)
            self.exported_file_tags.append(
                filename_parts_from_path(export_path).file_tag)
    def run_parse_file_test(self, expected: IngestInfo,
                            fixture_file_name: str) -> IngestInfo:
        """Runs a test that reads and parses a given fixture file. Returns the
        parsed IngestInfo object for tests to run further validations."""
        args = ingest_args_for_fixture_file(self.controller,
                                            f"{fixture_file_name}.csv")

        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.is_ingest_launched_in_env():
            now = datetime.datetime.now()
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=fixture_file_name,
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
                output_bucket_name=self.controller.ingest_bucket_path.bucket_name,
            )

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            fixture_util.add_direct_ingest_path(
                self.controller.fs.gcs_file_system,
                args.file_path,
                region_code=self.controller.region_code(),
            )

        # pylint:disable=protected-access
        fixture_contents_handle = self.controller._get_contents_handle(args)

        if fixture_contents_handle is None:
            self.fail("fixture_contents_handle should not be None")
        final_info = self.controller._parse(args, fixture_contents_handle)

        print_visible_header_label("FINAL")
        print(final_info)

        print_visible_header_label("EXPECTED")
        print(expected)

        self.assertEqual(expected, final_info)

        return final_info
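    # A hypothetical call site for the helper above; the fixture tag and the
    # builder call are illustrative only and assume the ingest_info models
    # expose a create_person-style builder:
    def test_parse_tagA(self):
        expected = IngestInfo()
        expected.create_person(person_id="12345")
        self.run_parse_file_test(expected, "tagA")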
    def test_single_expected_file(self):
        path = self._normalized_path_for_filename(
            'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_1)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            path,
                                            has_fixture=False)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order([path])

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagB
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
    def test_import_bq_file_with_raw_file_normalization_conflict(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagNormalizationConflict.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            file_path)

        # Only the import itself should raise; keeping setup outside the
        # assertRaises block avoids a spurious pass if setup fails.
        with self.assertRaises(ValueError) as e:
            self.import_manager.import_raw_file_to_big_query(
                file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(
            str(e.exception),
            "Multiple columns with name [_4COL] after normalization.")
    def test_files_on_multiple_days_with_gap(self) -> None:
        """Runs a test where there are files on multiple days and there is a gap
        in the expected files for the first day.
        """
        paths = [
            self._normalized_path_for_filename(
                "tagB.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                "tagA.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_2_TIME_1),
        ]
        for path in paths:
            fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                                path,
                                                has_fixture=False)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            if next_job_args is None:
                self.fail("Next job args unexpectedly None")
            self.assertEqual(next_job_args.file_path, path)

            are_args_expected = self.prioritizer.are_next_args_expected(
                next_job_args)
            if i == 0:
                self.assertFalse(are_args_expected)
            else:
                self.assertTrue(are_args_expected)

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))
    def test_import_bq_file_multiple_chunks_uneven_division(self) -> None:

        self.import_manager.upload_chunk_size = 2

        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagPipeSeparatedNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

        expected_insert_calls = [
            call(
                source_uri=uploaded_path.uri(),
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, "us_xx_raw_data"),
                destination_table_id="tagPipeSeparatedNonUTF8",
                destination_table_schema=[
                    bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                    bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                    bigquery.SchemaField("update_datetime", "DATETIME",
                                         "REQUIRED"),
                ],
            ) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
        ]

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
            expected_insert_calls, any_order=True)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()
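# Why three uploads: with upload_chunk_size = 2 and 5 data lines, uneven
# division leaves a short final chunk. A minimal sketch of that arithmetic:
def chunked(rows, chunk_size):
    for start in range(0, len(rows), chunk_size):
        yield rows[start:start + chunk_size]

assert [len(c) for c in chunked(list(range(5)), 2)] == [2, 2, 1]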
    def test_multiple_files_times_out_of_order(self) -> None:
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = [
            self._normalized_path_for_filename(
                "tagA.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                "tagB.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_1),
            self._normalized_path_for_filename(
                "tagB.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_3),
        ]
        for path in paths:
            fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                                path,
                                                has_fixture=False)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            if next_job_args is None:
                self.fail("Next job args unexpectedly None")
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            are_more_jobs_expected = self.prioritizer.are_more_jobs_expected_for_day(
                date_str)
            if i == 2:
                self.assertFalse(are_more_jobs_expected)
            else:
                self.assertTrue(are_more_jobs_expected)

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
    def test_import_bq_file_with_migrations(self) -> None:
        file_datetime = migrations_tagC.DATE_1
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=file_datetime,
        )
        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        mock_query_jobs = [
            mock.MagicMock(),
            mock.MagicMock(),
        ]

        self.mock_big_query_client.run_query_async.side_effect = mock_query_jobs

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.mock_big_query_client.run_query_async.assert_has_calls([
            mock.call(
                query_str="UPDATE `recidiviz-456.us_xx_raw_data.tagC` original\n"
                "SET COL1 = updates.new__COL1\n"
                "FROM (SELECT * FROM UNNEST([\n"
                "    STRUCT('123' AS COL1, CAST('2020-06-10T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1),\n"
                "    STRUCT('123' AS COL1, CAST('2020-09-21T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1)\n"
                "])) updates\n"
                "WHERE original.COL1 = updates.COL1 AND original.update_datetime = updates.update_datetime;"
            ),
            mock.call(
                query_str="DELETE FROM `recidiviz-456.us_xx_raw_data.tagC`\n"
                "WHERE STRUCT(COL1) IN (\n"
                "    STRUCT('789')\n"
                ");"),
        ])

        for mock_query_job in mock_query_jobs:
            mock_query_job.result.assert_called_once()
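# A hypothetical sketch of how the UPDATE migration asserted above could be
# assembled from (filter value, update datetime, new value) rows; the real
# migration machinery lives elsewhere in the codebase:
_rows = [("123", "2020-06-10T00:00:00", "456"),
         ("123", "2020-09-21T00:00:00", "456")]
_structs = ",\n".join(
    f"    STRUCT('{col1}' AS COL1, CAST('{dt}' AS DATETIME) AS update_datetime,"
    f" '{new}' AS new__COL1)"
    for col1, dt, new in _rows)
_update_query = (
    "UPDATE `recidiviz-456.us_xx_raw_data.tagC` original\n"
    "SET COL1 = updates.new__COL1\n"
    "FROM (SELECT * FROM UNNEST([\n"
    f"{_structs}\n"
    "])) updates\n"
    "WHERE original.COL1 = updates.COL1"
    " AND original.update_datetime = updates.update_datetime;")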
Example #17
def add_paths_with_tags(controller: GcsfsDirectIngestController,
                        file_tags: List[str],
                        pre_normalize_filename: bool = False,
                        file_type=GcsfsDirectIngestFileType.UNSPECIFIED):
    if not isinstance(controller.fs.gcs_file_system, FakeGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeGCSFileSystem. Found instead "
                         f"type [{type(controller.fs.gcs_file_system)}]")

    for file_tag in file_tags:
        file_path = path_for_fixture_file(
            controller,
            f'{file_tag}.csv',
            should_normalize=pre_normalize_filename,
            file_type=file_type)
        # Only get a fixture path if it is a file; if it is a directory, leave
        # it as None.
        fixture_util.add_direct_ingest_path(controller.fs.gcs_file_system,
                                            file_path)
        time.sleep(0.05)
    def test_import_bq_file_with_row_extra_columns(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagRowExtraColumns.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        with self.assertRaisesRegex(ParserError,
                                    "Expected 4 fields in line 3, saw 5"):
            self.import_manager.import_raw_file_to_big_query(
                file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(0, len(self.fs.gcs_file_system.uploaded_paths))
        self._check_no_temp_files_remain()
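# The message asserted above comes from pandas' C parser when a row has more
# fields than the header. A minimal reproduction, independent of the fixture:
import io
import pandas as pd
from pandas.errors import ParserError

_bad_csv = "COL1,COL2,COL3,COL4\na,b,c,d\na,b,c,d,e\n"
try:
    pd.read_csv(io.StringIO(_bad_csv))
except ParserError as exc:
    print(exc)  # ... Expected 4 fields in line 3, saw 5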
    def test_import_bq_file_multiple_chunks_uneven_division(self):

        self.import_manager.upload_chunk_size = 2

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

        expected_insert_calls = [
            call(source_uri=uploaded_path.uri(),
                 destination_dataset_ref=bigquery.DatasetReference(
                     self.project_id, 'us_xx_raw_data'),
                 destination_table_id='tagPipeSeparatedNonUTF8',
                 destination_table_schema=[
                     bigquery.SchemaField('PRIMARY_COL1', 'STRING',
                                          'NULLABLE'),
                     bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                     bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                     bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                     bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                     bigquery.SchemaField('update_datetime', 'DATETIME',
                                          'REQUIRED')
                 ]) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
        ]

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
            expected_insert_calls, any_order=True)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()
    def test_run_multiple_copies_of_same_tag(self):
        paths = [
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                'tagA_2.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_1),
            self._normalized_path_for_filename(
                'tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_3),
        ]
        for path in paths:
            fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                                path,
                                                has_fixture=False)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
    def test_import_bq_file_with_multibyte_raw_file_alternate_separator_and_encoding(
            self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagDoubleDaggerWINDOWS1252.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagDoubleDaggerWINDOWS1252",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()
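# A sketch of reading a double-dagger-separated WINDOWS-1252 file with pandas;
# the row contents are invented, but '‡' really is the single byte 0x87 in
# that encoding:
import io
import pandas as pd

_raw_bytes = "PRIMARY_COL1‡COL2‡COL3‡COL4\nhello‡wörld‡a‡b\n".encode(
    "WINDOWS-1252")
_df = pd.read_csv(io.BytesIO(_raw_bytes), sep="‡", encoding="WINDOWS-1252")
assert list(_df.columns) == ["PRIMARY_COL1", "COL2", "COL3", "COL4"]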
Example #22
def add_paths_with_tags(
    controller: BaseDirectIngestController,
    file_tags: List[str],
    pre_normalized_file_type: Optional[GcsfsDirectIngestFileType] = None,
) -> None:
    if not isinstance(controller.fs.gcs_file_system, FakeGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeGCSFileSystem. Found instead "
                         f"type [{type(controller.fs.gcs_file_system)}]")

    for file_tag in file_tags:
        file_path = path_for_fixture_file(
            controller,
            f"{file_tag}.csv",
            should_normalize=bool(pre_normalized_file_type),
            file_type=pre_normalized_file_type,
        )
        # Only get a fixture path if it is a file; if it is a directory, leave
        # it as None.
        fixture_util.add_direct_ingest_path(
            controller.fs.gcs_file_system,
            file_path,
            region_code=controller.region_code(),
        )
        time.sleep(0.05)
    def test_import_bq_file_with_migrations(self) -> None:
        file_datetime = migrations_tagC.DATE_1
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=file_datetime,
        )
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.mock_big_query_client.run_query_async.assert_has_calls([
            mock.call(
                query_str=
                f"UPDATE `recidiviz-456.us_xx_raw_data.tagC` SET COL1 = '456' WHERE COL1 = '123' AND update_datetime = '{file_datetime.isoformat()}';"
            ),
            mock.call(
                query_str=
                "DELETE FROM `recidiviz-456.us_xx_raw_data.tagC` WHERE COL1 = '789';"
            ),
        ])
Example #24
    def test_run_full_ingest_all_files(self):
        file_tags = sorted(self.controller.get_file_tag_rank_list())
        file_path = path_for_fixture_file(
            self.controller, 'MDC_VERA_20200303_01.csv', False)
        fixture_util.add_direct_ingest_path(self.controller.fs.gcs_file_system,
                                            file_path)
        process_task_queues(self, self.controller, file_tags)
    def fully_process_file(self,
                           dt: datetime.datetime,
                           path: GcsfsFilePath,
                           file_type_differentiation_on: bool = False) -> None:
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage."""

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            path,
                                            has_fixture=False)

        start_num_total_files = len(self.fs.gcs_file_system.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', None)
        start_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', None)
        if file_type_differentiation_on:
            start_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            start_raw_storage_paths = []
            start_ingest_view_storage_paths = []

        # File is renamed to normalized path
        file_type = GcsfsDirectIngestFileType.RAW_DATA \
            if file_type_differentiation_on else GcsfsDirectIngestFileType.UNSPECIFIED

        self.fs.mv_path_to_normalized_path(path, file_type, dt)

        if file_type_differentiation_on:
            raw_unprocessed = self.fs.get_unprocessed_file_paths(
                self.INGEST_DIR_PATH,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            self.assertEqual(len(raw_unprocessed), 1)
            self.assertTrue(
                self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

            # ... raw file imported to BQ

            processed_path = self.fs.mv_path_to_processed_path(
                raw_unprocessed[0])

            processed = self.fs.get_processed_file_paths(
                self.INGEST_DIR_PATH, None)
            self.assertEqual(len(processed), 1)

            self.fs.copy(
                processed_path,
                GcsfsFilePath.from_absolute_path(
                    to_normalized_unprocessed_file_path_from_normalized_path(
                        processed_path.abs_path(),
                        file_type_override=GcsfsDirectIngestFileType.
                        INGEST_VIEW)))
            self.fs.mv_path_to_storage(processed_path, self.STORAGE_DIR_PATH)

        ingest_unprocessed_filter = GcsfsDirectIngestFileType.INGEST_VIEW if file_type_differentiation_on else None

        ingest_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, file_type_filter=ingest_unprocessed_filter)
        self.assertEqual(len(ingest_unprocessed), 1)
        self.assertTrue(self.fs.is_seen_unprocessed_file(
            ingest_unprocessed[0]))

        # ... file is ingested

        # File is moved to processed path
        self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
        processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH,
                                                     None)
        self.assertEqual(len(processed), 1)
        self.assertTrue(self.fs.is_processed_file(processed[0]))

        unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, None)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        ingest_move_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if file_type_differentiation_on else None

        self.fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            date_str_bound=dt.date().isoformat(),
            include_bound=True,
            file_type_filter=ingest_move_type_filter)

        end_ingest_paths = self.fs._ls_with_file_prefix(self.INGEST_DIR_PATH,
                                                        '',
                                                        file_type_filter=None)
        end_storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                         '',
                                                         file_type_filter=None)
        if file_type_differentiation_on:
            end_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            end_raw_storage_paths = []
            end_ingest_view_storage_paths = []

        # Each file gets re-exported as ingest view
        splitting_factor = 2 if file_type_differentiation_on else 1

        expected_final_total_files = start_num_total_files + splitting_factor - 1
        self.assertEqual(len(self.fs.gcs_file_system.all_paths),
                         expected_final_total_files)
        self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths),
                         len(start_storage_paths) + 1 * splitting_factor)
        if file_type_differentiation_on:
            self.assertEqual(
                len(end_raw_storage_paths) +
                len(end_ingest_view_storage_paths), len(end_storage_paths))
            self.assertEqual(len(end_raw_storage_paths),
                             len(start_raw_storage_paths) + 1)
            self.assertEqual(len(end_ingest_view_storage_paths),
                             len(start_ingest_view_storage_paths) + 1)

        for sp in end_storage_paths:
            parts = filename_parts_from_path(sp)
            if sp.abs_path() not in {
                    p.abs_path()
                    for p in start_storage_paths
            }:
                self.assertTrue(sp.abs_path().startswith(
                    self.STORAGE_DIR_PATH.abs_path()))
                dir_path, storage_file_name = os.path.split(sp.abs_path())
                if parts.file_type != GcsfsDirectIngestFileType.UNSPECIFIED:
                    self.assertTrue(parts.file_type.value in dir_path)
                name, _ = path.file_name.split('.')
                self.assertTrue(name in storage_file_name)
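    # A hypothetical driver for the helper above; the date and bucket path are
    # illustrative only:
    def test_fully_process_file_with_file_types(self) -> None:
        self.fully_process_file(
            datetime.datetime(2020, 1, 1),
            GcsfsFilePath.from_absolute_path("gs://my-ingest-bucket/tagA.csv"),
            file_type_differentiation_on=True)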