    def test_get_unprocessed_raw_files_to_import(self) -> None:
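        """Only raw data files are returned as unprocessed; ingest view files are ignored."""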
        self.assertEqual(
            [], self.import_manager.get_unprocessed_raw_files_to_import())

        raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_second.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            raw_unprocessed,
                                            has_fixture=False)
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            ingest_view_unprocessed,
                                            has_fixture=False)

        self.assertEqual(
            [raw_unprocessed],
            self.import_manager.get_unprocessed_raw_files_to_import())

    def test_import_wrong_separator_cols_do_not_parse(self) -> None:
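        """A config with the wrong separator collapses all columns into one and fails to parse."""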
        file_config = self.import_manager.region_raw_file_config.raw_file_configs[
            "tagC"]
        updated_file_config = attr.evolve(file_config, separator="#")
        self.import_manager.region_raw_file_config.raw_file_configs[
            "tagC"] = updated_file_config

        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        with self.assertRaises(ValueError) as e:
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestRawFileMetadata))
        self.assertTrue(
            str(e.exception).startswith(
                "Found only one column: [COL1__COL2_COL3]. "
                "Columns likely did not parse properly."))

    def test_import_bq_file_with_raw_file_alternate_separator_and_encoding(
        self,
    ) -> None:
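        """Imports a pipe-separated, non-UTF-8 raw file and verifies the uploaded schema."""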
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagPipeSeparatedNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagPipeSeparatedNonUTF8",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_invalid_column_chars(self) -> None:
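        """Column names with invalid characters are normalized before upload to BigQuery."""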
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="tagInvalidCharacters.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagInvalidCharacters",
            destination_table_schema=[
                bigquery.SchemaField("COL_1", "STRING", "NULLABLE"),
                bigquery.SchemaField("_COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("_3COL", "STRING", "NULLABLE"),
                bigquery.SchemaField("_4_COL", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(1, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_unspecified_type_file(self) -> None:
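        """Importing a file with an UNSPECIFIED file type raises a ValueError."""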
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.UNSPECIFIED,
        )

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestRawFileMetadata))

    def test_import_bq_file_with_ingest_view_file(self) -> None:
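        """Importing an ingest view file through the raw file importer raises a ValueError."""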
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        with self.assertRaises(ValueError) as e:
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestRawFileMetadata))
        self.assertEqual(
            str(e.exception),
            "Unexpected file type [GcsfsDirectIngestFileType.INGEST_VIEW] for "
            "path [file_tag_first].",
        )

    def test_import_bq_file_not_in_tags(self) -> None:
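        """Importing a file whose tag is missing from the region config raises a ValueError."""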
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="this_path_tag_not_in_yaml.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        with self.assertRaises(ValueError) as e:
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestRawFileMetadata))
        self.assertEqual(
            str(e.exception),
            "Attempting to import raw file with tag [this_path_tag_not_in_yaml] "
            "unspecified by [us_xx] config.",
        )

    def test_import_bq_file_with_raw_file_normalization_conflict(self) -> None:
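        """Two columns that normalize to the same name raise a ValueError."""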
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagNormalizationConflict.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        with self.assertRaises(ValueError) as e:
            self.import_manager.import_raw_file_to_big_query(
                file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(
            str(e.exception),
            "Multiple columns with name [_4COL] after normalization.")

    def test_import_bq_file_multiple_chunks_uneven_division(self) -> None:
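        """Five data rows with a chunk size of 2 should upload as three chunks."""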

        self.import_manager.upload_chunk_size = 2

        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagPipeSeparatedNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

        expected_insert_calls = [
            call(
                source_uri=uploaded_path.uri(),
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, "us_xx_raw_data"),
                destination_table_id="tagPipeSeparatedNonUTF8",
                destination_table_schema=[
                    bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                    bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                    bigquery.SchemaField("update_datetime", "DATETIME",
                                         "REQUIRED"),
                ],
            ) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
        ]

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
            expected_insert_calls, any_order=True)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_migrations(self) -> None:
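        """Raw data migration UPDATE and DELETE queries run against BigQuery after import."""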
        file_datetime = migrations_tagC.DATE_1
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=file_datetime,
        )
        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        mock_query_jobs = [
            mock.MagicMock(),
            mock.MagicMock(),
        ]

        self.mock_big_query_client.run_query_async.side_effect = mock_query_jobs

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.mock_big_query_client.run_query_async.assert_has_calls([
            mock.call(
                query_str="UPDATE `recidiviz-456.us_xx_raw_data.tagC` original\n"
                "SET COL1 = updates.new__COL1\n"
                "FROM (SELECT * FROM UNNEST([\n"
                "    STRUCT('123' AS COL1, CAST('2020-06-10T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1),\n"
                "    STRUCT('123' AS COL1, CAST('2020-09-21T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1)\n"
                "])) updates\n"
                "WHERE original.COL1 = updates.COL1 AND original.update_datetime = updates.update_datetime;"
            ),
            mock.call(
                query_str="DELETE FROM `recidiviz-456.us_xx_raw_data.tagC`\n"
                "WHERE STRUCT(COL1) IN (\n"
                "    STRUCT('789')\n"
                ");"),
        ])

        for mock_query_job in mock_query_jobs:
            mock_query_job.result.assert_called_once()

    def test_import_bq_file_feature_not_released_throws(self) -> None:
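        """Imports raise a ValueError when raw data BQ imports are disabled for the region."""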
        self.import_manager = DirectIngestRawFileImportManager(
            region=fake_region(region_code="us_xx",
                               are_raw_data_bq_imports_enabled_in_env=False),
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_row_extra_columns(self) -> None:
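        """A row with more fields than the config specifies fails with a ParserError."""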
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagRowExtraColumns.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        with self.assertRaisesRegex(ParserError,
                                    "Expected 4 fields in line 3, saw 5"):
            self.import_manager.import_raw_file_to_big_query(
                file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(0, len(self.fs.gcs_file_system.uploaded_paths))
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_multibyte_raw_file_alternate_separator_and_encoding(
        self,
    ) -> None:
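        """Imports a WINDOWS-1252 encoded raw file with a multibyte (double-dagger) separator."""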
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagDoubleDaggerWINDOWS1252.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagDoubleDaggerWINDOWS1252",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()