def test_handle_file_start_ingest_unsupported_region(
            self, mock_region, mock_environment, mock_fs_factory_cls, client):
        """Requesting ingest for a region whose configured environment
        ('staging') does not match the running environment ('production')
        must raise DirectIngestError rather than start ingest."""
        region_code = 'us_nd'

        # Environment mismatch: server runs in production, region is
        # configured for staging only.
        mock_environment.return_value = 'production'
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging')

        fake_fs = FakeDirectIngestGCSFileSystem()
        fake_fs.test_add_path('bucket-us-nd/elite_offenders.csv')
        mock_fs_factory_cls.build.return_value = fake_fs

        request_args = {
            'region': region_code,
            'bucket': 'bucket-us-nd',
            'relative_file_path': 'elite_offenders.csv',
            'start_ingest': 'true',
        }
        headers = {'X-Appengine-Cron': "test-cron"}

        # NOTE: a status-code assertion previously followed the call inside
        # this block; since the call is expected to raise, that assertion was
        # unreachable dead code and has been removed.
        with pytest.raises(DirectIngestError):
            client.get('/handle_direct_ingest_file',
                       query_string=request_args,
                       headers=headers)

        mock_region.assert_called_with('us_nd', is_direct_ingest=True)
    def test_move_to_storage_with_conflict(self):
        """Fully processing two files with the same name must keep both in
        storage, the second under a '-(1)'-suffixed file name."""
        fs = FakeDirectIngestGCSFileSystem()
        now = datetime.datetime.now()
        self.fully_process_file(
            fs, now,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            fs, now,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # pylint: disable=protected-access
        stored = fs._ls_with_file_prefix(self.STORAGE_DIR_PATH, '')
        self.assertEqual(2, len(stored))

        stored_abs_paths = [p.abs_path() for p in stored]
        self.assertTrue(
            any(s.endswith('test_file.csv') for s in stored_abs_paths))
        self.assertTrue(
            any(s.endswith('test_file-(1).csv') for s in stored_abs_paths))
    def test_direct_ingest_multiple_file_moves(self):
        """Two differently-named files can each be fully processed through
        the same fake file system without interfering with each other."""
        fs = FakeDirectIngestGCSFileSystem()
        for blob_name in ('test_file.csv', 'test_file_2.csv'):
            self.fully_process_file(
                fs, datetime.datetime.now(),
                GcsfsFilePath(bucket_name='my_bucket', blob_name=blob_name))
    def test_handle_file_no_start_ingest(self, mock_fs_factory_cls, client):
        """With start_ingest=false the endpoint records the file and returns
        200 without kicking off ingest."""
        fake_fs = FakeDirectIngestGCSFileSystem()
        fake_fs.test_add_path('bucket-us-nd/Elite_Offenders.csv')
        mock_fs_factory_cls.build.return_value = fake_fs

        response = client.get(
            '/handle_direct_ingest_file',
            query_string={
                'region': 'us_nd',
                'bucket': 'bucket-us-nd',
                'relative_file_path': 'Elite_Offenders.csv',
                'start_ingest': 'false',
            },
            headers={'X-Appengine-Cron': "test-cron"})

        # Even though the region isn't supported, we don't crash
        assert response.status_code == 200
    def fully_process_file(self, test_fs: FakeDirectIngestGCSFileSystem,
                           dt: datetime.datetime, path: GcsfsFilePath):
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage.

        Args:
            test_fs: the fake GCS file system under test.
            dt: timestamp used for path normalization and as the upper date
                bound when sweeping processed files to storage.
            path: the raw (un-normalized) ingest-bucket path for the file.
        """

        test_fs.test_add_path(path)

        start_num_total_files = len(test_fs.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = test_fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '')
        start_storage_paths = test_fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '')

        # File is renamed to normalized path
        test_fs.mv_path_to_normalized_path(path, dt)

        unprocessed = test_fs.get_unprocessed_file_paths(self.INGEST_DIR_PATH)
        self.assertEqual(len(unprocessed), 1)
        self.assertTrue(test_fs.is_seen_unprocessed_file(unprocessed[0]))

        # ... file is processed

        # File is moved to processed path
        test_fs.mv_path_to_processed_path(unprocessed[0])
        processed = test_fs.get_processed_file_paths(self.INGEST_DIR_PATH)
        self.assertEqual(len(processed), 1)
        self.assertTrue(test_fs.is_processed_file(processed[0]))

        unprocessed = test_fs.get_unprocessed_file_paths(self.INGEST_DIR_PATH)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        test_fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            dt.date().isoformat(),
            include_bound=True)

        end_ingest_paths = test_fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '')
        end_storage_paths = test_fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '')

        # Moving to storage must not create or drop files overall: exactly
        # one leaves the ingest bucket and exactly one lands in storage.
        self.assertEqual(len(test_fs.all_paths), start_num_total_files)
        self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths), len(start_storage_paths) + 1)

        # Hoisted out of the loop: the set of pre-existing storage paths.
        previously_stored = {p.abs_path() for p in start_storage_paths}
        # str.partition never raises, unlike the 2-tuple unpack of
        # str.split('.') it replaces, which would raise ValueError for file
        # names with zero or more than one dot. Same result for 'name.csv'.
        base_name, _, _ = path.file_name.partition('.')
        for sp in end_storage_paths:
            if sp.abs_path() in previously_stored:
                continue
            # The newly stored file must live under the storage dir and keep
            # the original file's base name.
            self.assertTrue(sp.abs_path().startswith(
                self.STORAGE_DIR_PATH.abs_path()))
            _, storage_file_name = os.path.split(sp.abs_path())
            self.assertTrue(base_name in storage_file_name)
 def setUp(self) -> None:
     """Creates a fresh fake file system and a prioritizer watching it."""
     fake_fs = FakeDirectIngestGCSFileSystem()
     self.fs = fake_fs
     self.prioritizer = GcsfsDirectIngestJobPrioritizer(
         fake_fs, self._INGEST_BUCKET_PATH, ['tagA', 'tagB'])
class TestGcsfsDirectIngestJobPrioritizer(unittest.TestCase):
    """Tests for the GcsfsDirectIngestJobPrioritizer."""

    # Two timestamps on day 1 that differ only in the microsecond field --
    # used to exercise ordering of files created very close together.
    _DAY_1_TIME_1 = datetime.datetime(year=2019,
                                      month=1,
                                      day=2,
                                      hour=3,
                                      minute=4,
                                      second=5,
                                      microsecond=6789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_2 = datetime.datetime(year=2019,
                                      month=1,
                                      day=2,
                                      hour=3,
                                      minute=4,
                                      second=5,
                                      microsecond=7789,
                                      tzinfo=datetime.timezone.utc)

    # Later the same day.
    _DAY_1_TIME_3 = datetime.datetime(year=2019,
                                      month=1,
                                      day=2,
                                      hour=10,
                                      minute=4,
                                      second=5,
                                      microsecond=678,
                                      tzinfo=datetime.timezone.utc)

    # The following day, same time of day as _DAY_1_TIME_1.
    _DAY_2_TIME_1 = datetime.datetime(year=2019,
                                      month=1,
                                      day=3,
                                      hour=3,
                                      minute=4,
                                      second=5,
                                      microsecond=6789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1 = _DAY_1_TIME_1.date()
    _DAY_2 = _DAY_2_TIME_1.date()

    _INGEST_BUCKET_PATH = \
        GcsfsDirectoryPath.from_absolute_path('direct/regions/us_nd/fixtures')

    def setUp(self) -> None:
        # Prioritizer watches the fake bucket for two expected file tags.
        self.fs = FakeDirectIngestGCSFileSystem()
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs, self._INGEST_BUCKET_PATH, ['tagA', 'tagB'])

    # NOTE(review): appears unused within this class (duplicates the value of
    # _INGEST_BUCKET_PATH) -- possibly referenced elsewhere; confirm before
    # removing.
    FIXTURE_PATH_PREFIX = 'direct/regions/us_nd/fixtures'

    def _normalized_path_for_filename(self, filename: str,
                                      dt: datetime.datetime) -> GcsfsFilePath:
        """Builds the normalized unprocessed-file path for |filename| as if
        it had landed in the ingest bucket at time |dt|."""
        normalized_path = \
            to_normalized_unprocessed_file_path(
                os.path.join(self._INGEST_BUCKET_PATH.abs_path(),
                             filename), dt)
        return GcsfsFilePath.from_absolute_path(normalized_path)

    def _process_jobs_for_paths_with_no_gaps_in_expected_order(
            self, paths: List[GcsfsFilePath]):
        """Asserts the prioritizer hands out jobs exactly in |paths| order,
        each one marked as expected, processing each file in turn."""
        for path in paths:
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            if next_job_args is None:
                # Make mypy happy
                self.fail()
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

    def test_empty_fs(self):
        """Empty bucket: no job to run, but jobs still expected for the day."""
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1_TIME_1.date().isoformat()))
        self.assertIsNone(self.prioritizer.get_next_job_args())

    def test_single_expected_file(self):
        """One of two expected tags present: it runs, tagB stays expected."""
        path = self._normalized_path_for_filename('tagA.csv',
                                                  self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order([path])

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagB
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files(self):
        """Both expected tags present for day 1: after processing both, no
        more jobs are expected for that day."""
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2)
        ]

        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_unexpected_file(self):
        """A tagB file with no preceding tagA file still runs, but is flagged
        as unexpected."""
        # Only file is out of order
        path = self._normalized_path_for_filename('tagB.csv',
                                                  self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        # NOTE(review): unlike the helper above, there is no `is None` guard
        # before this dereference -- mypy may flag it; consider matching the
        # guard style used in
        # _process_jobs_for_paths_with_no_gaps_in_expected_order.
        self.assertEqual(next_job_args.file_path, path)
        self.assertFalse(
            self.prioritizer.are_next_args_expected(next_job_args))

        # ... job runs eventually even though unexpected...

        self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagA
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_files_on_multiple_days(self):
        """Day 1 is complete after processing; day 2 (tagA only) still
        expects its tagB file."""
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv', self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_files_on_multiple_days_with_gap(self):
        """Runs a test where there are files on multiple days and there is a gap
        in the expected files for the first day.
        """
        paths = [
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv', self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)

            # The day-1 tagB file is unexpected (tagA is missing); the day-2
            # tagA file is expected.
            are_args_expected = \
                self.prioritizer.are_next_args_expected(next_job_args)
            if i == 0:
                self.assertFalse(are_args_expected)
            else:
                self.assertTrue(are_args_expected)

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_multiple_files_same_tag(self):
        """Two tagA files on the same day both run in creation-time order."""
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files_times_out_of_order(self):
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            # Only after the last file is handed out does the day close.
            are_more_jobs_expected = \
                self.prioritizer.are_more_jobs_expected_for_day(date_str)
            if i == 2:
                self.assertFalse(are_more_jobs_expected)
            else:
                self.assertTrue(are_more_jobs_expected)

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_run_multiple_copies_of_same_tag(self):
        """A 'tagA_2' file counts toward the tagA tag; all three files run
        and the day completes."""
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA_2.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
# ----- Example #8 (separator left over from the scraped source) -----
 def setup_method(self, _test_method):
     # pytest-style per-test setup: builds a fake file system and a
     # prioritizer over it, like the unittest setUp variants in this file,
     # but reading the bucket from _INGEST_DIRECTORY_PATH.
     # NOTE(review): this fragment sits at the end of the visible chunk and
     # may be truncated -- confirm against the full file.
     self.fs = FakeDirectIngestGCSFileSystem()
     self.prioritizer = GcsfsDirectIngestJobPrioritizer(
         self.fs, self._INGEST_DIRECTORY_PATH, ['tagA', 'tagB'])