Example #1
    def test_ingest_view_export(self, mock_supported, mock_region,
                                mock_environment):
        mock_supported.return_value = ['us_xx']

        region_code = 'us_xx'

        mock_environment.return_value = 'staging'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='my_ingest_view',
            upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
            upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30))

        request_args = {
            'region': region_code,
        }
        body = {
            'cloud_task_args': export_args.to_serializable(),
            'args_type': 'GcsfsIngestViewExportArgs',
        }
        body_encoded = json.dumps(body).encode()

        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.post('/ingest_view_export',
                                    query_string=request_args,
                                    headers=headers,
                                    data=body_encoded)
        self.assertEqual(200, response.status_code)
        mock_controller.do_ingest_view_export.assert_called_with(export_args)

    def test_get_ingest_view_metadata_pending_export_all_exported_in_region(
            self):
        args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))
        args_other_region = GcsfsIngestViewExportArgs(
            ingest_view_name='other_file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))
        with freeze_time('2015-01-02T03:06:06'):
            self.metadata_manager.register_ingest_file_export_job(args)
            self.metadata_manager_other_region.register_ingest_file_export_job(
                args_other_region)

        with freeze_time('2015-01-02T03:07:07'):
            path = self._make_unprocessed_path(
                'bucket/file_tag.csv',
                file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
                dt=datetime.datetime.utcnow())
            metadata = self.metadata_manager.get_ingest_view_metadata_for_export_job(
                args)
            self.metadata_manager.register_ingest_view_export_file_name(
                metadata, path)
            # ... the export is actually performed here
            self.metadata_manager.mark_ingest_view_exported(metadata)

        self.assertEqual(
            [],
            self.metadata_manager.get_ingest_view_metadata_pending_export())

    def test_get_ingest_view_metadata_for_most_recent_valid_job(self):
        with freeze_time('2015-01-02T03:05:05'):
            self.metadata_manager.register_ingest_file_export_job(
                GcsfsIngestViewExportArgs(
                    ingest_view_name='file_tag',
                    upper_bound_datetime_prev=None,
                    upper_bound_datetime_to_export=datetime.datetime(
                        2015, 1, 2, 2, 2, 2, 2)))

        with freeze_time('2015-01-02T03:06:06'):
            self.metadata_manager.register_ingest_file_export_job(
                GcsfsIngestViewExportArgs(
                    ingest_view_name='file_tag',
                    upper_bound_datetime_prev=datetime.datetime(
                        2015, 1, 2, 2, 2, 2, 2),
                    upper_bound_datetime_to_export=datetime.datetime(
                        2015, 1, 2, 3, 3, 3, 3)))

        with freeze_time('2015-01-02T03:07:07'):
            self.metadata_manager.register_ingest_file_export_job(
                GcsfsIngestViewExportArgs(
                    ingest_view_name='another_tag',
                    upper_bound_datetime_prev=datetime.datetime(
                        2015, 1, 2, 3, 3, 3, 3),
                    upper_bound_datetime_to_export=datetime.datetime(
                        2015, 1, 2, 3, 4, 4, 4)))

        most_recent_valid_job = self.metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job(
            'file_tag')

        self.assertIsNotNone(most_recent_valid_job)
        self.assertEqual('file_tag', most_recent_valid_job.file_tag)
        self.assertEqual(
            datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
            most_recent_valid_job.datetimes_contained_lower_bound_exclusive)
        self.assertEqual(
            datetime.datetime(2015, 1, 2, 3, 3, 3, 3),
            most_recent_valid_job.datetimes_contained_upper_bound_inclusive)

        # Invalidate the row that was just returned
        session = SessionFactory.for_schema_base(OperationsBase)
        results = session.query(
            schema.DirectIngestIngestFileMetadata).filter_by(
                file_id=most_recent_valid_job.file_id).all()
        result = one(results)
        result.is_invalidated = True
        session.commit()

        most_recent_valid_job = self.metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job(
            'file_tag')
        self.assertIsNotNone(most_recent_valid_job)
        self.assertEqual('file_tag', most_recent_valid_job.file_tag)
        self.assertEqual(
            None,
            most_recent_valid_job.datetimes_contained_lower_bound_exclusive)
        self.assertEqual(
            datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
            most_recent_valid_job.datetimes_contained_upper_bound_inclusive)

    def test_create_direct_ingest_ingest_view_export_task(
            self, mock_client: mock.MagicMock,
            mock_uuid: mock.MagicMock) -> None:
        # Arrange
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="my_ingest_view",
            output_bucket_name="my_ingest_bucket",
            upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
            upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30),
        )
        body = {
            "cloud_task_args": export_args.to_serializable(),
            "args_type": "GcsfsIngestViewExportArgs",
        }
        body_encoded = json.dumps(body).encode()
        uuid = "random-uuid"
        mock_uuid.uuid4.return_value = uuid
        date = "2019-07-20"
        queue_path = f"{DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2}-path"

        task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + "/{}-{}-{}".format(
            _REGION.region_code, date, uuid)
        url_params = {
            "region": _REGION.region_code,
            "output_bucket": "my_ingest_bucket",
        }
        task = tasks_v2.types.task_pb2.Task(
            name=task_name,
            app_engine_http_request={
                "http_method": "POST",
                "relative_uri":
                f"/direct/ingest_view_export?{urlencode(url_params)}",
                "body": body_encoded,
            },
        )

        mock_client.return_value.task_path.return_value = task_name
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        DirectIngestCloudTaskManagerImpl(
        ).create_direct_ingest_ingest_view_export_task(
            _REGION, DirectIngestInstance.PRIMARY, export_args)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            self.mock_project_id, QUEUES_REGION,
            DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)

    def test_ingest_view_file_same_args_after_invalidation(self):
        args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))

        ingest_view_unprocessed_path = self._make_unprocessed_path(
            'bucket/file_tag.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        self.run_ingest_view_file_progression(args, self.metadata_manager,
                                              ingest_view_unprocessed_path)

        # Invalidate the previous row
        session = SessionFactory.for_schema_base(OperationsBase)
        results = session.query(schema.DirectIngestIngestFileMetadata).all()
        result = one(results)
        result.is_invalidated = True
        session.commit()

        # Now we can rerun with the same args
        ingest_view_unprocessed_path = self._make_unprocessed_path(
            'bucket/file_tag.csv',
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime.now())
        self.run_ingest_view_file_progression(args, self.metadata_manager,
                                              ingest_view_unprocessed_path)

    def test_get_ingest_view_metadata_pending_export_basic(self):
        args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))
        with freeze_time('2015-01-02T03:06:06'):
            self.metadata_manager.register_ingest_file_export_job(args)

        expected_list = [
            DirectIngestIngestFileMetadata.new_with_defaults(
                region_code='US_XX',
                file_tag='file_tag',
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=datetime.datetime(2015, 1, 2, 3, 6, 6),
                datetimes_contained_lower_bound_exclusive=datetime.datetime(
                    2015, 1, 2, 2, 2, 2, 2),
                datetimes_contained_upper_bound_inclusive=datetime.datetime(
                    2015, 1, 2, 3, 3, 3, 3))
        ]

        self.assertEqual(
            expected_list,
            self.metadata_manager.get_ingest_view_metadata_pending_export())
Example #7
    def test_ingest_view_file_progression_same_args_twice_throws(self) -> None:
        args = GcsfsIngestViewExportArgs(
            ingest_view_name="file_tag",
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3),
        )

        ingest_view_unprocessed_path = self._make_unprocessed_path(
            "bucket/file_tag.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
        self.run_ingest_view_file_progression(args, self.metadata_manager,
                                              ingest_view_unprocessed_path)

        with self.assertRaises(IntegrityError):
            ingest_view_unprocessed_path = self._make_unprocessed_path(
                "bucket/file_tag.csv",
                GcsfsDirectIngestFileType.INGEST_VIEW,
                dt=datetime.datetime.now(),
            )
            self.run_ingest_view_file_progression(
                args, self.metadata_manager, ingest_view_unprocessed_path)

        session = SessionFactory.for_schema_base(OperationsBase)
        results = session.query(schema.DirectIngestIngestFileMetadata).all()
        self.assertEqual(1, len(results))

    def create_direct_ingest_ingest_view_export_task(
        self,
        region: Region,
        ingest_instance: DirectIngestInstance,
        ingest_view_export_args: GcsfsIngestViewExportArgs,
    ) -> None:
        task_id = _build_task_id(
            region.region_code,
            ingest_instance,
            task_id_tag=ingest_view_export_args.task_id_tag(),
            prefix_only=False,
        )
        params = {
            "region": region.region_code.lower(),
            "output_bucket": ingest_view_export_args.output_bucket_name,
        }
        relative_uri = f"/direct/ingest_view_export?{urlencode(params)}"

        body = self._get_body_from_args(ingest_view_export_args)

        self._get_bq_import_export_queue_manager(region,
                                                 ingest_instance).create_task(
                                                     task_id=task_id,
                                                     relative_uri=relative_uri,
                                                     body=body,
                                                 )

    def get_ingest_view_export_task_args(
            self) -> List[GcsfsIngestViewExportArgs]:
        """Looks at what files have been exported for a given region and returns args for all the export jobs that
        should be started, given what has updated in the raw data tables since the last time we exported data. Also
        returns any tasks that have not yet completed.
        """
        if not self.region.are_ingest_view_exports_enabled_in_env():
            raise ValueError(
                f'Ingest view exports not enabled for region [{self.region.region_code}]'
            )

        logging.info('Gathering export state for each ingest tag')
        ingest_view_to_export_state = {}
        for ingest_view_tag, ingest_view in self.ingest_views_by_tag.items():
            export_state = self._get_export_state_for_ingest_view(ingest_view)
            self._validate_ascending_raw_file_update_dates(export_state)
            ingest_view_to_export_state[ingest_view_tag] = export_state
        logging.info('Done gathering export state for each ingest tag')

        # At this point we know that we have no new raw data backfills that should invalidate either pending or past
        # completed ingest view exports (checked in _validate_ascending_raw_file_update_dates()). We can now generate
        # any new jobs.

        jobs_to_schedule = []
        metadata_pending_export = self.file_metadata_manager.get_ingest_view_metadata_pending_export(
        )
        if metadata_pending_export:
            args_list = self._export_args_from_metadata(
                metadata_pending_export)
            jobs_to_schedule.extend(args_list)

        logging.info('Found [%s] already pending jobs to schedule.',
                     len(jobs_to_schedule))

        logging.info('Generating new ingest jobs.')
        for ingest_view_tag, export_state in ingest_view_to_export_state.items(
        ):
            lower_bound_datetime_exclusive = \
                export_state.last_export_metadata.datetimes_contained_upper_bound_inclusive \
                if export_state.last_export_metadata else None

            ingest_args_list = []
            for _date, upper_bound_datetime_inclusive in export_state.max_update_datetime_by_date:
                args = GcsfsIngestViewExportArgs(
                    ingest_view_name=ingest_view_tag,
                    upper_bound_datetime_prev=lower_bound_datetime_exclusive,
                    upper_bound_datetime_to_export=
                    upper_bound_datetime_inclusive)
                logging.info('Generating job args for tag [%s]: [%s].',
                             ingest_view_tag, args)

                self.file_metadata_manager.register_ingest_file_export_job(
                    args)
                ingest_args_list.append(args)
                lower_bound_datetime_exclusive = upper_bound_datetime_inclusive

            jobs_to_schedule.extend(ingest_args_list)

        logging.info('Returning [%s] jobs to schedule.', len(jobs_to_schedule))
        return jobs_to_schedule
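
The loop above chains each generated job's lower bound to the previous job's upper bound, starting from the last completed export (or None if nothing has been exported). A minimal standalone sketch of that windowing logic, with a hypothetical helper name and dates:

# Hypothetical illustration of the chained-bounds loop above; not taken from
# the original codebase. update_datetimes is assumed to already be in
# ascending order, as validated upstream by
# _validate_ascending_raw_file_update_dates() in the snippet above.
def _export_windows(last_export_upper_bound, update_datetimes):
    lower_bound = last_export_upper_bound  # None if nothing has been exported yet
    windows = []
    for upper_bound in update_datetimes:
        windows.append((lower_bound, upper_bound))
        lower_bound = upper_bound  # the next window starts where this one ends
    return windows

# _export_windows(None, [datetime.datetime(2015, 1, 2), datetime.datetime(2015, 1, 3)])
# returns [(None, datetime(2015, 1, 2)), (datetime(2015, 1, 2), datetime(2015, 1, 3))]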
Example #10
def _export_args_from_metadata(
        metadata_list: List[DirectIngestIngestFileMetadata]) -> List[GcsfsIngestViewExportArgs]:
    return [GcsfsIngestViewExportArgs(
        ingest_view_name=metadata.file_tag,
        upper_bound_datetime_prev=metadata.datetimes_contained_lower_bound_exclusive,
        upper_bound_datetime_to_export=metadata.datetimes_contained_upper_bound_inclusive
    ) for metadata in metadata_list]
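
A minimal usage sketch pairing this helper with the pending-export query exercised in the tests above; file_metadata_manager is a stand-in for a metadata manager like self.metadata_manager in those tests:

# Hypothetical usage; assumes file_metadata_manager behaves like the managers
# in the test snippets above.
pending = file_metadata_manager.get_ingest_view_metadata_pending_export()
for args in _export_args_from_metadata(pending):
    # Each args object carries the (exclusive, inclusive] datetime window
    # recorded on its metadata row.
    logging.info('Pending export job to reschedule: %s', args)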
Example #11
    def test_ingest_view_file_progression_two_files_same_tag(self) -> None:

        args = GcsfsIngestViewExportArgs(
            ingest_view_name="file_tag",
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
        )

        ingest_view_unprocessed_path_1 = self._make_unprocessed_path(
            "bucket/file_tag.csv",
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        )
        self.run_ingest_view_file_progression(args, self.metadata_manager,
                                              ingest_view_unprocessed_path_1)

        args = GcsfsIngestViewExportArgs(
            ingest_view_name="file_tag",
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 3, 3, 3, 3, 3),
        )

        ingest_view_unprocessed_path_2 = self._make_unprocessed_path(
            "bucket/file_tag.csv",
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime(2015, 1, 3, 3, 3, 3, 3),
        )
        self.run_ingest_view_file_progression(args, self.metadata_manager,
                                              ingest_view_unprocessed_path_2)

        session = SessionFactory.for_schema_base(OperationsBase)
        results = session.query(schema.DirectIngestIngestFileMetadata).all()

        self.assertEqual(
            {
                ingest_view_unprocessed_path_1.file_name,
                ingest_view_unprocessed_path_2.file_name,
            },
            {r.normalized_file_name
             for r in results},
        )
        for r in results:
            self.assertTrue(r.export_time)
            self.assertTrue(r.processed_time)
Example #12
    def test_create_direct_ingest_ingest_view_export_task(
            self, mock_client, mock_uuid):
        # Arrange
        project_id = 'recidiviz-456'
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='my_ingest_view',
            upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
            upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30))
        body = {
            'cloud_task_args': export_args.to_serializable(),
            'args_type': 'GcsfsIngestViewExportArgs'
        }
        body_encoded = json.dumps(body).encode()
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid
        date = '2019-07-20'
        queue_path = _REGION.shared_queue + '-path'

        task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + '/{}-{}-{}'.format(
            _REGION.region_code, date, uuid)
        task = tasks_v2.types.task_pb2.Task(
            name=task_name,
            app_engine_http_request={
                'http_method': 'POST',
                'relative_uri':
                f'/direct/ingest_view_export?region={_REGION.region_code}',
                'body': body_encoded
            })

        mock_client.return_value.task_path.return_value = task_name
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        DirectIngestCloudTaskManagerImpl(
            project_id=project_id
        ).create_direct_ingest_ingest_view_export_task(_REGION, export_args)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)

    def test_ingest_view_export(
        self,
        mock_supported: mock.MagicMock,
        mock_region: mock.MagicMock,
        mock_environment: mock.MagicMock,
    ) -> None:
        mock_supported.return_value = ["us_xx"]

        region_code = "us_xx"

        mock_environment.return_value = "staging"
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment="staging",
                                               ingestor=mock_controller)

        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="my_ingest_view",
            upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
            upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30),
        )

        request_args = {
            "region": region_code,
        }
        body = {
            "cloud_task_args": export_args.to_serializable(),
            "args_type": "GcsfsIngestViewExportArgs",
        }
        body_encoded = json.dumps(body).encode()

        headers = {"X-Appengine-Cron": "test-cron"}

        response = self.client.post(
            "/ingest_view_export",
            query_string=request_args,
            headers=headers,
            data=body_encoded,
        )
        self.assertEqual(200, response.status_code)
        mock_controller.do_ingest_view_export.assert_called_with(export_args)

    def test_ingest_view_file_progression(self):
        args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))

        ingest_view_unprocessed_path = self._make_unprocessed_path(
            'bucket/file_tag.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        self.run_ingest_view_file_progression(args, self.metadata_manager,
                                              ingest_view_unprocessed_path)
Example #15
    def test_gcsfs_ingest_view_export_args(self) -> None:
        dt_lower = datetime.datetime(2019, 1, 22, 11, 22, 33, 444444)
        dt_upper = datetime.datetime(2019, 11, 22, 11, 22, 33, 444444)

        args = GcsfsIngestViewExportArgs(
            ingest_view_name="my_file_tag",
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=dt_upper,
        )

        self.assertEqual(
            "ingest_view_export_my_file_tag-None-2019_11_22_11_22_33_444444",
            args.task_id_tag(),
        )

        args = GcsfsIngestViewExportArgs(
            ingest_view_name="my_file_tag",
            upper_bound_datetime_prev=dt_lower,
            upper_bound_datetime_to_export=dt_upper,
        )

        self.assertEqual(
            "ingest_view_export_my_file_tag-2019_01_22_11_22_33_444444-2019_11_22_11_22_33_444444",
            args.task_id_tag(),
        )
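
Judging from the two assertions above, the tag concatenates "ingest_view_export_", the view name, and the two bounds formatted as %Y_%m_%d_%H_%M_%S_%f, with a missing lower bound rendered as the literal string "None". A sketch that reproduces the expected strings; the format is inferred from the test, not taken from the task_id_tag() implementation:

# Inferred from the assertions above; the real task_id_tag() may differ.
def _expected_task_id_tag(ingest_view_name, lower, upper):
    fmt = "%Y_%m_%d_%H_%M_%S_%f"
    lower_str = lower.strftime(fmt) if lower else "None"
    return f"ingest_view_export_{ingest_view_name}-{lower_str}-{upper.strftime(fmt)}"

# _expected_task_id_tag("my_file_tag", None, dt_upper)
# == "ingest_view_export_my_file_tag-None-2019_11_22_11_22_33_444444"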

    def run_parse_file_test(self, expected: IngestInfo,
                            fixture_file_name: str) -> IngestInfo:
        """Runs a test that reads and parses a given fixture file. Returns the
        parsed IngestInfo object for tests to run further validations."""
        args = ingest_args_for_fixture_file(self.controller,
                                            f"{fixture_file_name}.csv")

        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.is_ingest_launched_in_env():
            now = datetime.datetime.now()
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=fixture_file_name,
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
                output_bucket_name=self.controller.ingest_bucket_path.
                bucket_name,
            )

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            fixture_util.add_direct_ingest_path(
                self.controller.fs.gcs_file_system,
                args.file_path,
                region_code=self.controller.region_code(),
            )

        # pylint:disable=protected-access
        fixture_contents_handle = self.controller._get_contents_handle(args)

        if fixture_contents_handle is None:
            self.fail("fixture_contents_handle should not be None")
        final_info = self.controller._parse(args, fixture_contents_handle)

        print_visible_header_label("FINAL")
        print(final_info)

        print_visible_header_label("EXPECTED")
        print(expected)

        self.assertEqual(expected, final_info)

        return final_info
Example #17
    def test_exportViewForArgs_detectRowDeletionView_noLowerBound(
            self) -> None:
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(
            region, is_detect_row_deletion_view=True)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="ingest_view",
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=_DATE_2,
        )

        session = SessionFactory.for_schema_base(OperationsBase)
        metadata = schema.DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag=export_args.ingest_view_name,
            normalized_file_name="normalized_file_name",
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_1,
            export_time=None,
            datetimes_contained_lower_bound_exclusive=export_args.
            upper_bound_datetime_prev,
            datetimes_contained_upper_bound_inclusive=export_args.
            upper_bound_datetime_to_export,
        )
        expected_metadata = attr.evolve(self.to_entity(metadata),
                                        export_time=_DATE_4)

        session.add(metadata)
        session.commit()
        session.close()

        # Act
        with freeze_time(_DATE_4.isoformat()):
            export_manager.export_view_for_args(export_args)

        # Assert
        self.mock_client.run_query_async.assert_not_called()
        self.mock_client.export_query_results_to_cloud_storage.assert_not_called(
        )
        self.mock_client.delete_table.assert_not_called()

        assert_session = SessionFactory.for_schema_base(OperationsBase)
        found_metadata = self.to_entity(
            one(
                assert_session.query(
                    schema.DirectIngestIngestFileMetadata).all()))
        self.assertEqual(expected_metadata, found_metadata)
        assert_session.close()
Example #18
    def test_exportViewForArgs_noLowerBound(self):
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(region)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='ingest_view',
            upper_bound_datetime_prev=None,
            upper_bound_datetime_to_export=_DATE_2)

        session = SessionFactory.for_schema_base(OperationsBase)
        metadata = schema.DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag=export_args.ingest_view_name,
            normalized_file_name='normalized_file_name',
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_1,
            export_time=None,
            datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
            datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export
        )
        expected_metadata = attr.evolve(self.to_entity(metadata), export_time=_DATE_4)

        session.add(metadata)
        session.commit()
        session.close()

        # Act
        with freeze_time(_DATE_4.isoformat()):
            export_manager.export_view_for_args(export_args)

        # Assert
        self.mock_client.create_table_from_query_async.assert_has_calls([
            mock.call(
                dataset_id='us_xx_ingest_views',
                overwrite=True,
                query=mock.ANY,
                query_parameters=[self.generate_query_params_for_date(export_args.upper_bound_datetime_to_export)],
                table_id='ingest_view_2020_07_20_00_00_00_upper_bound'),
        ])
        expected_query = \
            'SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound` ' \
            'ORDER BY colA, colC;'
        self.assert_exported_to_gcs_with_query(expected_query)
        self.mock_client.delete_table.assert_has_calls([
            mock.call(dataset_id='us_xx_ingest_views', table_id='ingest_view_2020_07_20_00_00_00_upper_bound')])
        assert_session = SessionFactory.for_schema_base(OperationsBase)
        found_metadata = self.to_entity(one(assert_session.query(schema.DirectIngestIngestFileMetadata).all()))
        self.assertEqual(expected_metadata, found_metadata)
        assert_session.close()
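
The intermediate table id asserted above ("ingest_view_2020_07_20_00_00_00_upper_bound") appears to embed the export's upper-bound datetime. A sketch of that naming, inferred from the expected strings in this test rather than from the export manager itself:

# Hypothetical helper; the naming is inferred from the assertions above.
def _upper_bound_table_id(ingest_view_name, upper_bound_dt):
    return (f'{ingest_view_name}_'
            f'{upper_bound_dt.strftime("%Y_%m_%d_%H_%M_%S")}_upper_bound')

# _upper_bound_table_id('ingest_view', datetime.datetime(2020, 7, 20))
# == 'ingest_view_2020_07_20_00_00_00_upper_bound'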

def json_to_cloud_task_args(json_data: dict) -> Optional[CloudTaskArgs]:
    if "cloud_task_args" in json_data and "args_type" in json_data:
        args_type = json_data["args_type"]
        cloud_task_args_dict = json_data["cloud_task_args"]
        if args_type == GcsfsIngestArgs.__name__:
            return GcsfsIngestArgs.from_serializable(cloud_task_args_dict)
        if args_type == GcsfsRawDataBQImportArgs.__name__:
            return GcsfsRawDataBQImportArgs.from_serializable(
                cloud_task_args_dict)
        if args_type == GcsfsIngestViewExportArgs.__name__:
            return GcsfsIngestViewExportArgs.from_serializable(
                cloud_task_args_dict)
        logging.error("Unexpected args_type in json_data: %s", args_type)
    return None
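
A round-trip sketch of the serialization contract this parser expects; the body dict mirrors the request bodies built in the tests above, and export_args stands in for any GcsfsIngestViewExportArgs instance:

# Hypothetical round trip, assuming export_args was built as in the tests above.
body = {
    "cloud_task_args": export_args.to_serializable(),
    "args_type": "GcsfsIngestViewExportArgs",
}
parsed = json_to_cloud_task_args(json.loads(json.dumps(body)))
# If to_serializable()/from_serializable() round-trip cleanly (the handlers in
# the earlier snippets rely on this), parsed compares equal to export_args.
assert parsed == export_args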
Example #20
    def test_ingest_then_split_progression(self) -> None:
        args = GcsfsIngestViewExportArgs(
            ingest_view_name="file_tag",
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3),
        )

        ingest_view_unprocessed_path = self._make_unprocessed_path(
            "bucket/file_tag.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
        self.run_ingest_view_file_progression(args,
                                              self.metadata_manager,
                                              ingest_view_unprocessed_path,
                                              split_file=True)

def json_to_cloud_task_args(json_data: dict):
    if 'cloud_task_args' in json_data and 'args_type' in json_data:
        args_type = json_data['args_type']
        cloud_task_args_dict = json_data['cloud_task_args']
        if args_type == IngestArgs.__name__:
            return IngestArgs.from_serializable(cloud_task_args_dict)
        if args_type == GcsfsIngestArgs.__name__:
            return GcsfsIngestArgs.from_serializable(cloud_task_args_dict)
        if args_type == GcsfsRawDataBQImportArgs.__name__:
            return GcsfsRawDataBQImportArgs.from_serializable(
                cloud_task_args_dict)
        if args_type == GcsfsIngestViewExportArgs.__name__:
            return GcsfsIngestViewExportArgs.from_serializable(
                cloud_task_args_dict)
        logging.error('Unexpected args_type in json_data: %s', args_type)
    return None
Example #22
    def test_getIngestViewExportTaskArgs_happy(self) -> None:
        # Arrange
        region = self.create_fake_region(ingest_view_exports_enabled=True)
        export_manager = self.create_export_manager(region)
        export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
            return_value=DirectIngestIngestFileMetadata(
                file_id=_ID,
                region_code=region.region_code,
                file_tag="ingest_view",
                normalized_file_name="normalized_file_name",
                processed_time=_DATE_1,
                is_invalidated=False,
                is_file_split=False,
                job_creation_time=_DATE_1,
                export_time=_DATE_1,
                datetimes_contained_lower_bound_exclusive=_DATE_1,
                datetimes_contained_upper_bound_inclusive=_DATE_1,
                discovery_time=_DATE_1,
            ))
        export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
            return_value=[
                DirectIngestRawFileMetadata(
                    file_id=2,
                    region_code=region.region_code,
                    file_tag="ingest_view",
                    discovery_time=_DATE_2,
                    normalized_file_name=
                    "unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
                    processed_time=None,
                    datetimes_contained_upper_bound_inclusive=_DATE_2,
                )
            ])

        # Act
        args = export_manager.get_ingest_view_export_task_args()

        # Assert
        self.assertListEqual(
            args,
            [
                GcsfsIngestViewExportArgs(
                    ingest_view_name="ingest_view",
                    upper_bound_datetime_prev=_DATE_1,
                    upper_bound_datetime_to_export=_DATE_2,
                )
            ],
        )

    def _run_ingest_job_for_filename(self, filename: str) -> None:
        """Runs ingest for the ingest view file with the given unnormalized file name."""
        get_region_patcher = patch(
            "recidiviz.persistence.entity_matching.state."
            "base_state_matching_delegate.get_region")
        mock_get_region = get_region_patcher.start()
        mock_get_region.return_value = self._fake_region()

        environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
        environ_patcher.start()

        file_type = (GcsfsDirectIngestFileType.INGEST_VIEW
                     if self.controller.region.
                     is_raw_vs_ingest_file_name_detection_enabled() else None)

        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.are_ingest_view_exports_enabled_in_env():
            now = datetime.datetime.utcnow()
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=os.path.splitext(filename)[0],
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
            )

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            file_path = path_for_fixture_file(self.controller,
                                              filename,
                                              file_type=file_type,
                                              should_normalize=True)
            self.controller.fs.gcs_file_system.test_add_path(
                file_path, filename)

        run_task_queues_to_empty(self.controller)

        get_region_patcher.stop()
        environ_patcher.stop()

    def create_direct_ingest_ingest_view_export_task(
            self, region: Region,
            ingest_view_export_args: GcsfsIngestViewExportArgs):
        task_id = _build_task_id(
            region.region_code,
            task_id_tag=ingest_view_export_args.task_id_tag(),
            prefix_only=False)
        relative_uri = f'/direct/ingest_view_export?region={region.region_code}'

        body = self._get_body_from_args(ingest_view_export_args)

        self.cloud_task_client.create_task(
            task_id=task_id,
            queue_name=DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2,
            relative_uri=relative_uri,
            body=body,
        )
Example #25
    def create_direct_ingest_ingest_view_export_task(
            self, region: Region,
            ingest_view_export_args: GcsfsIngestViewExportArgs) -> None:
        task_id = _build_task_id(
            region.region_code,
            task_id_tag=ingest_view_export_args.task_id_tag(),
            prefix_only=False,
        )
        relative_uri = f"/direct/ingest_view_export?region={region.region_code}"

        body = self._get_body_from_args(ingest_view_export_args)

        self.bq_import_export_cloud_task_queue_manager.create_task(
            task_id=task_id,
            relative_uri=relative_uri,
            body=body,
        )
Example #26
    def test_exportViewForArgs_noExistingMetadata(self):
        # Arrange
        region = self.create_fake_region()
        export_manager = self.create_export_manager(region)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name='ingest_view',
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2)

        # Act
        with pytest.raises(ValueError):
            export_manager.export_view_for_args(export_args)

        # Assert
        self.mock_client.create_table_from_query_async.assert_not_called()
        self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
        self.mock_client.delete_table.assert_not_called()

    def test_register_ingest_view_export_file_name_already_exists_raises(self):
        args = GcsfsIngestViewExportArgs(
            ingest_view_name='file_tag',
            upper_bound_datetime_prev=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            upper_bound_datetime_to_export=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))
        metadata_entity = self.metadata_manager.register_ingest_file_export_job(
            args)
        self.metadata_manager.register_ingest_view_export_file_name(
            metadata_entity,
            self._make_unprocessed_path('bucket/file_tag.csv',
                                        GcsfsDirectIngestFileType.INGEST_VIEW))

        with self.assertRaises(ValueError):
            self.metadata_manager.register_ingest_view_export_file_name(
                metadata_entity,
                self._make_unprocessed_path(
                    'bucket/file_tag.csv',
                    GcsfsDirectIngestFileType.INGEST_VIEW))
Example #28
    def test_exportViewForArgs_ingestViewExportsDisabled(self) -> None:
        # Arrange
        region = self.create_fake_region(ingest_view_exports_enabled=False)
        export_manager = self.create_export_manager(region)
        export_args = GcsfsIngestViewExportArgs(
            ingest_view_name="ingest_view",
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2,
        )

        # Act
        with pytest.raises(ValueError):
            export_manager.export_view_for_args(export_args)

        # Assert
        self.mock_client.create_dataset_if_necessary.assert_not_called()
        self.mock_client.run_query_async.assert_not_called()
        self.mock_client.export_query_results_to_cloud_storage.assert_not_called(
        )
        self.mock_client.delete_table.assert_not_called()

    def _run_ingest_job_for_filename(self, filename: str) -> None:
        """Runs ingest for the ingest view file with the given unnormalized file name."""

        environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
        environ_patcher.start()
        file_type = GcsfsDirectIngestFileType.INGEST_VIEW

        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.is_ingest_launched_in_env():
            now = datetime.datetime.now(tz=pytz.UTC)
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=os.path.splitext(filename)[0],
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
                output_bucket_name=self.controller.ingest_bucket_path.
                bucket_name,
            )

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            file_path = path_for_fixture_file(self.controller,
                                              filename,
                                              file_type=file_type,
                                              should_normalize=True)
            self.controller.fs.gcs_file_system.test_add_path(
                file_path, filename)

        run_task_queues_to_empty(self.controller)

        environ_patcher.stop()
Example #30
    def run_parse_file_test(self, expected: IngestInfo,
                            fixture_file_name: str) -> IngestInfo:
        """Runs a test that reads and parses a given fixture file. Returns the
        parsed IngestInfo object for tests to run further validations."""
        args = ingest_args_for_fixture_file(self.controller,
                                            f'{fixture_file_name}.csv')

        if not isinstance(self.controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(self.controller.fs)}]")

        if self.controller.region.are_ingest_view_exports_enabled_in_env():
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=fixture_file_name,
                upper_bound_datetime_to_export=datetime.datetime.utcnow(),
                upper_bound_datetime_prev=None)

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            self.controller.fs.test_add_path(args.file_path)

        # pylint:disable=protected-access
        fixture_contents_handle = self.controller._get_contents_handle(args)

        final_info = self.controller._parse(args, fixture_contents_handle)

        print_visible_header_label('FINAL')
        print(final_info)

        print_visible_header_label('EXPECTED')
        print(expected)

        self.assertEqual(expected, final_info)

        return final_info