def test_is_task_queued_has_tasks(self):
    # Arrange
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    gcsfs_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(file_path))
    full_task_name = \
        _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
    info = ProcessIngestJobCloudTaskQueueInfo(
        queue_name='queue_name',
        task_names=[
            'projects/path/to/random_task',
            f'projects/path/to/{full_task_name}'
        ])
    # Re-create the args with a fresh ingest_time and the same file path.
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    gcsfs_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(file_path))

    # Act
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertTrue(gcsfs_args_queued)
def path_for_fixture_file_in_test_gcs_directory(
    *,
    bucket_path: GcsfsBucketPath,
    filename: str,
    should_normalize: bool,
    file_type: Optional[GcsfsDirectIngestFileType],
    dt: Optional[datetime.datetime] = None,
) -> GcsfsFilePath:
    file_path_str = filename

    if should_normalize:
        if not file_type:
            raise ValueError(
                "Expected file_type for path normalization but got None")
        file_path_str = to_normalized_unprocessed_file_path(
            original_file_path=file_path_str, file_type=file_type, dt=dt)

    file_path = GcsfsFilePath.from_directory_and_file_name(
        dir_path=bucket_path,
        file_name=file_path_str,
    )
    if not isinstance(file_path, GcsfsFilePath):
        raise ValueError(
            f"Expected type GcsfsFilePath, found {type(file_path)} for path: "
            f"{file_path.abs_path()}")
    return file_path
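
# A minimal usage sketch for the helper above; the bucket name and fixture
# filename are hypothetical, and it assumes the same recidiviz GCS path types
# imported by the surrounding code.
fixture_path = path_for_fixture_file_in_test_gcs_directory(
    bucket_path=GcsfsBucketPath(
        bucket_name='recidiviz-456-direct-ingest-state-us-xx'),
    filename='tagA.csv',
    should_normalize=True,
    file_type=GcsfsDirectIngestFileType.RAW_DATA,
    dt=datetime.datetime(2020, 6, 1, 12, 0, 0),
)
# fixture_path is the bucket path joined with a normalized file name that
# embeds the timestamp and file type.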
def _create_split_file_path(
    self,
    original_file_path: GcsfsFilePath,
    output_dir: GcsfsDirectoryPath,
    split_num: int,
) -> GcsfsFilePath:
    parts = filename_parts_from_path(original_file_path)
    rank_str = str(split_num + 1).zfill(5)
    updated_file_name = (
        f"{parts.stripped_file_name}_{rank_str}"
        f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
        f".{parts.extension}")

    file_type = (
        GcsfsDirectIngestFileType.INGEST_VIEW
        if self.region.is_raw_vs_ingest_file_name_detection_enabled()
        else GcsfsDirectIngestFileType.UNSPECIFIED)

    return GcsfsFilePath.from_directory_and_file_name(
        output_dir,
        to_normalized_unprocessed_file_path(updated_file_name,
                                            file_type=file_type,
                                            dt=parts.utc_upload_datetime),
    )
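
# A hedged re-derivation of the split name built above; the tag, line limit,
# extension, and SPLIT_FILE_SUFFIX value ('file_split') are assumptions for
# illustration, not values taken from the real module.
SPLIT_FILE_SUFFIX = 'file_split'  # assumed value of the module constant
rank_str = str(0 + 1).zfill(5)  # first split -> '00001'
updated_file_name = f"tagA_{rank_str}_{SPLIT_FILE_SUFFIX}_size300.csv"
assert updated_file_name == 'tagA_00001_file_split_size300.csv'
# That name is then normalized with the parent file's upload datetime, so the
# splits sort alongside the original in the ingest bucket.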
def _do_check_in_for_file(self, path: str) -> None:
    normalized_file_name = os.path.basename(
        to_normalized_unprocessed_file_path(
            path,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=self.datetime))
    self._copy_to_storage(path, normalized_file_name)
def test_raw_data_import(self, mock_supported, mock_region,
                         mock_environment):
    mock_supported.return_value = ['us_xx']

    region_code = 'us_xx'

    mock_environment.return_value = 'staging'
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment='staging',
                                           ingestor=mock_controller)

    import_args = GcsfsRawDataBQImportArgs(
        raw_data_file_path=GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(
                'bucket/raw_data_path.csv',
                file_type=GcsfsDirectIngestFileType.RAW_DATA)))
    request_args = {
        'region': region_code,
    }
    body = {
        'cloud_task_args': import_args.to_serializable(),
        'args_type': 'GcsfsRawDataBQImportArgs',
    }
    body_encoded = json.dumps(body).encode()
    headers = {'X-Appengine-Cron': 'test-cron'}
    response = self.client.post('/raw_data_import',
                                query_string=request_args,
                                headers=headers,
                                data=body_encoded)
    self.assertEqual(200, response.status_code)
    mock_controller.do_raw_data_import.assert_called_with(import_args)
def path_for_fixture_file_in_test_gcs_directory(
    *,
    directory: GcsfsDirectoryPath,
    filename: str,
    should_normalize: bool,
    file_type: Optional[GcsfsDirectIngestFileType] = None,
    dt: Optional[datetime.datetime] = None,
) -> GcsfsFilePath:
    file_path_str = filename

    if should_normalize:
        if not file_type:
            file_type = GcsfsDirectIngestFileType.UNSPECIFIED
        file_path_str = to_normalized_unprocessed_file_path(
            original_file_path=file_path_str, file_type=file_type, dt=dt)

    file_path = GcsfsPath.from_bucket_and_blob_name(
        bucket_name=directory.bucket_name,
        blob_name=os.path.join(directory.relative_path, file_path_str),
    )
    if not isinstance(file_path, GcsfsFilePath):
        raise ValueError(
            f"Expected type GcsfsFilePath, found {type(file_path)} for path: "
            f"{file_path.abs_path()}")
    return file_path
def _normalized_path_for_filename(self, filename: str,
                                  dt: datetime.datetime) -> GcsfsFilePath:
    normalized_path = \
        to_normalized_unprocessed_file_path(
            os.path.join(self._INGEST_BUCKET_PATH.abs_path(), filename), dt)
    return GcsfsFilePath.from_absolute_path(normalized_path)
def _upload_file(self, path: str) -> None:
    normalized_file_name = os.path.basename(
        to_normalized_unprocessed_file_path(
            path,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=self.datetime))
    self._copy_to_ingest_bucket(path, normalized_file_name)
def _make_unprocessed_path(
    path_str: str,
    file_type: GcsfsDirectIngestFileType,
    dt=datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
) -> GcsfsFilePath:
    normalized_path_str = to_normalized_unprocessed_file_path(
        original_file_path=path_str, file_type=file_type, dt=dt)
    return GcsfsFilePath.from_absolute_path(normalized_path_str)
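
# A hedged usage sketch for _make_unprocessed_path; the bucket and file name
# are hypothetical, and the exact normalized format is whatever
# to_normalized_unprocessed_file_path produces in this version of the code.
path = _make_unprocessed_path('bucket/tagA.csv',
                              GcsfsDirectIngestFileType.INGEST_VIEW)
# The returned GcsfsFilePath embeds the fixed 2015-01-02 default timestamp,
# which keeps fixture paths deterministic across test runs.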
def ingest_args_for_fixture_file(controller: GcsfsDirectIngestController,
                                 filename: str) -> GcsfsIngestArgs:
    original_path = os.path.join(controller.ingest_directory_path, filename)
    file_path = to_normalized_unprocessed_file_path(original_path)
    return GcsfsIngestArgs(
        ingest_time=datetime.datetime.now(),
        file_path=file_path,
    )
def _normalized_path_for_filename(self, filename: str,
                                  file_type: GcsfsDirectIngestFileType,
                                  dt: datetime.datetime) -> GcsfsFilePath:
    normalized_path = \
        to_normalized_unprocessed_file_path(
            original_file_path=os.path.join(
                self._INGEST_BUCKET_PATH.abs_path(), filename),
            file_type=file_type,
            dt=dt)
    return GcsfsFilePath.from_absolute_path(normalized_path)
def _upload_file(
        self, path_with_timestamp: Tuple[str, datetime.datetime]) -> None:
    path, timestamp = path_with_timestamp
    normalized_file_name = os.path.basename(
        to_normalized_unprocessed_file_path(
            path,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=timestamp))
    full_file_upload_path = GcsfsFilePath.from_directory_and_file_name(
        self.gcs_destination_path, normalized_file_name)
    self._copy_to_ingest_bucket(path, full_file_upload_path)
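
# A hedged driver sketch for _upload_file above; the local paths, timestamps,
# and the 'uploader' instance name are hypothetical stand-ins for however the
# enclosing class is actually constructed.
files_with_timestamps = [
    ('/tmp/tagA.csv', datetime.datetime(2021, 3, 1, 8, 30)),
    ('/tmp/tagB.csv', datetime.datetime(2021, 3, 1, 8, 45)),
]
for pair in files_with_timestamps:
    uploader._upload_file(pair)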
def test_is_task_queued_has_tasks(self):
    # Arrange
    file_path = to_normalized_unprocessed_file_path('file_path.csv')
    gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                 file_path=file_path)
    full_task_name = \
        _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
    info = CloudTaskQueueInfo(queue_name='queue_name',
                              task_names=[
                                  'projects/path/to/random_task',
                                  f'projects/path/to/{full_task_name}'
                              ])
    # Re-create the args with a fresh ingest_time and the same file path.
    file_path = to_normalized_unprocessed_file_path('file_path.csv')
    gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                 file_path=file_path)

    # Act
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertTrue(gcsfs_args_queued)
def test_create_direct_ingest_process_job_task_secondary(
        self, mock_client: mock.MagicMock,
        mock_uuid: mock.MagicMock) -> None:
    # Arrange
    file_path = to_normalized_unprocessed_file_path(
        "bucket/ingest_view_name.csv",
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    )
    ingest_args = GcsfsIngestArgs(
        datetime.datetime(year=2019, month=7, day=20),
        file_path=GcsfsFilePath.from_absolute_path(file_path),
    )
    body = {
        "cloud_task_args": ingest_args.to_serializable(),
        "args_type": "GcsfsIngestArgs",
    }
    body_encoded = json.dumps(body).encode()
    uuid = "random-uuid"
    mock_uuid.uuid4.return_value = uuid
    date = "2019-07-20"
    queue_path = "us-xx-process-queue-path"
    queue_name = "direct-ingest-state-us-xx-process-job-queue"
    task_name = "{}/{}-{}-{}".format(
        DIRECT_INGEST_STATE_PROCESS_JOB_QUEUE_V2, _REGION.region_code, date,
        uuid)
    url_params = {"region": _REGION.region_code, "file_path": file_path}
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": f"/direct/process_job?{urlencode(url_params)}",
            "body": body_encoded,
        },
    )
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl().create_direct_ingest_process_job_task(
        _REGION, DirectIngestInstance.SECONDARY, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id,
        QUEUES_REGION,
        queue_name,
    )
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def test_create_direct_ingest_raw_data_import_task(
        self, mock_client: mock.MagicMock,
        mock_uuid: mock.MagicMock) -> None:
    # Arrange
    raw_data_path = GcsfsFilePath.from_absolute_path(
        to_normalized_unprocessed_file_path(
            "bucket/raw_data_path.csv",
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        ))
    import_args = GcsfsRawDataBQImportArgs(
        raw_data_file_path=raw_data_path)
    body = {
        "cloud_task_args": import_args.to_serializable(),
        "args_type": "GcsfsRawDataBQImportArgs",
    }
    body_encoded = json.dumps(body).encode()
    uuid = "random-uuid"
    mock_uuid.uuid4.return_value = uuid
    date = "2019-07-20"
    queue_path = f"{DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2}-path"
    task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + "/{}-{}-{}".format(
        _REGION.region_code, date, uuid)
    url_params = {
        "region": _REGION.region_code,
        "file_path": raw_data_path.abs_path(),
    }
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri":
                f"/direct/raw_data_import?{urlencode(url_params)}",
            "body": body_encoded,
        },
    )
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl(
    ).create_direct_ingest_raw_data_import_task(
        _REGION, DirectIngestInstance.PRIMARY, import_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION,
        DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def _fetch(
    self,
    connection: pysftp.Connection,
    file_path: str,
    file_timestamp: datetime.datetime,
) -> None:
    """Fetches data files from the SFTP, tracking which items downloaded and
    failed to download."""
    normalized_sftp_path = os.path.normpath(file_path)
    normalized_upload_path = GcsfsFilePath.from_directory_and_file_name(
        dir_path=self.download_dir,
        file_name=os.path.basename(
            to_normalized_unprocessed_file_path(
                normalized_sftp_path,
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
                dt=file_timestamp,
            )),
    )
    if not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_discovered(
            normalized_upload_path
    ) and not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_processed(
            normalized_upload_path):
        logging.info("Downloading %s into %s", normalized_sftp_path,
                     self.download_dir)
        try:
            path = GcsfsFilePath.from_directory_and_file_name(
                dir_path=self.download_dir, file_name=normalized_sftp_path)
            self.gcsfs.upload_from_contents_handle_stream(
                path=path,
                contents_handle=GcsfsSftpFileContentsHandle(
                    sftp_connection=connection, local_file_path=file_path),
                content_type=BYTES_CONTENT_TYPE,
            )
            logging.info("Post processing %s", path.uri())
            self.downloaded_items.append((
                self.delegate.post_process_downloads(path, self.gcsfs),
                file_timestamp,
            ))
        except IOError as e:
            logging.info(
                "Could not download %s into %s: %s",
                normalized_sftp_path,
                self.download_dir,
                e.args,
            )
            self.unable_to_download_items.append(file_path)
    else:
        logging.info(
            "Skipping downloading %s because it has already been previously "
            "downloaded for ingest.",
            normalized_sftp_path,
        )
        self.skipped_files.append(file_path)
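
# A hedged restatement of the skip condition in _fetch as a standalone
# predicate; 'manager' stands in for
# self.postgres_direct_ingest_file_metadata_manager.
def _should_download(manager, normalized_upload_path) -> bool:
    # Only download files that the metadata store has neither discovered nor
    # finished processing.
    return (not manager.has_raw_file_been_discovered(normalized_upload_path)
            and not manager.has_raw_file_been_processed(normalized_upload_path))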
def path_for_fixture_file(
    controller: GcsfsDirectIngestController,
    filename: str,
    should_normalize: bool,
    dt: Optional[datetime.datetime] = None
) -> Union[GcsfsFilePath, GcsfsDirectoryPath]:
    file_path_str = filename

    if should_normalize:
        file_path_str = to_normalized_unprocessed_file_path(file_path_str, dt)

    return GcsfsPath.from_bucket_and_blob_name(
        bucket_name=controller.ingest_directory_path.bucket_name,
        blob_name=os.path.join(controller.ingest_directory_path.relative_path,
                               file_path_str))
def _create_split_file_path(self, original_file_path: GcsfsFilePath,
                            output_dir: GcsfsDirectoryPath,
                            split_num: int) -> GcsfsFilePath:
    parts = filename_parts_from_path(original_file_path)
    rank_str = str(split_num + 1).zfill(5)
    existing_suffix = \
        f'_{parts.filename_suffix}' if parts.filename_suffix else ''
    updated_file_name = (
        f'{parts.file_tag}{existing_suffix}_{rank_str}'
        f'_{SPLIT_FILE_SUFFIX}_size{self.file_split_line_limit}'
        f'.{parts.extension}')
    return GcsfsFilePath.from_directory_and_file_name(
        output_dir,
        to_normalized_unprocessed_file_path(updated_file_name,
                                            dt=parts.utc_upload_datetime))
def test_create_direct_ingest_process_job_task_gcsfs_args(
        self, mock_client: MagicMock, mock_uuid: MagicMock,
        mock_datetime: MagicMock) -> None:
    # Arrange
    file_path = to_normalized_unprocessed_file_path(
        "bucket/file_path.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
    ingest_args = GcsfsIngestArgs(
        ingest_time=datetime.datetime(year=2019, month=7, day=20),
        file_path=GcsfsFilePath.from_absolute_path(file_path),
    )
    body = {
        "cloud_task_args": ingest_args.to_serializable(),
        "args_type": "GcsfsIngestArgs",
    }
    body_encoded = json.dumps(body).encode()
    uuid = "random-uuid"
    mock_uuid.uuid4.return_value = uuid
    date = "2019-07-20"
    mock_datetime.date.today.return_value = date
    queue_path = f"{_REGION.shared_queue}-path"
    task_name = _REGION.get_queue_name() + "/{}-{}-{}".format(
        _REGION.region_code, date, uuid)
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri":
                f"/direct/process_job?region={_REGION.region_code}",
            "body": body_encoded,
        },
    )
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl().create_direct_ingest_process_job_task(
        _REGION, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, _REGION.shared_queue)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def test_is_task_queued_no_tasks(self):
    # Arrange
    info = CloudTaskQueueInfo(queue_name='queue_name', task_names=[])
    file_path = to_normalized_unprocessed_file_path('file_path.csv')
    args = IngestArgs(ingest_time=datetime.datetime.now())
    gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                 file_path=file_path)

    # Act
    basic_args_queued = info.is_task_queued(_REGION, args)
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertFalse(basic_args_queued)
    self.assertFalse(gcsfs_args_queued)
    self.assertFalse(info.is_task_queued(_REGION, gcsfs_args))
def test_create_direct_ingest_process_job_task_gcsfs_args(
        self, mock_client, mock_uuid, mock_datetime):
    # Arrange
    project_id = 'recidiviz-456'
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    ingest_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime(year=2019, month=7, day=20),
            file_path=GcsfsFilePath.from_absolute_path(file_path))
    body = {
        'cloud_task_args': ingest_args.to_serializable(),
        'args_type': 'GcsfsIngestArgs'
    }
    body_encoded = json.dumps(body).encode()
    uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = uuid
    date = '2019-07-20'
    mock_datetime.date.today.return_value = date
    queue_path = _REGION.shared_queue + '-path'
    task_name = _REGION.shared_queue + '/{}-{}-{}'.format(
        _REGION.region_code, date, uuid)
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri':
                f'/direct/process_job?region={_REGION.region_code}',
            'body': body_encoded
        })
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl(project_id=project_id).\
        create_direct_ingest_process_job_task(_REGION, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        project_id, QUEUES_REGION, _REGION.shared_queue)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def test_create_direct_ingest_raw_data_import_task(self, mock_client,
                                                   mock_uuid):
    # Arrange
    project_id = 'recidiviz-456'
    import_args = GcsfsRawDataBQImportArgs(
        raw_data_file_path=GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(
                'bucket/raw_data_path.csv',
                file_type=GcsfsDirectIngestFileType.RAW_DATA)))
    body = {
        'cloud_task_args': import_args.to_serializable(),
        'args_type': 'GcsfsRawDataBQImportArgs'
    }
    body_encoded = json.dumps(body).encode()
    uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = uuid
    date = '2019-07-20'
    queue_path = _REGION.shared_queue + '-path'
    task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + '/{}-{}-{}'.format(
        _REGION.region_code, date, uuid)
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri':
                f'/direct/raw_data_import?region={_REGION.region_code}',
            'body': body_encoded
        })
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl(
        project_id=project_id).create_direct_ingest_raw_data_import_task(
            _REGION, import_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        project_id, QUEUES_REGION, DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def path_for_fixture_file_in_test_gcs_directory(
    *,
    directory: GcsfsDirectoryPath,
    filename: str,
    should_normalize: bool,
    file_type: Optional[GcsfsDirectIngestFileType] = None,
    dt: Optional[datetime.datetime] = None
) -> Union[GcsfsFilePath, GcsfsDirectoryPath]:
    file_path_str = filename

    if should_normalize:
        if not file_type:
            file_type = GcsfsDirectIngestFileType.UNSPECIFIED
        file_path_str = to_normalized_unprocessed_file_path(
            original_file_path=file_path_str, file_type=file_type, dt=dt)

    return GcsfsPath.from_bucket_and_blob_name(
        bucket_name=directory.bucket_name,
        blob_name=os.path.join(directory.relative_path, file_path_str))
def test_raw_data_import(
    self,
    mock_supported: mock.MagicMock,
    mock_region: mock.MagicMock,
    mock_environment: mock.MagicMock,
) -> None:
    mock_supported.return_value = ["us_xx"]

    region_code = "us_xx"

    mock_environment.return_value = "staging"
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment="staging",
                                           ingestor=mock_controller)

    import_args = GcsfsRawDataBQImportArgs(
        raw_data_file_path=GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(
                "bucket/raw_data_path.csv",
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
            )))
    request_args = {
        "region": region_code,
    }
    body = {
        "cloud_task_args": import_args.to_serializable(),
        "args_type": "GcsfsRawDataBQImportArgs",
    }
    body_encoded = json.dumps(body).encode()
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.post(
        "/raw_data_import",
        query_string=request_args,
        headers=headers,
        data=body_encoded,
    )
    self.assertEqual(200, response.status_code)
    mock_controller.do_raw_data_import.assert_called_with(import_args)
def test_create_direct_ingest_process_job_task_gcsfs_args(
        self, mock_client, mock_uuid, mock_datetime):
    # Arrange
    ingest_args = GcsfsIngestArgs(
        datetime.datetime(year=2019, month=7, day=20),
        file_path=to_normalized_unprocessed_file_path('file_path.csv'))
    body = {
        'ingest_args': ingest_args.to_serializable(),
        'args_type': 'GcsfsIngestArgs'
    }
    body_encoded = json.dumps(body).encode()
    uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = uuid
    date = '2019-07-20'
    mock_datetime.date.today.return_value = date
    queue_path = _REGION.shared_queue + '-path'
    task_name = _REGION.shared_queue + '/{}-{}-{}'.format(
        _REGION.region_code, date, uuid)
    task = tasks.types.Task(
        name=task_name,
        app_engine_http_request={
            'relative_uri':
                f'/direct/process_job?region={_REGION.region_code}',
            'body': body_encoded
        })
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl().\
        create_direct_ingest_process_job_task(_REGION, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        metadata.project_id(), metadata.region(), _REGION.shared_queue)
    mock_client.return_value.create_task.assert_called_with(
        queue_path, task)
def _create_split_file_path(
    self,
    original_file_path: GcsfsFilePath,
    output_dir: GcsfsDirectoryPath,
    split_num: int,
) -> GcsfsFilePath:
    parts = filename_parts_from_path(original_file_path)
    rank_str = str(split_num + 1).zfill(5)
    updated_file_name = (
        f"{parts.stripped_file_name}_{rank_str}"
        f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
        f".{parts.extension}")
    return GcsfsFilePath.from_directory_and_file_name(
        output_dir,
        to_normalized_unprocessed_file_path(
            updated_file_name,
            file_type=parts.file_type,
            dt=parts.utc_upload_datetime,
        ),
    )
def test_is_task_queued_no_tasks(self):
    # Arrange
    info = ProcessIngestJobCloudTaskQueueInfo(queue_name='queue_name',
                                              task_names=[])
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    args = IngestArgs(ingest_time=datetime.datetime.now())
    gcsfs_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(file_path))

    # Act
    basic_args_queued = info.is_task_queued(_REGION, args)
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertFalse(basic_args_queued)
    self.assertFalse(gcsfs_args_queued)
    self.assertFalse(info.is_task_queued(_REGION, gcsfs_args))
def _normalized_path_for_filename(self, filename: str,
                                  dt: datetime.datetime):
    normalized_path = \
        to_normalized_unprocessed_file_path(
            os.path.join(self._INGEST_DIRECTORY_PATH, filename), dt)
    return normalized_path
def _do_check_in_for_file(self, path: str) -> None:
    normalized_file_name = os.path.basename(
        to_normalized_unprocessed_file_path(
            path, dt=datetime.datetime.fromisoformat(self.date)))
    self._copy_to_storage(path, normalized_file_name)