Example #1
    def _open_gcs_url(self, binary: bool) -> object:
        mode = "rb" if binary else "r"
        service_account_json = self._provider.get("service_account_json")
        credentials = None
        if service_account_json:
            try:
                credentials = json.loads(service_account_json)
            except json.decoder.JSONDecodeError as err:
                error_msg = f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}"
                logger.error(error_msg)
                raise ConfigurationError(error_msg) from err

        if credentials:
            credentials = service_account.Credentials.from_service_account_info(
                credentials)
            client = GCSClient(credentials=credentials,
                               project=credentials.project_id)
        else:
            # No service account key supplied: fall back to anonymous access
            # (works only for publicly readable buckets).
            client = GCSClient.create_anonymous_client()
        # Return the open file object; the caller is responsible for closing it.
        file_to_close = smart_open.open(self.full_url,
                                        transport_params=dict(client=client),
                                        mode=mode)

        return file_to_close
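
The snippet above decides between authenticated and anonymous access before handing the client to smart_open. A minimal self-contained sketch of the same pattern (the function name read_gcs_text and its parameters are hypothetical, not from the source):

import json

import smart_open
from google.cloud.storage import Client as GCSClient
from google.oauth2 import service_account


def read_gcs_text(url, service_account_json=None):
    """Open a GCS object via smart_open, authenticated if a key is given."""
    if service_account_json:
        info = json.loads(service_account_json)
        creds = service_account.Credentials.from_service_account_info(info)
        client = GCSClient(credentials=creds, project=creds.project_id)
    else:
        # Anonymous clients can only read publicly readable objects.
        client = GCSClient.create_anonymous_client()
    with smart_open.open(url, mode="r", transport_params=dict(client=client)) as f:
        return f.read()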
Example #2
def main():
    args = get_args()
    # Log in to Box
    LOG.info("Authenticating with Box and impersonating user {}.".format(
        args.login))

    kms = kms_v1.KeyManagementServiceClient()
    with open(args.config, 'rb') as f:
        config_ciphertext = f.read()
    config_plaintext = kms.decrypt(args.keyname, config_ciphertext).plaintext

    box = impersonate_mirror_user(get_box_client(config_plaintext), args.login)
    # Log in to GCS and get bucket
    bucket_name = args.bucket.replace("gs://", "")
    LOG.info(
        "Authenticating with GCS and fetching bucket {}.".format(bucket_name))
    bucket = GCSClient().get_bucket(bucket_name)
    # Walk Box, schedule async copies to GCS, and get a list of new blobs.
    # We will also opportunistically form a cache of the Box items.
    LOG.info("Walking Box directories and copying to GCS as needed.")
    box_cache = {'/': box.root_folder()}
    copy_jobs = sync_box_to_gcs(box, bucket, cache=box_cache)
    # Check for and log exceptions. Doing this here also institutes a "pause" between the two sync phases.
    for exc in get_exceptions(copy_jobs):
        LOG.exception(exc)
    # Walk GCS, checking against the cache of Box items, and as needed, schedule async uploads to Box
    LOG.info("Listing GCS blobs and looking for blobs to upload or delete.")
    copy_jobs = sync_gcs_to_box(bucket, box, cache=box_cache)
    # Check for and log exceptions
    for exc in get_exceptions(copy_jobs):
        LOG.exception(exc)
    LOG.info("Synchronization complete.")
Example #3
def main():
    args = parse_args()

    # Imports of thor modules are deferred until after argument parsing to avoid
    # numba JIT time if the arguments are invalid or the user asked for --help.
    import thor.utils.logging

    thor.utils.logging.setupLogger("thor")

    from thor.taskqueue.client import Client as TaskQueueClient
    from thor.taskqueue.queue import TaskQueueConnection
    from thor.orbits import Orbits
    from thor.config import Config

    if not isinstance(args.config, str):
        # No config path supplied; use the default Config.
        config = Config
    else:
        config = Config.fromYaml(args.config)

    # Read observations
    preprocessed_observations = pd.read_csv(args.preprocessed_observations,
                                            index_col=False,
                                            dtype={"obs_id": str})

    # Read test orbits
    test_orbits = Orbits.from_csv(args.test_orbits)

    # Connect to Rabbit
    queue = TaskQueueConnection(
        pika.ConnectionParameters(
            host=args.rabbit_host,
            port=args.rabbit_port,
            credentials=pika.PlainCredentials(
                username=args.rabbit_username,
                password=args.rabbit_password,
            ),
        ),
        args.queue,
    )
    queue.connect()

    # Connect to GCS bucket
    gcs = GCSClient()
    if args.create_bucket:
        try:
            gcs.create_bucket(args.bucket)
        except google.cloud.exceptions.Conflict:
            # Bucket already exists.
            pass
    bucket = gcs.bucket(args.bucket)
    taskqueue_client = TaskQueueClient(bucket, queue)

    manifest = taskqueue_client.launch_job(config, preprocessed_observations,
                                           test_orbits)
    taskqueue_client.monitor_job_status(manifest.job_id)
    taskqueue_client.download_results(manifest, args.out_dir)
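
Both this example and the next guard bucket creation against google.cloud.exceptions.Conflict. If that pattern recurs, it can be factored into a small idempotent helper (a sketch; ensure_bucket is not part of the source):

import google.cloud.exceptions
from google.cloud.storage import Client as GCSClient


def ensure_bucket(gcs: GCSClient, name: str):
    """Create the bucket if it does not exist, otherwise return a handle to it."""
    try:
        return gcs.create_bucket(name)
    except google.cloud.exceptions.Conflict:
        # Bucket already exists (possibly created by an earlier run).
        return gcs.bucket(name)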
Example #4
@pytest.fixture
def google_storage_bucket(request):
    client = GCSClient()
    bucket_name = f"test_bucket__{request.function.__name__}"
    try:
        bucket = client.create_bucket(bucket_name)
    except google.cloud.exceptions.Conflict:
        logger.warning("bucket %s already exists; tests may be unpredictable",
                       bucket_name)
        bucket = client.bucket(bucket_name)
    yield bucket
    bucket.delete(force=True, client=client)
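
pytest injects the fixture above by parameter name, so a test consuming it might look like this (test_upload_roundtrip is a hypothetical illustration, not from the source):

def test_upload_roundtrip(google_storage_bucket):
    blob = google_storage_bucket.blob("hello.txt")
    blob.upload_from_string("hello")
    assert blob.download_as_bytes() == b"hello"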
Example #5
def main(project_id, dataset_id, bucket_name, hpo_id, folder_name):
    """
    Main function to load submission into dataset

    :param project_id: Identifies the project
    :param dataset_id: Identifies the destination dataset
    :param bucket_name: the bucket in GCS containing the archive files
    :param hpo_id: Identifies the HPO site
    :param folder_name: Name of the submission folder to load
    """
    bq_client = get_client(project_id)
    gcs_client = GCSClient(project_id)
    site_bucket = get_bucket(bq_client, hpo_id)
    prefix = f'{hpo_id}/{site_bucket}/{folder_name}'
    LOGGER.info(
        f'Starting jobs for loading {bucket_name}/{prefix} into {dataset_id}')
    _ = load_folder(dataset_id, bq_client, bucket_name, prefix, gcs_client,
                    hpo_id)
    LOGGER.info(f'Successfully loaded {bucket_name}/{prefix} into {dataset_id}')
Example #6
    @property
    def gcs(self):
        # Lazily create the GCS client on first access and cache it.
        if self._gcs is None:
            self._gcs = GCSClient()
        return self._gcs
Example #7
    @property
    def storageclient(self):
        # Lazily create the GCS client on first access and cache it.
        if self._storageclient is None:
            self._storageclient = GCSClient()
        return self._storageclient
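
Examples #6 and #7 are the same lazy-initialization idiom. On Python 3.8+, functools.cached_property expresses it more compactly (a sketch assuming the client only needs to be built once per instance; the class name Service is hypothetical):

from functools import cached_property

from google.cloud.storage import Client as GCSClient


class Service:
    @cached_property
    def gcs(self):
        # Built on first access, then cached on the instance.
        return GCSClient()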
Example #8
def test_client_roundtrip(queue_connection, google_storage_bucket, orbits,
                          observations):
    taskqueue_client = client.Client(google_storage_bucket, queue_connection)
    taskqueue_worker = client.Worker(GCSClient(), queue_connection)

    # trim down to 3 orbits
    orbits = Orbits.from_df(orbits.to_df()[:3])
    n_task = 3

    manifest = taskqueue_client.launch_job(test_config, observations, orbits)
    assert len(manifest.task_ids) == n_task

    statuses = taskqueue_client.get_task_statuses(manifest)
    assert len(statuses) == n_task

    assert all(
        s.state == tasks.TaskState.REQUESTED for s in statuses.values()
    ), "all tasks should initially be in 'requested' state"

    received_tasks = list(
        taskqueue_worker.poll_for_tasks(poll_interval=0.5, limit=5))
    assert len(received_tasks) == n_task

    statuses = taskqueue_client.get_task_statuses(manifest)
    assert all(
        s.state == tasks.TaskState.IN_PROGRESS for s in statuses.values()
    ), "all tasks should be in 'in_progress' state once received"

    # Handle the first task. It should be marked as succeeded, but others still
    # in progress.
    taskqueue_worker.handle_task(received_tasks[0])
    statuses = taskqueue_client.get_task_statuses(manifest)
    task1_state, task2_state, task3_state = (
        statuses[received_tasks[0].task_id].state,
        statuses[received_tasks[1].task_id].state,
        statuses[received_tasks[2].task_id].state,
    )
    assert task1_state == tasks.TaskState.SUCCEEDED
    assert task2_state == tasks.TaskState.IN_PROGRESS
    assert task3_state == tasks.TaskState.IN_PROGRESS

    # Download results. We should only have results for the first task.
    with tempfile.TemporaryDirectory(
            prefix="thor.test_client_roundtrip_1") as outdir:
        taskqueue_client.download_results(manifest, outdir)
        _assert_results_downloaded(outdir, received_tasks[0].task_id)

    # Handle another task.
    taskqueue_worker.handle_task(received_tasks[1])
    statuses = taskqueue_client.get_task_statuses(manifest)
    task1_state, task2_state, task3_state = (
        statuses[received_tasks[0].task_id].state,
        statuses[received_tasks[1].task_id].state,
        statuses[received_tasks[2].task_id].state,
    )
    assert task1_state == tasks.TaskState.SUCCEEDED
    assert task2_state == tasks.TaskState.SUCCEEDED
    assert task3_state == tasks.TaskState.IN_PROGRESS

    # Download results. Now we should have results for the first two tasks.
    with tempfile.TemporaryDirectory(
            prefix="thor.test_client_roundtrip_2") as outdir:
        taskqueue_client.download_results(manifest, outdir)
        _assert_results_downloaded(outdir, received_tasks[0].task_id)
        _assert_results_downloaded(outdir, received_tasks[1].task_id)
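
The helper _assert_results_downloaded is not shown in the example. A plausible minimal version, assuming download_results writes each task's output into a tasks/<task_id>/ subdirectory (an assumption, not confirmed by the source):

import os


def _assert_results_downloaded(outdir, task_id):
    # Assumes results for each task land under tasks/<task_id>/ inside outdir.
    task_dir = os.path.join(outdir, "tasks", task_id)
    assert os.path.isdir(task_dir), f"no results directory for task {task_id}"
    assert os.listdir(task_dir), f"results directory for task {task_id} is empty"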