Example #1
def _UpdateLatestServerDebDirectory(gcs_bucket: storage.Bucket,
                                    gcs_build_results_dir: str):
    """Updates the '_latest_server_deb' GCS directory with the latest results."""
    logging.info("Updating latest server deb directory.")

    old_build_results = list(
        gcs_bucket.list_blobs(prefix=_LATEST_SERVER_DEB_GCS_DIR))
    new_build_results = list(
        gcs_bucket.list_blobs(prefix=gcs_build_results_dir))
    if not new_build_results:
        raise GCSUploadError(
            "Failed to find build results for the server-deb Travis job.")

    for gcs_blob in old_build_results:
        logging.info("Deleting previous blob: %s", gcs_blob)
        gcs_blob.delete()

    for gcs_blob in new_build_results:
        build_result_filename = gcs_blob.name.split("/")[-1]
        latest_build_result_path = (
            f"{_LATEST_SERVER_DEB_GCS_DIR}/{build_result_filename}")
        logging.info("Copying blob %s (%s) -> %s", gcs_blob, gcs_bucket,
                     latest_build_result_path)
        gcs_bucket.copy_blob(gcs_blob,
                             gcs_bucket,
                             new_name=latest_build_result_path)
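
A minimal sketch of how this helper might be invoked, assuming _LATEST_SERVER_DEB_GCS_DIR and GCSUploadError are defined at module level as in the source project; the client setup, bucket name, and build-results path below are placeholders.

from google.cloud import storage

client = storage.Client()
release_bucket = client.bucket("example-release-bucket")  # placeholder bucket name
# Promote the results of one specific build to the '_latest_server_deb' directory.
_UpdateLatestServerDebDirectory(
    release_bucket,
    gcs_build_results_dir="travis_builds/1234/server_deb")  # placeholder path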
Example #2
File: gcs.py Project: werbolis/airflow
def _prepare_sync_plan(
    source_bucket: storage.Bucket,
    destination_bucket: storage.Bucket,
    source_object: Optional[str],
    destination_object: Optional[str],
    recursive: bool,
) -> Tuple[Set[storage.Blob], Set[storage.Blob], Set[storage.Blob]]:
    # Calculate the number of characters to strip from each blob name,
    # because the leading characters carry the parent object's path
    source_object_prefix_len = len(source_object) if source_object else 0
    destination_object_prefix_len = len(
        destination_object) if destination_object else 0
    delimiter = "/" if not recursive else None
    # Fetch the blob lists
    source_blobs = list(
        source_bucket.list_blobs(prefix=source_object, delimiter=delimiter))
    destination_blobs = list(
        destination_bucket.list_blobs(prefix=destination_object,
                                      delimiter=delimiter))
    # Create indexes that allow blobs to be looked up by name
    source_names_index = {
        a.name[source_object_prefix_len:]: a
        for a in source_blobs
    }
    destination_names_index = {
        a.name[destination_object_prefix_len:]: a
        for a in destination_blobs
    }
    # Create sets of names with the parent object name stripped
    source_names = set(source_names_index.keys())
    destination_names = set(destination_names_index.keys())
    # Determine objects to copy and delete
    to_copy = source_names - destination_names
    to_delete = destination_names - source_names
    to_copy_blobs = {
        source_names_index[a] for a in to_copy
    }  # type: Set[storage.Blob]
    to_delete_blobs = {
        destination_names_index[a] for a in to_delete
    }  # type: Set[storage.Blob]
    # Find names that are present in both buckets
    names_to_check = source_names.intersection(destination_names)
    to_rewrite_blobs = set()  # type: Set[storage.Blob]
    # Compare objects based on their crc32c checksums
    for current_name in names_to_check:
        source_blob = source_names_index[current_name]
        destination_blob = destination_names_index[current_name]
        # If the objects differ, mark the source blob for rewrite
        if source_blob.crc32c != destination_blob.crc32c:
            to_rewrite_blobs.add(source_blob)
    return to_copy_blobs, to_delete_blobs, to_rewrite_blobs
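
The plan returned above could be consumed roughly as follows. This is an illustrative sketch, not the Airflow operator's own apply step; the bucket names and prefixes are placeholders.

from google.cloud import storage

client = storage.Client()
src = client.bucket("example-source-bucket")       # placeholder
dst = client.bucket("example-destination-bucket")  # placeholder
to_copy, to_delete, to_rewrite = _prepare_sync_plan(
    source_bucket=src,
    destination_bucket=dst,
    source_object="data/",
    destination_object="backup/",
    recursive=True,
)
for blob in to_copy | to_rewrite:
    # Re-create (or overwrite) the object under the destination prefix.
    new_name = "backup/" + blob.name[len("data/"):]
    src.copy_blob(blob, dst, new_name=new_name)
for blob in to_delete:
    blob.delete()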
Example #3
def get_latest_version_from_bucket(pack_id: str,
                                   production_bucket: Bucket) -> str:
    """ Retrieves the latest version of a pack in the bucket

    Args:
        pack_id (str): The pack id to retrieve the latest version
        production_bucket (Bucket): The GCS production bucket

    Returns: The latest version of the pack as it is in the production bucket

    """
    pack_bucket_path = os.path.join(GCPConfig.STORAGE_BASE_PATH, pack_id)
    logging.debug(
        f'Trying to get latest version for pack {pack_id} from bucket path {pack_bucket_path}'
    )
    # Adding the '/' in the end of the prefix to search for the exact pack id
    pack_versions_paths = [
        f.name
        for f in production_bucket.list_blobs(prefix=f'{pack_bucket_path}/')
        if f.name.endswith('.zip')
    ]
    pack_versions = [
        LooseVersion(PACK_PATH_VERSION_REGEX.findall(path)[0])
        for path in pack_versions_paths
    ]
    logging.debug(
        f'Found the following zips for {pack_id} pack: {pack_versions}')
    if pack_versions:
        pack_latest_version = max(pack_versions).vstring
        return pack_latest_version
    else:
        logging.error(
            f'Could not find any versions for pack {pack_id} in bucket path {pack_bucket_path}'
        )
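
An illustrative call site, assuming GCPConfig and PACK_PATH_VERSION_REGEX are configured by the surrounding project; the bucket name and pack id below are placeholders.

from google.cloud import storage

client = storage.Client()
production_bucket = client.bucket("example-marketplace-bucket")  # placeholder
latest = get_latest_version_from_bucket("ExamplePack", production_bucket)  # placeholder pack id
print(f"Latest uploaded version: {latest}")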
Example #4
def clear_remote_dags_bucket(bucket: Bucket):
    i = 0
    for blob in bucket.list_blobs(prefix='dags'):
        if blob.name not in ['dags/', 'dags/airflow_monitoring.py']:
            print(f"deleting file {_blob_uri(blob)}")
            blob.delete()
            i += 1

    print(f"{i} files deleted")
Example #5
def download_metadata_from_gcs(bucket: storage.Bucket,
                               local_sample_path: ComparisonPath) -> None:
    (local_sample_path / "operations").mkdir_p()

    prefix = str(local_sample_path)
    blobs = bucket.list_blobs(prefix=prefix)
    for blob in blobs:
        if not blob.name.endswith('/digest.json'):
            logging.info(f'Downloading blob: {blob.name}')
            blob.download_to_filename(blob.name)
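
A rough sketch of a call, assuming ComparisonPath from the surrounding project behaves like a path object supporting the "/" operator and mkdir_p(); the constructor, bucket name, and sample path shown here are hypothetical.

from google.cloud import storage

client = storage.Client()
metadata_bucket = client.bucket("example-metadata-bucket")  # placeholder
sample_path = ComparisonPath.create("exome_sample_1")       # hypothetical constructor
download_metadata_from_gcs(metadata_bucket, sample_path)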
Example #6
def timeSort(bucket: Bucket,
             prefix: str,
             num: Optional[int] = None) -> List[Image]:
    blobs = bucket.list_blobs(prefix=prefix)
    imgs = [
        Image(el.public_url) for el in blobs if el.public_url.endswith(".png")
    ]
    simgs = sorted(imgs, key=lambda x: (x.date, x.seq), reverse=True)

    if num:
        return simgs[:num]
    return simgs
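
An illustrative call, assuming the Image class sorts on date and seq as above; the bucket name and prefix are placeholders.

from google.cloud import storage

client = storage.Client()
images_bucket = client.bucket("example-images-bucket")  # placeholder
ten_newest = timeSort(images_bucket, prefix="renders/", num=10)
print(f"Found {len(ten_newest)} recent PNG images")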
Example #7
def sync_gcs_to_box(bucket: Bucket, box: BoxClient,
                    cache: dict) -> List[Future]:
    # construct an executor for copy tasks
    executor = ThreadPoolExecutor(max_workers=cpu_count())
    futures = []

    for blob in bucket.list_blobs():
        if cache.get(blob.name, False):
            # Found the blob in Box
            LOG.debug("Blob {} already in Box.".format(blob.name))

        else:
            # Did not find the Blob in box
            if blob.metadata and blob.metadata.get(BOX_MTIME_KEY):
                LOG.info(
                    "Found blob {} in bucket that was synced, but no longer exists in Box. Deleting."
                    .format(blob.name))
                blob.delete()

            else:
                if blob.name[-1] == '/':
                    LOG.info(
                        "Found new folder {} not in Box. Creating.".format(
                            blob.name))
                    path = blob.name.split("/")[:-1]
                    # do this serially, as there should be few.
                    # Ideally, box_mkdir_p never misses cache when making files as the folder will sort first
                    box_mkdir_p(box, path, cache)
                else:
                    # Found a file that doesn't seem to be in Box.
                    blob_name = blob.name
                    LOG.info("Found new blob {} not in Box. Uploading.".format(
                        blob_name))
                    # split name by slashes; last item is file, the previous are folders
                    tokens = blob.name.split("/")
                    path, filename = tokens[:-1], tokens[-1]
                    target_folder = box_mkdir_p(box, path, cache)
                    # prepare the copy
                    temp_file = BytesIO()
                    reader = blob.download_to_file
                    writer = lambda temp: target_folder.upload_stream(
                        temp, filename)
                    transfer_callback = lambda bf: patch_blob_metadata(
                        bucket, blob_name, bf)
                    # submit the copy work
                    future = executor.submit(concurrent_upload, reader, writer,
                                             temp_file, transfer_callback)
                    futures.append(future)

    return futures
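
A hypothetical driver for the sync; how the BoxClient and the name-to-item cache are built is project-specific, so both helpers below are placeholders.

from concurrent.futures import wait
from google.cloud import storage

client = storage.Client()
sync_bucket = client.bucket("example-sync-bucket")  # placeholder
box = make_box_client()                             # hypothetical BoxClient factory
cache = build_box_name_cache(box)                   # hypothetical: maps blob names to Box items
futures = sync_gcs_to_box(sync_bucket, box, cache)
wait(futures)  # block until all queued uploads complete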
Example #8
def get_files(client: storage.Client, bucket: storage.Bucket) -> List[dict]:
    """Retrieves all files in a given GCS bucket

    Args:
        client: Object representing Python GCS client
        bucket: google.cloud.storage.Bucket (or its name) to list files from

    Returns:
       List of dicts [{name: String holding the file name,
                       type: String holding the file's content type, e.g. 'audio/flac'
                      }]
    """
    bucket = client.get_bucket(bucket)
    return [{
        'name': blob.name,
        'type': blob.content_type
    } for blob in list(bucket.list_blobs())]
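
An example call; the bucket name is a placeholder, and Client.get_bucket accepts either a Bucket instance or a bucket name.

from google.cloud import storage

client = storage.Client()
files = get_files(client, client.bucket("example-audio-bucket"))  # placeholder bucket
flac_files = [f for f in files if f["type"] == "audio/flac"]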
Example #9
def get_latest_version_from_bucket(pack_id: str,
                                   production_bucket: Bucket) -> str:
    """ Retrieves the latest version of a pack in the bucket

    Args:
        pack_id (str): The pack id to retrieve the latest version
        production_bucket (Bucket): The GCS production bucket

    Returns: The latest version of the pack as it is in the production bucket

    """
    pack_bucket_path = os.path.join(GCPConfig.STORAGE_BASE_PATH, pack_id)
    # Adding the '/' in the end of the prefix to search for the exact pack id
    pack_versions_paths = [
        f.name
        for f in production_bucket.list_blobs(prefix=f'{pack_bucket_path}/')
        if f.name.endswith('.zip')
    ]
    pack_versions = [
        LooseVersion(PACK_PATH_VERSION_REGEX.findall(path)[0])
        for path in pack_versions_paths
    ]
    pack_latest_version = max(pack_versions).vstring
    return pack_latest_version