Example #1
def process_hpo_copy(hpo_id):
    """copies over files from hpo bucket to drc bucket

    :hpo_id: hpo from which to copy
    """
    try:
        project_id = app_identity.get_application_id()
        storage_client = StorageClient(project_id)
        hpo_bucket = storage_client.get_hpo_bucket(hpo_id)
        drc_private_bucket = storage_client.get_drc_bucket()
        source_bucket = storage_client.bucket(hpo_bucket)
        destination_bucket = storage_client.bucket(drc_private_bucket)
        # list object metadata for everything currently in the HPO bucket
        bucket_items = list_bucket(hpo_bucket)

        ignored_items = 0
        filtered_bucket_items = []
        for item in bucket_items:
            item_root = item['name'].split('/')[0] + '/'
            if item_root.lower() in common.IGNORE_DIRECTORIES:
                ignored_items += 1
            else:
                filtered_bucket_items.append(item)

        logging.info(f"Ignoring {ignored_items} items in {hpo_bucket}")

        prefix = f'{hpo_id}/{hpo_bucket}/'

        for item in filtered_bucket_items:
            item_name = item['name']
            source_blob = source_bucket.get_blob(item_name)
            destination_blob_name = f'{prefix}{item_name}'
            source_bucket.copy_blob(source_blob, destination_bucket,
                                    destination_blob_name)
    except BucketDoesNotExistError as bucket_error:
        bucket = bucket_error.bucket
        # App Engine turns an env var that is set but left empty into the string 'None'
        if bucket and bucket.lower() != 'none':
            logging.warning(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' does not exist"
            )
        else:
            logging.info(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' is empty/unset"
            )
    except HttpError as http_error:
        message = (
            f"Failed to copy files for hpo_id '{hpo_id}' due to the following "
            f"HTTP error: {http_error.content.decode()}")
        logging.exception(message)
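
For reference, the copy above renames each object by prepending the prefix f'{hpo_id}/{hpo_bucket}/' and skips any object whose top-level folder appears in common.IGNORE_DIRECTORIES. Below is a minimal, self-contained sketch of just that filter-and-rename step; the ignore set and object names are hypothetical, not taken from common.

def destination_names(hpo_id, hpo_bucket, item_names, ignore_dirs):
    """Yield (source_name, destination_name) pairs, skipping ignored folders."""
    prefix = f'{hpo_id}/{hpo_bucket}/'
    for name in item_names:
        item_root = name.split('/')[0] + '/'
        if item_root.lower() in ignore_dirs:
            continue
        yield name, f'{prefix}{name}'

# Hypothetical ignore set and object names, for illustration only
ignore_dirs = {'participant/'}
names = ['person.csv', 'participant/ignored.csv']
print(list(destination_names('hpo_abc', 'hpo_abc_bucket', names, ignore_dirs)))
# [('person.csv', 'hpo_abc/hpo_abc_bucket/person.csv')]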
Example #2
class RetractDataGcsTest(TestCase):

    @classmethod
    def setUpClass(cls):
        print('**************************************************************')
        print(cls.__name__)
        print('**************************************************************')

    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.hpo_id = test_util.FAKE_HPO_ID
        self.bucket = os.environ.get('BUCKET_NAME_FAKE')
        self.site_bucket = 'test_bucket'
        self.folder_1 = '2019-01-01-v1/'
        self.folder_2 = '2019-02-02-v2/'
        self.client = StorageClient(self.project_id)
        self.folder_prefix_1 = f'{self.hpo_id}/{self.site_bucket}/{self.folder_1}'
        self.folder_prefix_2 = f'{self.hpo_id}/{self.site_bucket}/{self.folder_2}'
        self.pids = [17, 20]
        self.skip_pids = [10, 25]
        self.project_id = 'project_id'
        self.sandbox_dataset_id = os.environ.get('UNIONED_DATASET_ID')
        self.pid_table_id = 'pid_table'
        self.gcs_bucket = self.client.bucket(self.bucket)
        self.client.empty_bucket(self.gcs_bucket)

    @patch('retraction.retract_data_gcs.extract_pids_from_table')
    @patch('gcs_utils.get_drc_bucket')
    @patch('gcs_utils.get_hpo_bucket')
    def test_integration_five_person_data_retraction_skip(
        self, mock_hpo_bucket, mock_bucket, mock_extract_pids):
        mock_hpo_bucket.return_value = self.site_bucket
        mock_bucket.return_value = self.bucket
        mock_extract_pids.return_value = self.skip_pids
        lines_to_remove = {}
        expected_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            # generate results files
            file_name = file_path.split('/')[-1]
            lines_to_remove[file_name] = 0
            with open(file_path, 'rb') as f:
                # skip header
                next(f)
                expected_lines_post[file_name] = []
                for line in f:
                    line = line.strip()
                    if line != b'':
                        expected_lines_post[file_name].append(line)

                # write file to cloud for testing
                blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')
                blob = self.gcs_bucket.blob(self.folder_prefix_2 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')

        rd.run_gcs_retraction(self.project_id,
                              self.sandbox_dataset_id,
                              self.pid_table_id,
                              self.hpo_id,
                              folder='all_folders',
                              force_flag=True,
                              bucket=self.bucket,
                              site_bucket=self.site_bucket)

        total_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            file_name = file_path.split('/')[-1]
            blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
            actual_result_contents = blob.download_as_string().split(b'\n')
            # drop the header and the trailing empty element left by the final newline
            total_lines_post[file_name] = actual_result_contents[1:-1]

        for key in expected_lines_post:
            self.assertEqual(lines_to_remove[key], 0)
            self.assertListEqual(expected_lines_post[key],
                                 total_lines_post[key])

    @patch('retraction.retract_data_gcs.extract_pids_from_table')
    @patch('gcs_utils.get_drc_bucket')
    @patch('gcs_utils.get_hpo_bucket')
    def test_integration_five_person_data_retraction(self, mock_hpo_bucket,
                                                     mock_bucket,
                                                     mock_extract_pids):
        mock_hpo_bucket.return_value = self.site_bucket
        mock_bucket.return_value = self.bucket
        mock_extract_pids.return_value = self.pids
        expected_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            # generate results files
            file_name = file_path.split('/')[-1]
            table_name = file_name.split('.')[0]
            expected_lines_post[file_name] = []
            with open(file_path, 'rb') as f:
                # skip header
                next(f)
                for line in f:
                    line = line.strip()
                    if line != b'':
                        if not ((table_name in rd.PID_IN_COL1 and
                                 int(line.split(b",")[0]) in self.pids) or
                                (table_name in rd.PID_IN_COL2 and
                                 int(line.split(b",")[1]) in self.pids)):
                            expected_lines_post[file_name].append(line)

                # write file to cloud for testing
                blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')
                blob = self.gcs_bucket.blob(self.folder_prefix_2 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')

        rd.run_gcs_retraction(self.project_id,
                              self.sandbox_dataset_id,
                              self.pid_table_id,
                              self.hpo_id,
                              folder='all_folders',
                              force_flag=True,
                              bucket=self.bucket,
                              site_bucket=self.site_bucket)

        total_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            file_name = file_path.split('/')[-1]
            blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
            actual_result_contents = blob.download_as_string().split(b'\n')
            # drop the header and the trailing empty element left by the final newline
            total_lines_post[file_name] = actual_result_contents[1:-1]

        for key in expected_lines_post:
            self.assertListEqual(expected_lines_post[key],
                                 total_lines_post[key])

    def tearDown(self):
        self.client.empty_bucket(self.gcs_bucket)
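
The second test above derives its expected output by dropping every non-empty CSV row whose person id appears in the retracted pid list, using the table's pid column position (rd.PID_IN_COL1 vs. rd.PID_IN_COL2). A standalone sketch of that row filter follows, with made-up table sets standing in for the rd constants.

# Made-up stand-ins for rd.PID_IN_COL1 / rd.PID_IN_COL2
PID_IN_COL1 = {'person', 'death'}
PID_IN_COL2 = {'observation', 'measurement'}

def rows_after_retraction(table_name, rows, pids):
    """Return the non-empty rows that survive retraction of the given pids."""
    kept = []
    for line in rows:
        line = line.strip()
        if line == b'':
            continue
        cols = line.split(b',')
        if table_name in PID_IN_COL1 and int(cols[0]) in pids:
            continue
        if table_name in PID_IN_COL2 and int(cols[1]) in pids:
            continue
        kept.append(line)
    return kept

# Retracting pid 17 from a 'person' file keeps only the remaining rows
print(rows_after_retraction('person', [b'17,1990-01-01', b'20,1985-05-05'], {17}))
# [b'20,1985-05-05']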
Example #3
def _validation_done(bucket, folder):
    """Return True if the processed marker blob exists in the given folder."""
    project_id = app_identity.get_application_id()
    storage_client = StorageClient(project_id)
    bucket = storage_client.bucket(bucket)
    return Blob(bucket=bucket,
                name=f'{folder}{common.PROCESSED_TXT}').exists(storage_client)
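
A hedged usage sketch for _validation_done: a caller might use it to skip submission folders that already contain the processed marker. The bucket and folder names below are made up.

for folder in ('2019-01-01-v1/', '2019-02-02-v2/'):  # hypothetical folder prefixes
    if _validation_done('fake-hpo-bucket', folder):
        continue  # marker blob already present; validation finished earlier
    # ...otherwise run validation and finish by writing f'{folder}{common.PROCESSED_TXT}'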
Example #4
class GcsClientTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        print('**************************************************************')
        print(cls.__name__)
        print('**************************************************************')

    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.client = StorageClient(self.project_id)
        self.bucket_name: str = os.environ.get('BUCKET_NAME_FAKE')
        self.prefix: str = 'prefix'
        self.data: bytes = b'bytes'

        # NOTE: this needs to be in sorted order
        self.sub_prefixes: tuple = (f'{self.prefix}/a', f'{self.prefix}/b',
                                    f'{self.prefix}/c', f'{self.prefix}/d')
        self.client.empty_bucket(self.bucket_name)
        self._stage_bucket()

    def test_get_bucket_items_metadata(self):

        items_metadata: list = self.client.get_bucket_items_metadata(
            self.bucket_name)

        actual_metadata: list = [item['name'] for item in items_metadata]
        expected_metadata: list = [
            f'{prefix}/obj.txt' for prefix in self.sub_prefixes
        ]

        self.assertCountEqual(actual_metadata, expected_metadata)
        self.assertIsNotNone(items_metadata[0]['id'])

    def test_get_blob_metadata(self):

        bucket = self.client.get_bucket(self.bucket_name)
        blob_name: str = f'{self.sub_prefixes[0]}/obj.txt'

        blob = bucket.blob(blob_name)
        metadata: dict = self.client.get_blob_metadata(blob)

        self.assertIsNotNone(metadata['id'])
        self.assertIsNotNone(metadata['name'])
        self.assertIsNotNone(metadata['bucket'])
        self.assertIsNotNone(metadata['generation'])
        self.assertIsNotNone(metadata['metageneration'])
        self.assertIsNotNone(metadata['contentType'])
        self.assertIsNotNone(metadata['storageClass'])
        self.assertIsNotNone(metadata['size'])
        self.assertIsNotNone(metadata['md5Hash'])
        self.assertIsNotNone(metadata['crc32c'])
        self.assertIsNotNone(metadata['etag'])
        self.assertIsNotNone(metadata['updated'])
        self.assertIsNotNone(metadata['timeCreated'])

        self.assertEqual(metadata['name'], blob_name)
        self.assertEqual(metadata['size'], len(self.data))

    def test_empty_bucket(self):

        self.client.empty_bucket(self.bucket_name)
        items: list = self.client.list_blobs(self.bucket_name)

        # check that bucket is empty
        self.assertCountEqual(items, [])

    def test_list_sub_prefixes(self):

        items: list = self.client.list_sub_prefixes(self.bucket_name,
                                                    self.prefix)

        # Check same number of elements
        self.assertEqual(len(self.sub_prefixes), len(items))

        # Check same prefix
        for index, item in enumerate(items):
            self.assertEqual(item[:-1], self.sub_prefixes[index])

    def _stage_bucket(self):

        bucket = self.client.bucket(self.bucket_name)
        for sub_prefix in self.sub_prefixes:
            blob = bucket.blob(f'{sub_prefix}/obj.txt')
            blob.upload_from_string(self.data)

    def tearDown(self):
        self.client.empty_bucket(self.bucket_name)
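
test_list_sub_prefixes depends on the staged layout prefix/{a,b,c,d}/obj.txt and on list_sub_prefixes returning delimiter-terminated prefixes, which the assertion trims with item[:-1]. A tiny pure-Python illustration of that assumed relationship (no GCS calls):

sub_prefixes = ('prefix/a', 'prefix/b', 'prefix/c', 'prefix/d')
# Assumed return shape: each sub-prefix comes back with a trailing '/'
returned = [f'{p}/' for p in sub_prefixes]
assert [item[:-1] for item in returned] == list(sub_prefixes)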