def assert_new_tmp_bucket(self, location, **runner_kwargs):
    """Assert that if we create a DataprocJobRunner with the given
    keyword args, it'll create a new tmp bucket with the given location
    constraint.
    """
    bucket_cache = self._gcs_client._cache_buckets
    existing_buckets = set(bucket_cache.keys())

    runner = DataprocJobRunner(conf_paths=[], **runner_kwargs)

    bucket_name, path = parse_gcs_uri(runner._cloud_tmp_dir)
    runner._create_fs_tmp_bucket(bucket_name, location=location)

    # the tmp bucket should be newly created, named mrjob-*, and point
    # at a tmp/ prefix
    self.assertTrue(bucket_name.startswith('mrjob-'))
    self.assertNotIn(bucket_name, existing_buckets)
    self.assertEqual(path, 'tmp/')

    current_bucket = bucket_cache[bucket_name]
    self.assertEqual(current_bucket['location'], location)

    # Verify that we set up a bucket lifecycle rule with 28-day retention
    first_lifecycle_rule = current_bucket['lifecycle']['rule'][0]
    self.assertEqual(first_lifecycle_rule['action'], dict(type='Delete'))
    self.assertEqual(first_lifecycle_rule['condition'],
                     dict(age=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS))

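# A sketch of how this helper might back an individual test; the 'region'
# keyword and 'us-west1' location are assumptions for illustration, not
# taken from the source.
def test_explicit_region(self):
    self.assert_new_tmp_bucket('us-west1', region='us-west1')
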
def _launch_cluster(self):
    """Create an empty cluster on Dataproc, and set self._cluster_id to
    its ID."""
    bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
    self._create_fs_tmp_bucket(bucket_name)

    # clusterName must be a match of
    # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
    # as documented in an API error message
    # (not currently documented in the Dataproc docs)
    if not self._cluster_id:
        self._cluster_id = '-'.join(
            ['mrjob', self._gce_zone.lower(), random_identifier()])

    # Create the cluster if it's missing, otherwise join an existing one
    try:
        self._api_cluster_get(self._cluster_id)
        log.info('Adding job to existing cluster - %s' % self._cluster_id)
    except google_errors.HttpError as e:
        if not e.resp.status == 404:
            raise

        log.info('Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

        cluster_data = self._cluster_create_args()

        self._api_cluster_create(cluster_data)

        self._wait_for_cluster_ready(self._cluster_id)

    # keep track of when we launched our job
    self._dataproc_job_start = time.time()

    return self._cluster_id

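# A standalone sketch of validating generated cluster IDs against the
# clusterName regex quoted above (dropping what appears to be a trailing
# sentence period in the quoted error message); the sample IDs are made
# up for illustration.
import re

_CLUSTER_NAME_RE = re.compile(r'(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?)')

def _is_valid_cluster_name(name):
    # the API requires the whole name to match, so anchor with fullmatch
    return _CLUSTER_NAME_RE.fullmatch(name) is not None

assert _is_valid_cluster_name('mrjob-us-central1-b-3f2a9c1d8e7b6a54')
assert not _is_valid_cluster_name('Mrjob-Cluster')  # no uppercase allowed
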
def put_gcs(self, gcs_uri, data):
    """Put data at gcs_uri, creating a bucket if necessary"""
    bucket, name = parse_gcs_uri(gcs_uri)

    try:
        self._fs.get_bucket(bucket)
    except google_errors.HttpError:
        self._fs.create_bucket(project=_TEST_PROJECT, name=bucket)

    bytes_io_obj = BytesIO(data)
    self.upload_io(bytes_io_obj, gcs_uri)

def put_gcs_multi(self, gcs_uri_to_data_map):
    """Upload each gcs_uri -> bytes pair, creating buckets as needed."""
    client = self.storage_client()

    for uri, data in gcs_uri_to_data_map.items():
        bucket_name, blob_name = parse_gcs_uri(uri)
        bucket = client.bucket(bucket_name)
        if not bucket.exists():
            bucket.create()

        blob = bucket.blob(blob_name)
        blob.upload_from_string(data)

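# A minimal usage sketch (e.g. seeding input data in a test); the URIs
# and payloads are made up for illustration.
self.put_gcs_multi({
    'gs://walrus/data/foo': b'foo\nfoo\n',
    'gs://walrus/data/bar': b'bar\nbar\n',
})
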
def _upload_local_files_to_fs(self):
    """Copy local files tracked by self._upload_mgr to FS."""
    bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
    self._create_fs_tmp_bucket(bucket_name)

    log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

    for path, gcs_uri in self._upload_mgr.path_to_uri().items():
        log.debug('uploading %s -> %s' % (path, gcs_uri))
        self.fs.put(path, gcs_uri, chunk_size=self._fs_chunk_size())

    self._wait_for_fs_sync()

def _upload_local_files_to_fs(self):
    """Copy local files tracked by self._upload_mgr to FS."""
    bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
    self._create_fs_tmp_bucket(bucket_name)

    log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

    for path, gcs_uri in self._upload_mgr.path_to_uri().items():
        log.debug('uploading %s -> %s' % (path, gcs_uri))
        # TODO - mtai @ davidmarin - Implement put function for other FSs
        self.fs.put(path, gcs_uri)

    self._wait_for_fs_sync()

def download_io(self, src_uri, io_obj):
    """Clobber GCSFilesystem._download_io"""
    bucket, name = parse_gcs_uri(src_uri)

    object_dict = _get_deep(self._cache_objects, [bucket, name])

    if not object_dict:
        raise Exception('%s does not exist' % src_uri)

    object_data = object_dict['_data']

    io_obj.write(object_data)
    return io_obj

def _test_cloud_tmp_cleanup(self, mode, tmp_len):
    stdin = BytesIO(b"foo\nbar\n")
    mr_job = MRTwoStepJob(
        ["-r", "dataproc", "-v", "-", "--cleanup", mode])
    mr_job.sandbox(stdin=stdin)

    with mr_job.make_runner() as runner:
        tmp_bucket, _ = parse_gcs_uri(runner._cloud_tmp_dir)

        runner.run()

        # this is set and unset before we can get at it unless we do this
        list(runner.stream_output())

    # cleanup runs when the with block exits
    objects_in_bucket = self._gcs_fs.api_client._cache_objects[tmp_bucket]
    self.assertEqual(len(objects_in_bucket), tmp_len)

def _test_cloud_tmp_cleanup(self, mode, tmp_len):
    stdin = BytesIO(b'foo\nbar\n')
    mr_job = MRTwoStepJob(
        ['-r', 'dataproc', '-v', '-', '--cleanup', mode])
    mr_job.sandbox(stdin=stdin)

    with mr_job.make_runner() as runner:
        tmp_bucket, _ = parse_gcs_uri(runner._cloud_tmp_dir)

        runner.run()

        # this is set and unset before we can get at it unless we do this
        list(runner.cat_output())

    # cleanup runs when the with block exits
    objects_in_bucket = self._gcs_fs.api_client._cache_objects[tmp_bucket]
    self.assertEqual(len(objects_in_bucket), tmp_len)

def upload_io(self, io_obj, dest_uri):
    """Clobber GCSFilesystem._upload_io"""
    bucket, name = parse_gcs_uri(dest_uri)

    assert bucket in self._cache_buckets

    io_obj.seek(0)
    data = io_obj.read()

    # TODO - io_obj.close() ? Not sure if callers of this function would
    # expect their io_objs to be closed

    object_resp = _insert_object_resp(bucket=bucket, name=name, data=data)
    _set_deep(self._cache_objects, [bucket, name], object_resp)

    return object_resp

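# A round-trip sketch through the mock upload_io/download_io pair above;
# 'mock_fs' and the gs:// URI are made up, and the bucket is assumed to
# already exist in _cache_buckets (upload_io asserts this).
out = BytesIO()
mock_fs.upload_io(BytesIO(b'hello\n'), 'gs://mock-bucket/hello.txt')
mock_fs.download_io('gs://mock-bucket/hello.txt', out)
assert out.getvalue() == b'hello\n'
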
def _test_cloud_tmp_cleanup(self, mode, tmp_len):
    stdin = BytesIO(b'foo\nbar\n')
    mr_job = MRTwoStepJob(
        ['-r', 'dataproc', '-v', '-', '--cleanup', mode])
    mr_job.sandbox(stdin=stdin)

    with mr_job.make_runner() as runner:
        tmp_bucket, _ = parse_gcs_uri(runner._cloud_tmp_dir)

        runner.run()

        # this is set and unset before we can get at it unless we do this
        list(runner.cat_output())

        fs = runner.fs

    # with statement finishes, cleanup runs
    self.assertEqual(
        len(list(fs.client.bucket(tmp_bucket).list_blobs())),
        tmp_len)

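# A sketch of how this helper might drive individual cleanup-mode tests;
# the expected object counts are assumptions for illustration, not taken
# from the source.
def test_cleanup_all(self):
    # with --cleanup ALL, no tmp objects should survive
    self._test_cloud_tmp_cleanup('ALL', 0)

def test_cleanup_none(self):
    # with --cleanup NONE, tmp objects remain (the count here is made up)
    self._test_cloud_tmp_cleanup('NONE', 5)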