def test_file_retrieval(self):
  """File-backed artifacts resolve to themselves and stream in chunk_size pieces."""
  manager = InMemoryFileManager({
      'path/to/a': b'a', 'path/to/b': b'b' * 37
  })
  service = artifact_service.ArtifactRetrievalService(
      manager.file_reader, chunk_size=10)
  a_dep = self.file_artifact('path/to/a')
  # Resolution of a plain file artifact is the identity.
  self.assertEqual(
      service.ResolveArtifacts(
          beam_artifact_api_pb2.ResolveArtifactsRequest(artifacts=[a_dep])),
      beam_artifact_api_pb2.ResolveArtifactsResponse(replacements=[a_dep]))
  # A one-byte artifact arrives as a single response.
  self.assertEqual(
      list(
          service.GetArtifact(
              beam_artifact_api_pb2.GetArtifactRequest(artifact=a_dep))),
      [beam_artifact_api_pb2.GetArtifactResponse(data=b'a')])
  # A 37-byte artifact is split into 10+10+10+7 byte chunks.
  self.assertEqual(
      list(
          service.GetArtifact(
              beam_artifact_api_pb2.GetArtifactRequest(
                  artifact=self.file_artifact('path/to/b')))),
      [
          beam_artifact_api_pb2.GetArtifactResponse(data=b'b' * size)
          for size in (10, 10, 10, 7)
      ])
def test_embedded_retrieval(self):
  """An EMBEDDED artifact's payload is returned directly as one response."""
  service = artifact_service.ArtifactRetrievalService(None)
  embedded_dep = self.embedded_artifact(b'some_data')
  request = beam_artifact_api_pb2.GetArtifactRequest(artifact=embedded_dep)
  responses = list(service.GetArtifact(request))
  self.assertEqual(
      responses,
      [beam_artifact_api_pb2.GetArtifactResponse(data=b'some_data')])
def test_url_retrieval(self):
  """A file:-URL artifact streams back the exact bytes of the target file."""
  service = artifact_service.ArtifactRetrievalService(None)
  url_dep = beam_runner_api_pb2.ArtifactInformation(
      type_urn=common_urns.artifact_types.URL.urn,
      type_payload=beam_runner_api_pb2.ArtifactUrlPayload(
          url='file:' + quote(__file__)).SerializeToString())
  chunks = service.GetArtifact(
      beam_artifact_api_pb2.GetArtifactRequest(artifact=url_dep))
  content = b''.join(chunk.data for chunk in chunks)
  # Use this very test file as a convenient known local file.
  with open(__file__, 'rb') as fin:
    self.assertEqual(content, fin.read())
def store_artifact(artifact, service, dest_dir):
  """Fetches an artifact via the retrieval service and stores it on disk.

  The artifact's bytes are streamed into a new file under dest_dir while a
  SHA-256 digest is computed over the same stream.

  Args:
    artifact: the ArtifactInformation proto to fetch.
    service: a retrieval service exposing GetArtifact.
    dest_dir: directory in which the artifact file is created.

  Returns:
    A FILE-typed ArtifactInformation pointing at the stored copy, carrying
    the computed sha256 and the original artifact's role.
  """
  digest = hashlib.sha256()
  # delete=False: the file must outlive this function; callers use its path.
  with tempfile.NamedTemporaryFile(dir=dest_dir, delete=False) as fout:
    request = beam_artifact_api_pb2.GetArtifactRequest(artifact=artifact)
    for chunk in service.GetArtifact(request):
      digest.update(chunk.data)
      fout.write(chunk.data)
  return beam_runner_api_pb2.ArtifactInformation(
      type_urn=common_urns.artifact_types.FILE.urn,
      type_payload=beam_runner_api_pb2.ArtifactFilePayload(
          path=fout.name, sha256=digest.hexdigest()).SerializeToString(),
      role_urn=artifact.role_urn,
      role_payload=artifact.role_payload)
def resolve_as_files(retrieval_service, file_writer, dependencies):
  """Translates a set of dependencies into file-based dependencies.

  Args:
    retrieval_service: service exposing ResolveArtifacts and GetArtifact,
      used to resolve and then fetch each dependency's bytes.
    file_writer: callable taking a unique name and returning a
      (file_handle, path) pair used to persist the artifact's bytes.
    dependencies: iterable of ArtifactInformation protos to materialize.

  Yields:
    FILE-typed ArtifactInformation protos pointing at the written files,
    each preserving the original dependency's role.
  """
  # Resolve until nothing changes. This ensures that they can be fetched.
  # BUG FIX: the RPC is named ResolveArtifacts; the previous call to
  # ResolveArtifactss (double "s") would raise AttributeError at runtime.
  resolution = retrieval_service.ResolveArtifacts(
      beam_artifact_api_pb2.ResolveArtifactsRequest(
          artifacts=dependencies,
          # Anything fetchable will do.
          # TODO(robertwb): Take advantage of shared filesystems, urls.
          preferred_urns=[],
      ))
  dependencies = resolution.replacements

  # Fetch each of the dependencies, using file_writer to store them as
  # file-based artifacts.
  # TODO(robertwb): Consider parallelizing the actual writes.
  for dep in dependencies:
    if dep.role_urn == common_urns.artifact_roles.STAGING_TO.urn:
      base_name = os.path.basename(
          proto_utils.parse_Bytes(
              dep.role_payload,
              beam_runner_api_pb2.ArtifactStagingToRolePayload).staged_name)
    else:
      base_name = None
    # Hash of the full proto guarantees uniqueness; the optional base_name
    # keeps the staged filename human-recognizable.
    unique_name = '-'.join(
        filter(
            None,
            [hashlib.sha256(dep.SerializeToString()).hexdigest(), base_name]))
    file_handle, path = file_writer(unique_name)
    with file_handle as fout:
      for chunk in retrieval_service.GetArtifact(
          beam_artifact_api_pb2.GetArtifactRequest(artifact=dep)):
        fout.write(chunk.data)
    yield beam_runner_api_pb2.ArtifactInformation(
        type_urn=common_urns.artifact_types.FILE.urn,
        type_payload=beam_runner_api_pb2.ArtifactFilePayload(
            path=path).SerializeToString(),
        role_urn=dep.role_urn,
        role_payload=dep.role_payload)
def retrieve_artifact(retrieval_service, retrieval_token, name):
  """Fetches a named artifact and returns its full contents as bytes."""
  request = beam_artifact_api_pb2.GetArtifactRequest(
      retrieval_token=retrieval_token, name=name)
  parts = [chunk.data for chunk in retrieval_service.GetArtifact(request)]
  return b''.join(parts)