def shuffle(job_name, input_file_list, output_file_list, callback):
    """Submit a shuffle job for mapreduce files to the shuffler service.

    Args:
      job_name: unique name of the shuffle job, as a string.
      input_file_list: files api file names to shuffle. Each file should be
        in records format, every record being a serialized KeyValue
        protocol buffer.
      output_file_list: files api file names that receive the shuffle
        result. These files should not be finalized. They will be in records
        format, every record being a serialized KeyValues protocol buffer.
      callback: shuffle service call back specification. Either a url (a
        task is then enqueued in the default queue with default parameters)
        or a dict with the following keys:
          url: url to call back (always read when a dict is given).
          version: app version to call (optional).
          method: HTTP method to use, POST or GET (optional).
          queue: name of the queue to enqueue the task in (optional).

    Raises:
      ShufflerUnavailableError: if the shuffler service is not available.
    """
    if not available():
        raise ShufflerUnavailableError()

    request = file_service_pb.ShuffleRequest()
    request.set_shuffle_name(job_name)

    # Fill in the callback sub-message from either the dict or the plain url.
    callback_pb = request.mutable_callback()
    if isinstance(callback, dict):
        callback_pb.set_url(callback["url"])
        optional_fields = (
            ("version", callback_pb.set_app_version_id),
            ("method", callback_pb.set_method),
            ("queue", callback_pb.set_queue),
        )
        for key, apply_value in optional_fields:
            if key in callback:
                apply_value(callback[key])
    else:
        callback_pb.set_url(callback)

    request.set_shuffle_size_bytes(0)

    # One input entry per source file, all in key/value records format.
    for input_path in input_file_list:
        input_spec = request.add_input()
        input_spec.set_format(
            file_service_pb.ShuffleEnums.RECORDS_KEY_VALUE_PROTO_INPUT)
        input_spec.set_path(input_path)

    # A single output entry that lists every destination file.
    output_spec = request.mutable_output()
    output_spec.set_format(
        file_service_pb.ShuffleEnums.RECORDS_KEY_MULTI_VALUE_PROTO_OUTPUT)
    for output_path in output_file_list:
        output_spec.add_path(output_path)

    response = file_service_pb.ShuffleResponse()
    files._make_call("Shuffle", request, response)
def _listdir_local(path, kwargs):
    """Dev app server version of listdir.

    See listdir for doc.

    Args:
      path: the path to list; sent as the path of the ListDir request.
      kwargs: optional dict of listing options; may be None or empty. The
        keys 'marker', 'max-keys' and 'prefix' are forwarded to the request
        when present. Any other keys are ignored here.

    Returns:
      The value of response.filenames_list() after the 'ListDir' call.
    """
    request = file_service_pb.ListDirRequest()
    response = file_service_pb.ListDirResponse()
    request.set_path(path)

    # Treat a missing/empty kwargs as "no options" once, instead of
    # re-checking it for every key.
    options = kwargs or {}

    # Use the `in` operator rather than dict.has_key(): has_key() is
    # deprecated and does not exist on Python 3 dicts.
    if 'marker' in options:
        request.set_marker(options['marker'])
    if 'max-keys' in options:
        request.set_max_keys(options['max-keys'])
    if 'prefix' in options:
        request.set_prefix(options['prefix'])

    files._make_call('ListDir', request, response)
    return response.filenames_list()