def _to_map_job_config(cls, mr_spec, queue_name):
  """Converts model.MapreduceSpec back to JobConfig.

  This method allows our internal methods to use JobConfig directly.
  It also allows us to expose JobConfig as an API during execution,
  even though it is not saved into the datastore.

  Args:
    mr_spec: model.MapreduceSpec.
    queue_name: queue name.

  Returns:
    The JobConfig object for this job.
  """
  mapper_spec = mr_spec.mapper
  # api_version 0 means the job was started with the old (pre api_version) API.
  api_version = mr_spec.params.get("api_version", 0)
  old_api = api_version == 0

  # New style input readers serialize their params to JSON; deserialize them
  # back into a params object before constructing the JobConfig.
  input_reader_cls = mapper_spec.input_reader_class()
  input_reader_params = input_readers._get_params(mapper_spec)
  if issubclass(input_reader_cls, input_reader.InputReader):
    input_reader_params = input_reader_cls.params_from_json(
        input_reader_params)

  output_writer_cls = mapper_spec.output_writer_class()
  output_writer_params = output_writers._get_params(mapper_spec)

  return cls(
      _lenient=old_api,
      job_name=mr_spec.name,
      job_id=mr_spec.mapreduce_id,
      mapper=util.for_name(mapper_spec.handler_spec),
      input_reader_cls=input_reader_cls,
      input_reader_params=input_reader_params,
      output_writer_cls=output_writer_cls,
      output_writer_params=output_writer_params,
      shard_count=mapper_spec.shard_count,
      queue_name=queue_name,
      user_params=mr_spec.params.get("user_params"),
      shard_max_attempts=mr_spec.params.get("shard_max_attempts"),
      done_callback_url=mr_spec.params.get("done_callback"),
      _force_writes=mr_spec.params.get("force_writes"),
      _base_path=mr_spec.params["base_path"],
      _task_max_attempts=mr_spec.params.get("task_max_attempts"),
      _task_max_data_processing_attempts=(
          mr_spec.params.get("task_max_data_processing_attempts")),
      _hooks_cls=util.for_name(mr_spec.hooks_class_name),
      _app=mr_spec.params.get("app_id"),
      _api_version=api_version)
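# --- Usage sketch (not part of the library) -------------------------------
# A minimal, hedged illustration of how an internal caller could rebuild a
# JobConfig for a job that is already executing. It assumes the stock
# appengine-mapreduce layout (model.MapreduceState.get_by_job_id() and the
# state's mapreduce_spec property); the "default" queue name and the helper
# name below are assumptions made purely for illustration.
from mapreduce import model
from mapreduce.api import map_job

def _job_config_for_running_job(job_id, queue_name="default"):
  """Rebuilds the JobConfig from the persisted spec of a running job."""
  state = model.MapreduceState.get_by_job_id(job_id)
  return map_job.JobConfig._to_map_job_config(state.mapreduce_spec,
                                              queue_name=queue_name)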
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key, and writes them
  into a new GCS file. Creates an _OutputFile entity to record the resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
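# --- Simplified illustration (not part of the library) --------------------
# The core pattern of _sort_records_map: decode only the key, keep the raw
# record bytes, sort (key, raw_record) pairs, then emit the raw bytes in key
# order. The "key|value" string encoding and plain lexicographic ordering are
# stand-ins for the KeyValue proto and _compare_keys; GCS writing is omitted.
def _sort_serialized_records(records):
  key_records = []
  for raw in records:
    key = raw.split("|", 1)[0]       # stand-in for proto.key()
    key_records.append((key, raw))   # raw record bytes are never re-encoded
  key_records.sort()                 # order by key; ties fall back to the raw record
  return [raw for _, raw in key_records]

# _sort_serialized_records(["b|2", "a|1", "c|3"]) == ["a|1", "b|2", "c|3"]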