def __init__(self, labels, inputCol=None, outputCol=None, handleInvalid='error', defaultValue=0.0):
    """
    Maps input strings to double values according to `labels`.

    :param labels: dict {string: double} defining the mapping
    :param inputCol: name of the input column
    :param outputCol: name of the output column
    :param handleInvalid: how to handle missing labels: 'error' (throw an
        error), or 'keep' (map to the default value)
    :param defaultValue: value used for missing keys when handleInvalid='keep'
    """
    assert handleInvalid in [
        'error', 'keep'
    ], 'Invalid value for handleInvalid: {}'.format(handleInvalid)
    super(StringMap, self).__init__()
    jvm = _jvm()
    # Convert the python dict into an immutable scala Map.
    scala_labels = jvm.scala.collection.JavaConverters \
        .mapAsScalaMapConverter(labels) \
        .asScala() \
        .toMap(jvm.scala.Predef.conforms())
    # Resolve the scala case object, e.g. StringMapHandleInvalid.Error$.MODULE$.
    invalid_mode = jvm.ml.combust.mleap.core.feature.StringMapHandleInvalid \
        .__getattr__(handleInvalid.capitalize() + '$') \
        .__getattr__('MODULE$')
    model = self._new_java_obj(
        "ml.combust.mleap.core.feature.StringMapModel",
        scala_labels, invalid_mode, defaultValue)
    self._java_obj = self._new_java_obj(
        "org.apache.spark.ml.mleap.feature.StringMap", self.uid, model)
    self.setInputCol(inputCol)
    self.setOutputCol(outputCol)
def testLongRDDToH2OFrame(spark, hc):
    """Values just outside the 32-bit int range must round-trip through an H2O frame."""
    # Renamed from `min`/`max`, which shadowed the builtins.
    below_int_min = _jvm().Integer.MIN_VALUE - 1
    above_int_max = _jvm().Integer.MAX_VALUE + 1
    rdd = spark.sparkContext.parallelize([1, below_int_min, above_int_max])
    h2o_frame = hc.asH2OFrame(rdd)
    assert h2o_frame[0, 0] == 1
    assert h2o_frame[1, 0] == below_int_min
    assert h2o_frame[2, 0] == above_int_max
    unit_test_utils.asert_h2o_frame(h2o_frame, rdd)
def testNumericRDDtoH2OFrameWithValueTooBig(spark, hc):
    """Values outside the 64-bit long range are expected to land in the frame
    as their string representations."""
    # Renamed from `min`/`max`, which shadowed the builtins.
    below_long_min = _jvm().Long.MIN_VALUE - 1
    above_long_max = _jvm().Long.MAX_VALUE + 1
    rdd = spark.sparkContext.parallelize([1, below_long_min, above_long_max])
    h2o_frame = hc.asH2OFrame(rdd)
    assert h2o_frame[0, 0] == str(1)
    assert h2o_frame[1, 0] == str(below_long_min)
    assert h2o_frame[2, 0] == str(above_long_max)
    unit_test_utils.asert_h2o_frame(h2o_frame, rdd)
def __init__(self, labels=None, inputCol=None, outputCol=None, handleInvalid='error', defaultValue=0.0):
    """
    :param labels: a dict {string: double}; None (the default) means an empty mapping
    :param inputCol: name of the input column
    :param outputCol: name of the output column
    :param handleInvalid: how to handle missing labels: 'error' (throw), or 'keep' (map to defaultValue)
    :param defaultValue: value to use if key is not found in labels
    """
    super(StringMap, self).__init__()
    # BUG FIX: the previous signature used the shared mutable default
    # `labels={}`; None is now the sentinel and a fresh dict is built per call.
    if labels is None:
        labels = {}

    def validate_args():
        """
        validate args early to avoid failing at Py4j with some hard to
        interpret error message
        """
        assert handleInvalid in [
            'error', 'keep'
        ], 'Invalid value for handleInvalid: {}'.format(handleInvalid)
        assert isinstance(labels, dict), \
            'labels must be a dict, got: {}'.format(type(labels))
        for (key, value) in labels.items():
            assert isinstance(key, six.string_types), \
                'label keys must be a string type, got: {}'.format(type(key))
            # BUG FIX: the error message previously reported type(key)
            # instead of the offending value's type.
            assert isinstance(value, float), \
                'label values must be float, got: {}'.format(type(value))

    validate_args()
    # Convert the python dict into an immutable scala Map.
    labels_scala_map = _jvm() \
        .scala \
        .collection \
        .JavaConverters \
        .mapAsScalaMapConverter(labels) \
        .asScala() \
        .toMap(_jvm().scala.Predef.conforms())
    # Resolve the scala case object, e.g. StringMapHandleInvalid.Error$.MODULE$.
    handle_invalid_jvm = _jvm(
    ).ml.combust.mleap.core.feature.StringMapHandleInvalid.__getattr__(
        handleInvalid.capitalize() + '$').__getattr__('MODULE$')
    string_map_model = self._new_java_obj(
        "ml.combust.mleap.core.feature.StringMapModel",
        labels_scala_map, handle_invalid_jvm, defaultValue)
    self._java_obj = self._new_java_obj(
        "org.apache.spark.ml.mleap.feature.StringMap", self.uid,
        string_map_model)
    self.setInputCol(inputCol)
    self.setOutputCol(outputCol)
def __init__(
        self,
        operation=None,
        inputA=None,
        inputB=None,
        outputCol=None,
):
    """
    Computes the mathematical binary `operation` over the input columns A and B.

    :param operation: BinaryOperation to specify the operation type
    :param inputA: column name for the left side of operation (string)
    :param inputB: column name for the right side of operation (string)
    :param outputCol: output column name (string)

    NOTE: `operation` is not a JavaParam because the underlying MathBinary
    scala object uses a MathBinaryModel to store the info about the binary
    operation.

    `operation` has a None default value even though it should *never* be
    None. A None value is necessary upon deserialization to instantiate a
    MathBinary without errors. Afterwards, pyspark sets the _java_obj to the
    deserialized scala object, which encodes the operation.
    """
    super(MathBinary, self).__init__()

    # if operation=None, it means that pyspark is reloading the model
    # from disk and calling this method without args. In such case we don't
    # need to set _java_obj here because pyspark will set it after creation
    #
    # if operation is not None, we can proceed to instantiate the scala classes
    if operation:
        scalaBinaryOperation = jvm_scala_object(
            _jvm().ml.combust.mleap.core.feature.BinaryOperation,
            operation.name)

        # IMPORTANT: defaults for missing values are forced to None.
        # I've found an issue when setting default values for A and B,
        # Remember to treat your missing values before the MathBinary
        # (for example, you could use an Imputer)
        #
        # NOTE(review): Some(None) produces scala Some(null), not the scala
        # None singleton — presumably deliberate given the comment above;
        # confirm ScalaNone() was not intended (compare the variant of this
        # constructor that takes defaultA/defaultB).
        scalaMathBinaryModel = _jvm(
        ).ml.combust.mleap.core.feature.MathBinaryModel(
            scalaBinaryOperation, Some(None), Some(None))

        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.mleap.feature.MathBinary",
            self.uid,
            scalaMathBinaryModel,
        )
    self._setDefault()
    self.setParams(inputA=inputA, inputB=inputB, outputCol=outputCol)
def convert(value):
    """Convert `value` to a JVM DenseVector; JavaObjects pass through unchanged.

    Raises TypeError for None and for anything that is not a
    pyspark.ml.linalg.DenseVector.
    """
    if value is None:
        raise TypeError("None is not allowed.")
    if isinstance(value, JavaObject):
        return value
    if not isinstance(value, DenseVector):
        raise TypeError("Invalid type. The expected type is pyspark.ml.linalg.DenseVector.")
    # Fetch the scala ConversionUtils object and build the JVM vector.
    utils = getattr(_jvm().ai.h2o.sparkling.ml.params,
                    "ConversionUtils$").__getattr__("MODULE$")
    float_values = H2OTypeConverters.toListFloat()(value.values)
    return _jvm().org.apache.spark.ml.linalg.DenseVector(
        utils.toDoubleArray(float_values))
def __init__(self,
             modelId=None,
             splitRatio=1.0,
             labelCol="label",
             weightCol=None,
             # NOTE(review): mutable default arguments ([]) are shared across
             # calls; they appear to be passed straight to _setDefault, but
             # confirm nothing mutates them downstream.
             featuresCols=[],
             allStringColumnsToCategorical=True,
             columnsToCategorical=[],
             nfolds=0,
             keepCrossValidationPredictions=False,
             keepCrossValidationFoldAssignment=False,
             parallelizeCrossValidation=True,
             seed=-1,
             distribution="AUTO",
             ntrees=50,
             maxDepth=5,
             minRows=10.0,
             nbins=20,
             nbinsCats=1024,
             minSplitImprovement=1e-5,
             histogramType="AUTO",
             r2Stopping=java_max_double_value,
             nbinsTopLevel=1 << 10,
             buildTreeOneNode=False,
             scoreTreeInterval=0,
             sampleRate=1.0,
             sampleRatePerClass=None,
             colSampleRateChangePerLevel=1.0,
             colSampleRatePerTree=1.0,
             learnRate=0.1,
             learnRateAnnealing=1.0,
             colSampleRate=1.0,
             maxAbsLeafnodePred=java_max_double_value,
             predNoiseBandwidth=0.0,
             convertUnknownCategoricalLevelsToNa=False,
             foldCol=None,
             predictionCol="prediction",
             detailedPredictionCol="detailed_prediction",
             withDetailedPredictionCol=False,
             convertInvalidNumbersToNa=False,
             **deprecatedArgs):
    """Construct an H2OGBM estimator backed by the JVM class
    ai.h2o.sparkling.ml.algos.H2OGBM.

    The signature's keyword defaults are mirrored into _setDefault below;
    the actual values passed by the caller are collected afterwards via
    get_input_kwargs and applied through setParams.
    """
    # The Sparkling Water jar must be on the classpath before touching the JVM.
    Initializer.load_sparkling_jar()
    super(H2OGBM, self).__init__()
    self._java_obj = self._new_java_obj("ai.h2o.sparkling.ml.algos.H2OGBM",
                                        self.uid)
    # NOTE(review): the signature defaults use java_max_double_value while
    # _setDefault uses _jvm().Double.MAX_VALUE for r2Stopping and
    # maxAbsLeafnodePred — presumably the same value; confirm they agree.
    self._setDefault(modelId=None,
                     splitRatio=1.0,
                     labelCol="label",
                     weightCol=None,
                     featuresCols=[],
                     allStringColumnsToCategorical=True,
                     columnsToCategorical=[],
                     nfolds=0,
                     keepCrossValidationPredictions=False,
                     keepCrossValidationFoldAssignment=False,
                     parallelizeCrossValidation=True,
                     seed=-1,
                     distribution="AUTO",
                     ntrees=50,
                     maxDepth=5,
                     minRows=10.0,
                     nbins=20,
                     nbinsCats=1024,
                     minSplitImprovement=1e-5,
                     histogramType="AUTO",
                     r2Stopping=_jvm().Double.MAX_VALUE,
                     nbinsTopLevel=1 << 10,
                     buildTreeOneNode=False,
                     scoreTreeInterval=0,
                     sampleRate=1.0,
                     sampleRatePerClass=None,
                     colSampleRateChangePerLevel=1.0,
                     colSampleRatePerTree=1.0,
                     learnRate=0.1,
                     learnRateAnnealing=1.0,
                     colSampleRate=1.0,
                     maxAbsLeafnodePred=_jvm().Double.MAX_VALUE,
                     predNoiseBandwidth=0.0,
                     convertUnknownCategoricalLevelsToNa=False,
                     foldCol=None,
                     predictionCol="prediction",
                     detailedPredictionCol="detailed_prediction",
                     withDetailedPredictionCol=False,
                     convertInvalidNumbersToNa=False)
    # Collect the kwargs the caller actually supplied and apply them.
    kwargs = get_input_kwargs(self)
    self.setParams(**kwargs)
def _get_mleap_schema(dataframe):
    """
    :param dataframe: A PySpark dataframe object
    :return: The schema of the supplied dataframe, in MLeap format. This
             serialized object of type `ml.combust.mleap.core.types.StructType`,
             represented as a JSON dictionary.
    """
    from pyspark.ml.util import _jvm

    reflection = _jvm().py4j.reflection.ReflectionUtil

    def scala_module(binary_name):
        # Fetch the MODULE$ singleton of a scala object via reflection.
        clazz = reflection.classForName(binary_name)
        return clazz.getField("MODULE$").get(clazz)

    # Equivalent to the Scala call
    # org.apache.spark.sql.mleap.TypeConverters.sparkSchemaToMleapSchema(dataframe)
    converters = scala_module("org.apache.spark.sql.mleap.TypeConverters$")
    mleap_struct = converters.sparkSchemaToMleapSchema(dataframe._jdf)

    # Equivalent to the Scala call
    # ml.combust.mleap.json.JsonSupport.MleapStructTypeFormat().write(mleap_struct)
    json_support = scala_module("ml.combust.mleap.json.JsonSupport$")
    schema_json = json_support.MleapStructTypeFormat().write(mleap_struct)
    return json.loads(schema_json.toString())
def convert(value):
    """Validate `value` against the JVM enum class and return the canonical value."""
    validator = getattr(_jvm().ai.h2o.sparkling.ml.params,
                        "EnumParamValidator$").__getattr__("MODULE$")
    name = H2OTypeConverters.toString()(value)
    return validator.getValidatedEnumValue(enumClass, name)
def convert(value):
    """Convert `value` to a Java object and check it is a supported grid-search algo."""
    java_algo = H2OTypeConverters.toJavaObj()(value)
    if java_algo is None:
        return None
    supported = getattr(_jvm().ai.h2o.sparkling.ml.algos,
                        "H2OGridSearch$SupportedAlgos$").__getattr__("MODULE$")
    # Throws on the JVM side if the algorithm is not supported.
    supported.checkIfSupported(java_algo)
    return java_algo
def getOrCreate(spark=None, conf=None):
    """
    Get existing or create new H2OContext based on provided H2O configuration. If the conf parameter is set then
    configuration from it is used. Otherwise the configuration properties passed to Sparkling Water are used.
    If the values are not found the default values are used in most of the cases. The default cluster mode
    is internal, ie. spark.ext.h2o.external.cluster.mode=false

    :param spark: Spark Context or Spark Session or H2OConf
    :param conf: H2O configuration as instance of H2OConf
    :return: instance of H2OContext
    """
    # Warn only when `spark` carries a real SparkContext/SparkSession;
    # passing an H2OConf through `spark` is still silently accepted below.
    if spark is not None and not isinstance(spark, H2OConf):
        warnings.warn(
            "Method getOrCreate with spark argument is deprecated. Please use either just getOrCreate() or if you need "
            "to pass extra H2OConf, use getOrCreate(conf). The spark argument will be removed in release 3.32."
        )
    # Workaround for bug in Spark 2.1 as SparkSession created in PySpark is not seen in Java
    # and call SparkSession.builder.getOrCreate on Java side creates a new session, which is not
    # desirable
    activeSession = SparkSession._instantiatedSession
    if activeSession is not None:
        jvm = activeSession.sparkContext._jvm
        jvm.org.apache.spark.sql.SparkSession.setDefaultSession(
            activeSession._jsparkSession)

    # Configuration precedence: H2OConf passed as `spark`, then `conf`,
    # then a freshly constructed default H2OConf.
    if spark is not None and isinstance(spark, H2OConf):
        selected_conf = spark
    elif conf is not None:
        selected_conf = conf
    else:
        selected_conf = H2OConf()
    if selected_conf.runsInExternalClusterMode():
        selected_conf.set("spark.ext.h2o.rest.api.based.client", "true")
    h2o_context = H2OContext()

    # Create backing H2OContext on the JVM via the scala companion object
    # (H2OContext$.MODULE$.getOrCreate).
    package = getattr(_jvm().org.apache.spark.h2o, "H2OContext$")
    module = package.__getattr__("MODULE$")
    jhc = module.getOrCreate(selected_conf._jconf)
    h2o_context._jhc = jhc
    h2o_context._conf = selected_conf
    h2o_context._client_ip = jhc.h2oLocalClientIp()
    h2o_context._client_port = jhc.h2oLocalClientPort()

    # Create H2O REST API client; the double-underscore names are
    # class-private (name-mangled) members of H2OContext.
    if not h2o_context.__isClientConnected(
    ) or not H2OContext.__isConnected:
        h2o_context.__h2o_connect()
        H2OContext.__isConnected = True
        h2o_context.__setClientConnected()

    print(h2o_context)
    return h2o_context
def _new_java_obj(java_class, *args):
    """Instantiate `java_class` (dotted JVM name) with `args` converted to Java."""
    sc = SparkContext._active_spark_context
    # Walk the dotted name down from the JVM gateway root.
    target = _jvm()
    for segment in java_class.split("."):
        target = getattr(target, segment)
    return target(*[_py2java(sc, value) for value in args])
def __init__(self, spark=None):
    """
    Create an H2OConf backed by the JVM class org.apache.spark.h2o.H2OConf.

    :param spark: deprecated and ignored; will be removed in release 3.32
    """
    # The original body was wrapped in ``try: ... except: raise`` — a bare
    # except that only re-raises is a no-op, so it has been removed.
    if spark is not None:
        warnings.warn(
            "Constructor H2OConf(spark) with spark argument is deprecated. Please use just H2OConf(). "
            "The argument will be removed in release 3.32.")
    # The Sparkling Water jar must be loaded before touching the JVM class.
    Initializer.load_sparkling_jar()
    self._jconf = _jvm().org.apache.spark.h2o.H2OConf()
def _new_java_obj(java_class: str, *args: Any) -> "JavaObject":
    """Instantiate a JVM class by dotted name, converting each arg to Java."""
    sc = SparkContext._active_spark_context
    assert sc is not None
    # Resolve the dotted class name starting at the JVM gateway root.
    target = _jvm()
    for segment in java_class.split("."):
        target = getattr(target, segment)
    return target(*[_py2java(sc, value) for value in args])
def createFromMojo(pathToMojo, settings=H2OMOJOSettings.default()):
    """Load a MOJO file and wrap it in the matching python model class."""
    # Sparkling Water classes must be available on the Spark driver and executor paths.
    Initializer.load_sparkling_jar()
    javaModel = _jvm(
    ).ai.h2o.sparkling.ml.models.H2OMOJOModel.createFromMojo(
        pathToMojo, settings.toJavaObject())
    # Pick the python wrapper by the JVM model's simple class name;
    # anything unrecognised falls back to the generic H2OMOJOModel.
    wrappers = {
        "H2OSupervisedMOJOModel": H2OSupervisedMOJOModel,
        "H2OUnsupervisedMOJOModel": H2OUnsupervisedMOJOModel,
    }
    wrapper = wrappers.get(javaModel.getClass().getSimpleName(), H2OMOJOModel)
    return wrapper(javaModel)
def __prepareSparkDataForConversion(sparkData):
    """
    Coerce `sparkData` (a DataFrame or an RDD) into a DataFrame with an
    element type suitable for H2O conversion.

    Integral RDDs become IntegerType/LongType depending on their value range;
    ranges beyond 64-bit long fall back to strings (with a warning).
    """
    if isinstance(sparkData, DataFrame):
        return sparkData
    elif sparkData.isEmpty():
        return sparkData.toDF()
    else:
        session = SparkSession.builder.getOrCreate()
        first = sparkData.first()
        if isinstance(first, (str, bool, numbers.Integral, float)):
            if isinstance(first, str):
                return session.createDataFrame(sparkData, StringType())
            elif isinstance(first, bool):
                return session.createDataFrame(sparkData, BooleanType())
            # PERF FIX: each RDD min()/max() call triggers a full Spark job;
            # the original recomputed them up to three times each.
            minimum = sparkData.min()
            maximum = sparkData.max()
            if (isinstance(minimum, numbers.Integral)
                    and isinstance(maximum, numbers.Integral)):
                if minimum >= _jvm().Integer.MIN_VALUE and maximum <= _jvm(
                ).Integer.MAX_VALUE:
                    return session.createDataFrame(sparkData, IntegerType())
                elif minimum >= _jvm().Long.MIN_VALUE and maximum <= _jvm(
                ).Long.MAX_VALUE:
                    return session.createDataFrame(sparkData, LongType())
                else:
                    warnings.warn(
                        "Maximal or minimal number in RDD is too big to convert to Java. Treating numbers as strings."
                    )
                    return session.createDataFrame(sparkData, StringType())
            elif isinstance(first, float):
                ## Spark would fail when creating data frame if there is int type in RDD[Float]
                ## Convert explicitly all to float
                return session.createDataFrame(
                    sparkData.map(lambda x: float(x)), FloatType())
            else:
                raise ValueError('Unreachable code')
        else:
            return session.createDataFrame(sparkData)
def convert(value):
    """Validate a (possibly nullable) list of enum names against the JVM enum class."""
    helper = getattr(_jvm().ai.h2o.sparkling.ml.params,
                     "H2OAlgoParamsHelper$").__getattr__("MODULE$")
    # Pick the list converter matching the nullability contract.
    to_list = (H2OTypeConverters.toNullableListString()
               if nullEnabled else H2OTypeConverters.toListString())
    validated = helper.getValidatedEnumValues(enumClass, to_list(value),
                                              nullEnabled)
    return None if validated is None else list(validated)
def __init__(self, operation=None, inputCol=None, outputCol=None):
    """
    Computes the mathematical unary `operation` over the input column.

    NOTE: `operation` is not a JavaParam because the underlying MathUnary
    scala object uses a MathUnaryModel to store the info about the unary
    operation (sin, tan, etc.), not a JavaParam string.

    `operation` has a None default value even though it should *never* be
    None: a None value is only used during deserialization, after which
    pyspark attaches the deserialized scala object (which encodes the
    operation) as _java_obj itself.
    """
    super(MathUnary, self).__init__()
    # operation is None only while pyspark reloads the model from disk;
    # in that case pyspark sets _java_obj after creation.
    if operation:
        unary_op = jvm_scala_object(
            _jvm().ml.combust.mleap.core.feature.UnaryOperation,
            operation.name)
        unary_model = _jvm().ml.combust.mleap.core.feature.MathUnaryModel(
            unary_op)
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.mleap.feature.MathUnary",
            self.uid,
            unary_model,
        )
    self._setDefault()
    self.setParams(inputCol=inputCol, outputCol=outputCol)
def ScalaNone():
    """Return the scala `None` singleton (for Option-typed JVM parameters)."""
    scala_pkg = _jvm().scala
    return jvm_scala_object(scala_pkg, "None")
def __init__(
        self,
        operation=None,
        inputA=None,
        inputB=None,
        outputCol=None,
        defaultA=None,
        defaultB=None,
):
    """
    Computes the mathematical binary `operation` over the input columns A and B.

    :param operation: BinaryOperation to specify the operation type
    :param inputA: column name for the left side of operation (string)
    :param inputB: column name for the right side of operation (string)
    :param outputCol: output column name (string)
    :param defaultA: Default to use instead of inputA. This will only be used
        when inputA is None. For example when defaultA=4,
        operation=BinaryOperation.Multiply and inputB=f1, then all entries of
        col f1 will be multiplied by 4.
    :param defaultB: Default to use instead of inputB. This will only be used
        when inputB is None. For example when defaultB=4,
        operation=BinaryOperation.Multiply and inputA=f1, then all entries of
        col f1 will be multiplied by 4.

    NOTE: `operation`, `defaultA`, `defaultB` are not JavaParams because the
    underlying MathBinary scala object uses a MathBinaryModel to store the
    info about the binary operation.

    `operation` has a None default value even though it should *never* be
    None. A None value is necessary upon deserialization to instantiate a
    MathBinary without errors. Afterwards, pyspark sets the _java_obj to the
    deserialized scala object, which encodes the operation (as well as the
    default values for A and B).
    """
    super(MathBinary, self).__init__()

    # if operation=None, it means that pyspark is reloading the model
    # from disk and calling this method without args. In such case we don't
    # need to set _java_obj here because pyspark will set it after creation
    #
    # if operation is not None, we can proceed to instantiate the scala classes
    if operation:
        scalaBinaryOperation = jvm_scala_object(
            _jvm().ml.combust.mleap.core.feature.BinaryOperation,
            operation.name)

        # BUG FIX: compare against None explicitly — the previous truthiness
        # test (`Some(defaultA) if defaultA else ScalaNone()`) silently
        # discarded falsy defaults such as 0 or 0.0.
        scalaMathBinaryModel = _jvm(
        ).ml.combust.mleap.core.feature.MathBinaryModel(
            scalaBinaryOperation,
            Some(defaultA) if defaultA is not None else ScalaNone(),
            Some(defaultB) if defaultB is not None else ScalaNone(),
        )

        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.mleap.feature.MathBinary",
            self.uid,
            scalaMathBinaryModel,
        )
    self._setDefault()
    self.setParams(inputA=inputA, inputB=inputB, outputCol=outputCol)
def createFromMojo(pathToMojo, settings=H2OMOJOSettings.default()):
    """Load a MOJO pipeline from `pathToMojo` and wrap it as H2OMOJOPipelineModel."""
    # Sparkling Water classes must be available on the Spark driver and executor paths.
    Initializer.load_sparkling_jar()
    java_settings = settings.toJavaObject()
    java_model = _jvm().ai.h2o.sparkling.ml.models.H2OMOJOPipelineModel \
        .createFromMojo(pathToMojo, java_settings)
    return H2OMOJOPipelineModel(java_model)
def Some(value):
    """
    Instantiate a scala Some object. Useful when scala code takes in an
    Option[<value>].
    """
    scala_some = _jvm().scala.Some
    return scala_some(value)
def fromBytes(cls, bytes_array):
    """ Constructs a score model from PMML in an array of bytes. """
    return cls(_jvm().org.pmml4s.spark.ScoreModel.fromBytes(bytes_array))
def fromString(cls, s):
    """ Constructs a score model from PMML in a String. """
    return cls(_jvm().org.pmml4s.spark.ScoreModel.fromString(s))
def fromFile(cls, name):
    """ Constructs a score model from PMML file with given pathname. """
    return cls(_jvm().org.pmml4s.spark.ScoreModel.fromFile(name))
def _empty_java_param_map():
    """ Returns an empty Java ParamMap reference. """
    param_map_cls = _jvm().org.apache.spark.ml.param.ParamMap
    return param_map_cls()
def _load_java_obj(cls, java_class):
    """Load the peer Java object of the ML instance."""
    obj = _jvm()
    # Descend attribute-by-attribute through the dotted class name.
    segments = java_class.split(".")
    while segments:
        obj = getattr(obj, segments.pop(0))
    return obj
def __init__(self):
    """Expose the JVM BinaryMetrics class on this wrapper instance."""
    super(SimpleBinaryMetrics, self).__init__()
    jvm = _jvm()
    self.BinaryMetrics = jvm.ml.dhs.modelmonitor.BinaryMetrics
def convert(value):
    """Validate a single enum name against the JVM enum class."""
    helper = getattr(_jvm().ai.h2o.sparkling.ml.params,
                     "H2OAlgoParamsHelper$").__getattr__("MODULE$")
    name = TypeConverters.toString(value)
    return helper.getValidatedEnumValue(enumClass, name)
def __init__(self):
    """Instantiate the JVM SimpleSparkSerializer backing this wrapper."""
    super(SimpleSparkSerializer, self).__init__()
    serializer_cls = _jvm().ml.combust.mleap.spark.SimpleSparkSerializer
    self._java_obj = serializer_cls()