def load(cls, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load(
        sc._jsc.sc(), path)
    py_boundaries = _java2py(sc, java_model.boundaryVector()).toArray()
    py_predictions = _java2py(sc, java_model.predictionVector()).toArray()
    return IsotonicRegressionModel(py_boundaries, py_predictions, java_model.isotonic)

def load(cls, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
        sc._jsc.sc(), path)
    py_labels = _java2py(sc, java_model.labels())
    py_pi = _java2py(sc, java_model.pi())
    py_theta = _java2py(sc, java_model.theta())
    return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta))

def __init__(self, regressionCoeff=None, arimaOrders=None, arimaCoeff=None, jmodel=None, sc=None):
    """
    Parameters
    ----------
    regressionCoeff:
        coefficients for regression, including the intercept. For example, if
        the model has 3 regressors then the length of [[regressionCoeff]] is 4.
    arimaOrders:
        p, d, q for the ARIMA error structure; the length of [[arimaOrders]]
        must be 3.
    arimaCoeff:
        AR, d, and MA terms; the length of [[arimaCoeff]] is p + d + q.
    """
    assert sc is not None, "Missing SparkContext"

    self._ctx = sc
    if jmodel is None:
        self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.RegressionARIMAModel(
            _py2java_double_array(self._ctx, regressionCoeff),
            _py2java_int_array(self._ctx, arimaOrders),
            _py2scala_arraybuffer(self._ctx, arimaCoeff),
        )
    else:
        self._jmodel = jmodel

    self.regressionCoeff = _java2py(sc, self._jmodel.regressionCoeff())
    self.arimaOrders = _java2py(sc, self._jmodel.arimaOrders())
    self.arimaCoeff = _java2py(sc, self._jmodel.arimaCoeff())

def load(cls, sc, path): """Load a IsotonicRegressionModel.""" java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load( sc._jsc.sc(), path) py_boundaries = _java2py(sc, java_model.boundaryVector()).toArray() py_predictions = _java2py(sc, java_model.predictionVector()).toArray() return IsotonicRegressionModel(py_boundaries, py_predictions, java_model.isotonic)
def __init__(self, regressionCoeff=None, arimaOrders=None, arimaCoeff=None, jmodel=None, sc=None):
    """
    Parameters
    ----------
    regressionCoeff:
        coefficients for regression, including the intercept. For example, if
        the model has 3 regressors then the length of [[regressionCoeff]] is 4.
    arimaOrders:
        p, d, q for the ARIMA error structure; the length of [[arimaOrders]]
        must be 3.
    arimaCoeff:
        AR, d, and MA terms; the length of [[arimaCoeff]] is p + d + q.
    """
    assert sc is not None, "Missing SparkContext"

    self._ctx = sc
    if jmodel is None:
        self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.RegressionARIMAModel(
            _py2java_double_array(self._ctx, regressionCoeff),
            _py2java_int_array(self._ctx, arimaOrders),
            _py2scala_arraybuffer(self._ctx, arimaCoeff))
    else:
        self._jmodel = jmodel

    self.regressionCoeff = _java2py(sc, self._jmodel.regressionCoeff())
    self.arimaOrders = _java2py(sc, self._jmodel.arimaOrders())
    self.arimaCoeff = _java2py(sc, self._jmodel.arimaCoeff())

def call_scala_method(py_class, scala_method, df, *args):
    """Given a Python class, calls a method from its Scala equivalent."""
    sc = df.sql_ctx._sc
    # Gets the Java class from the JVM, given the name built from the Python class
    java_class = getattr(sc._jvm, get_jvm_class(py_class))
    # Converts all columns into doubles and accesses the result as a Java DataFrame
    jdf = df.select(*(F.col(col).astype('double') for col in df.columns))._jdf
    # Creates a Java object from both the Java class and the DataFrame
    java_obj = java_class(jdf)
    # Converts the remaining args from Python to Java as well
    args = [_py2java(sc, a) for a in args]
    # Gets the method from the Java object and passes the arguments to it
    java_res = getattr(java_obj, scala_method)(*args)
    # Converts the result from Java back to Python
    res = _java2py(sc, java_res)
    # If the result is an RDD, its elements may still be serialized Scala tuples...
    if isinstance(res, RDD):
        try:
            # Takes the first element of the result to check what it is
            first = res.take(1)[0]
            # If it is a dictionary, check its value instead
            if isinstance(first, dict):
                first = list(first.values())[0]
            # If the value is a Scala tuple, it needs to be deserialized
            if first.startswith('scala.Tuple'):
                serde = sc._jvm.org.apache.spark.mllib.api.python.SerDe
                # We assume it is a Tuple2 and deserialize it
                java_res = serde.fromTuple2RDD(java_res)
                # Finally, convert the deserialized result from Java to Python
                res = _java2py(sc, java_res)
        except IndexError:
            pass
    return res

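# A minimal usage sketch for call_scala_method above. `MyStatsClass` and
# "corrMatrix" are hypothetical stand-ins for a Python class whose Scala
# counterpart is resolvable via get_jvm_class; only the call pattern is
# grounded in the function itself.
def _call_scala_method_example(spark, MyStatsClass):
    df = spark.createDataFrame([(1.0, 2.0), (2.0, 4.0)], ["x", "y"])
    # All columns are cast to double internally before the Scala call
    return call_scala_method(MyStatsClass, "corrMatrix", df)
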
def load(cls, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
        sc._jsc.sc(), path)
    # Can not unpickle array.array from Pyrolite in Python3 with "bytes"
    py_labels = _java2py(sc, java_model.labels(), "latin1")
    py_pi = _java2py(sc, java_model.pi(), "latin1")
    py_theta = _java2py(sc, java_model.theta(), "latin1")
    return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta))

def load(cls, sc: SparkContext, path: str) -> "IsotonicRegressionModel": """Load an IsotonicRegressionModel.""" assert sc._jvm is not None java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load( sc._jsc.sc(), path) py_boundaries = _java2py(sc, java_model.boundaryVector()).toArray() py_predictions = _java2py(sc, java_model.predictionVector()).toArray() return IsotonicRegressionModel(py_boundaries, py_predictions, java_model.isotonic)
def __init__(self, p=0, d=0, q=0, coefficients=None, hasIntercept=False, jmodel=None, sc=None):
    self._ctx = sc
    if jmodel is None:
        self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.ARIMAModel(
            p, d, q, _py2java(self._ctx, coefficients), hasIntercept)
    else:
        self._jmodel = jmodel

    self.p = _java2py(sc, self._jmodel.p())
    self.d = _java2py(sc, self._jmodel.d())
    self.q = _java2py(sc, self._jmodel.q())
    self.coefficients = _java2py(sc, self._jmodel.coefficients())
    self.has_intercept = _java2py(sc, self._jmodel.hasIntercept())

def __init__(self, p=0, d=0, q=0, coefficients=None, hasIntercept=True, jmodel=None, sc=None):
    assert sc is not None, "Missing SparkContext"

    self._ctx = sc
    if jmodel is None:
        self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.ARIMAModel(
            p, d, q, _py2java_double_array(self._ctx, coefficients), hasIntercept)
    else:
        self._jmodel = jmodel

    self.p = _java2py(sc, self._jmodel.p())
    self.d = _java2py(sc, self._jmodel.d())
    self.q = _java2py(sc, self._jmodel.q())
    self.coefficients = _java2py(sc, self._jmodel.coefficients())
    self.has_intercept = _java2py(sc, self._jmodel.hasIntercept())

def load(cls, sc: SparkContext, path: str) -> "NaiveBayesModel": """ Load a model from the given path. """ assert sc._jvm is not None java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load( sc._jsc.sc(), path ) # Can not unpickle array.array from Pickle in Python3 with "bytes" py_labels = _java2py(sc, java_model.labels(), "latin1") py_pi = _java2py(sc, java_model.pi(), "latin1") py_theta = _java2py(sc, java_model.theta(), "latin1") return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta))
def load(cls, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel.load(
        sc._jsc.sc(), path)
    weights = _java2py(sc, java_model.weights())
    intercept = java_model.intercept()
    model = RidgeRegressionModel(weights, intercept)
    return model

def arima_ts(df):
    sc = SparkContext.getOrCreate()
    train = df.filter(df['date'].between('2013-01-01', '2014-11-01'))
    test = df.filter(df['date'].between('2014-11-01', '2015-05-01'))
    tr = numpy.array(train.select("sales").collect()).flatten()
    te = numpy.array(test.select("sales").collect()).flatten()
    nte = len(te)
    # model = autofit(Vectors.dense(tr), sc=sc)
    model = fit_model(p=0, d=1, q=0, ts=Vectors.dense(tr), sc=sc)
    prev = model.forecast(Vectors.dense(tr), nte)
    x = _java2py(sc, prev)[len(tr):]
    # print("ARIMA spark-ts R2: ", r2_score(te, x))
    test = test.toPandas()
    test = test.set_index('date')
    df = df.toPandas()
    df = df.set_index('date')
    x = pd.DataFrame(x, index=test.index, columns=['prediction'])
    pd.concat([test, x], axis=1).plot()
    pd.concat([df, x], axis=1).plot()
    return r2_score(te, x)

def perform_pca(matrix, row_count, nr_principal_components=2):
    """Return principal components of the input matrix.

    This function uses MLlib's ``RowMatrix`` to compute principal components.

    Args:
        matrix: An RDD[int, (int, float)] representing a sparse matrix. This
            is returned by ``center_matrix`` but it is not required to center
            the matrix first.
        row_count: The size (N) of the N x N ``matrix``.
        nr_principal_components: Number of components we want to obtain. This
            value must be less than or equal to the number of rows in the
            input square matrix.

    Returns:
        An array of ``nr_principal_components`` columns, and the same number
        of rows as the input ``matrix``. This array is a ``numpy`` array.
    """
    py_rdd = matrix.map(lambda row: linalg.Vectors.sparse(row_count, row))
    sc = pyspark.SparkContext._active_spark_context
    java_rdd = mllib_common._py2java(sc, py_rdd)
    scala_rdd = java_rdd.rdd()
    row_matrix = (sc._jvm.org.apache.spark.mllib.linalg.distributed.
                  RowMatrix(scala_rdd))
    pca = row_matrix.computePrincipalComponents(nr_principal_components)
    pca = mllib_common._java2py(sc, pca)
    return pca.toArray()

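# A minimal usage sketch for perform_pca, assuming a live SparkContext and
# rows encoded as lists of (column_index, value) pairs of a 3 x 3 matrix
# (the encoding is an assumption consistent with Vectors.sparse above):
def _perform_pca_example(sc):
    rows = sc.parallelize([
        [(0, 1.0), (2, 3.0)],
        [(1, 2.0)],
        [(0, 4.0), (1, 1.0), (2, 2.0)],
    ])
    # Returns a numpy array with nr_principal_components columns
    return perform_pca(rows, row_count=3, nr_principal_components=2)
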
def gor(self, qry):
    df = _py2java(sc, self)
    ReflectionUtil = spark._jvm.py4j.reflection.ReflectionUtil
    Rowclass = ReflectionUtil.classForName("org.apache.spark.sql.Row")
    ct = spark._jvm.scala.reflect.ClassTag.apply(Rowclass)
    gds = spark._jvm.org.gorpipe.spark.GorDatasetFunctions(df, ct, ct)
    return _java2py(sc, gds.gor(qry, True, sgs))

def load(cls, sc, path): """ Load a model from the given path. """ java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel.load( sc._jsc.sc(), path) return KMeansModel(_java2py(sc, java_model.clusterCenters()))
def gradient_log_likelihood_css_arma(self, diffedy):
    """
    Calculates the gradient for the log likelihood function using CSS.

    Derivation:
        L(y | \theta) = -\frac{n}{2}\log(2\pi\sigma^2)
                        - \frac{1}{2\sigma^2}\sum_{t=1}^n \epsilon_t^2 \\
        \sigma^2 = \frac{\sum_{t=1}^n \epsilon_t^2}{n} \\
        \frac{\partial L}{\partial \theta} = -\frac{1}{\sigma^2}
            \sum_{t=1}^n \epsilon_t \frac{\partial \epsilon_t}{\partial \theta} \\
        \frac{\partial \epsilon_t}{\partial \theta} =
            -\frac{\partial \hat{y}}{\partial \theta} \\
        \frac{\partial \hat{y}}{\partial c} =
            1 + \phi_{t-q}^{t-1} * \frac{\partial \epsilon_{t-q}^{t-1}}{\partial c} \\
        \frac{\partial \hat{y}}{\partial \theta_{ar_i}} =
            y_{t-i} + \phi_{t-q}^{t-1} * \frac{\partial \epsilon_{t-q}^{t-1}}{\partial \theta_{ar_i}} \\
        \frac{\partial \hat{y}}{\partial \theta_{ma_i}} =
            \epsilon_{t-i} + \phi_{t-q}^{t-1} * \frac{\partial \epsilon_{t-q}^{t-1}}{\partial \theta_{ma_i}} \\

    Parameters
    ----------
    diffedy:
        array of differenced values

    returns the gradient of the log likelihood as an array of doubles
    """
    # need to copy diffedy to a double[] for Java
    result = self._jmodel.gradientlogLikelihoodCSSARMA(
        _py2java_double_array(self._ctx, diffedy))
    return _java2py(self._ctx, result)

def _call_java(sc, java_obj, name, *args):
    """
    Method copied from pyspark.ml.wrapper. Uses private Spark APIs.
    """
    m = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))

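# A minimal usage sketch for _call_java, assuming `java_model` is a py4j
# JavaObject wrapping an MLlib model (the method name "weights" is just an
# illustration of a no-argument Java method):
def _call_java_example(sc, java_model):
    return _call_java(sc, java_model, "weights")
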
def forecast(self, ts, nfuture):
    """
    Provides fitted values for timeseries ts as 1-step ahead forecasts, based
    on current model parameters, and then provides `nfuture` periods of
    forecast. We assume AR terms prior to the start of the series are equal
    to the model's intercept term (or 0.0, if fit without an intercept term).
    Meanwhile, MA terms prior to the start are assumed to be 0.0. If there is
    differencing, the first d terms come from the original series.

    Parameters
    ----------
    ts:
        Timeseries to use as the gold standard. Each value (i) in the returned
        series is a 1-step ahead forecast of ts(i). We use the difference
        between ts(i) and estimate(i) to calculate the error at time i, which
        is used for the moving average terms.
    nfuture:
        Periods in the future to forecast (beyond the length of ts)

    Returns a series consisting of fitted 1-step ahead forecasts for
    historicals and then `nfuture` periods of forecasts. Note that in the
    future values error terms become zero and prior predictions are used for
    any AR terms.
    """
    jts = _py2java(self._ctx, ts)
    jfore = self._jmodel.forecast(jts, nfuture)
    return _java2py(self._ctx, jfore)

def check_params(self, py_stage):
    if not hasattr(py_stage, "_to_java"):
        return
    java_stage = py_stage._to_java()
    if java_stage is None:
        return
    for p in py_stage.params:
        java_param = java_stage.getParam(p.name)
        py_has_default = py_stage.hasDefault(p)
        java_has_default = java_stage.hasDefault(java_param)
        self.assertEqual(py_has_default, java_has_default,
                         "Default value mismatch of param %s for Params %s"
                         % (p.name, str(py_stage)))
        if py_has_default:
            if p.name == "seed":
                return  # Random seeds between Spark and PySpark are different
            java_default = \
                _java2py(self.sc, java_stage.clear(java_param).getOrDefault(java_param))
            py_stage._clear(p)
            py_default = py_stage.getOrDefault(p)
            self.assertEqual(java_default, py_default,
                             "Java default %s != python default %s of param %s for Params %s"
                             % (str(java_default), str(py_default), p.name, str(py_stage)))

def forecast(self, ts, nfuture):
    """
    Provides fitted values for timeseries ts as 1-step ahead forecasts, based
    on current model parameters, and then provides `nfuture` periods of
    forecast. We assume AR terms prior to the start of the series are equal
    to the model's intercept term (or 0.0, if fit without an intercept term).
    Meanwhile, MA terms prior to the start are assumed to be 0.0. If there is
    differencing, the first d terms come from the original series.

    Parameters
    ----------
    ts:
        Timeseries to use as the gold standard. Each value (i) in the returned
        series is a 1-step ahead forecast of ts(i). We use the difference
        between ts(i) and estimate(i) to calculate the error at time i, which
        is used for the moving average terms. Numpy array.
    nfuture:
        Periods in the future to forecast (beyond the length of ts)

    Returns a series consisting of fitted 1-step ahead forecasts for
    historicals and then `nfuture` periods of forecasts. Note that in the
    future values error terms become zero and prior predictions are used for
    any AR terms.
    """
    jts = _py2java(self._ctx, Vectors.dense(ts))
    jfore = self._jmodel.forecast(jts, nfuture)
    return _java2py(self._ctx, jfore)

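# A minimal fit-then-forecast sketch tying this method to the fit_model entry
# point used elsewhere in this collection; the data values are illustrative
# and `sc` is assumed to be a live SparkContext.
def _forecast_example(sc):
    import numpy as np
    ts = np.array([1.0, 1.4, 1.2, 1.8, 2.1, 2.4, 2.2])
    model = fit_model(p=1, d=0, q=1, ts=Vectors.dense(ts), sc=sc)
    # Returns len(ts) fitted 1-step ahead values followed by 5 forecast periods
    return model.forecast(ts, 5)
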
def gradient_log_likelihood_css_arma(self, diffedy):
    """
    Calculates the gradient for the log likelihood function using CSS.

    Derivation:
        L(y | \theta) = -\frac{n}{2}\log(2\pi\sigma^2)
                        - \frac{1}{2\sigma^2}\sum_{t=1}^n \epsilon_t^2 \\
        \sigma^2 = \frac{\sum_{t=1}^n \epsilon_t^2}{n} \\
        \frac{\partial L}{\partial \theta} = -\frac{1}{\sigma^2}
            \sum_{t=1}^n \epsilon_t \frac{\partial \epsilon_t}{\partial \theta} \\
        \frac{\partial \epsilon_t}{\partial \theta} =
            -\frac{\partial \hat{y}}{\partial \theta} \\
        \frac{\partial \hat{y}}{\partial c} =
            1 + \phi_{t-q}^{t-1} * \frac{\partial \epsilon_{t-q}^{t-1}}{\partial c} \\
        \frac{\partial \hat{y}}{\partial \theta_{ar_i}} =
            y_{t-i} + \phi_{t-q}^{t-1} * \frac{\partial \epsilon_{t-q}^{t-1}}{\partial \theta_{ar_i}} \\
        \frac{\partial \hat{y}}{\partial \theta_{ma_i}} =
            \epsilon_{t-i} + \phi_{t-q}^{t-1} * \frac{\partial \epsilon_{t-q}^{t-1}}{\partial \theta_{ma_i}} \\

    Parameters
    ----------
    diffedy:
        array of differenced values

    returns the gradient of the log likelihood as an array of doubles
    """
    # need to copy diffedy to a double[] for Java
    result = self._jmodel.gradientlogLikelihoodCSSARMA(_py2java_double_array(self._ctx, diffedy))
    return _java2py(self._ctx, result)

def checkLoadBalancing(df: DataFrame, kind: str = "frac", numberOfElements: int = -1):
    """
    Returns a DataFrame containing the weight of each partition. You can
    choose between outputting the size (number of rows) of each partition or
    its fractional size (%) relative to the total number of rows. This is
    useful to check whether the load is correctly balanced.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame
    kind : str
        Print the load balancing in terms of fractional size (kind="frac")
        or number of rows per partition (kind="size"). Default is "frac".
    numberOfElements : int
        (optional). Total number of elements in the DataFrame. Only needed if
        you choose to output fractional sizes (kind="frac"). If not provided
        (i.e. default value of -1) and kind="frac", it will be computed (count).

    Returns
    ----------
    dfout : DataFrame
        DataFrame containing the weight of each partition.

    Examples
    ----------
    Load data
    >>> df = spark.read.format("fits")\
            .option("hdu", 1)\
            .load("../src/test/resources/astro_obs.fits")

    Fake repartitioning in 10 equal sized partitions
    >>> df = df.repartition(10)

    Compute the load balancing %
    >>> df_load = checkLoadBalancing(df, kind="frac")

    Note that this is a DataFrame, so you can use df.show().
    Here we will check that the total is indeed 100%
    >>> val = df_load.select("Load (%)").collect()
    >>> assert(int(sum([i[0] for i in val])) == 100)

    Same using number of rows instead of fractional contribution
    >>> df_load = checkLoadBalancing(df, kind="size")
    >>> val = df_load.select("Load (#Rows)").collect()
    >>> assert(int(sum([i[0] for i in val])) == df.count())
    """
    prefix = "com.astrolabsoftware.spark3d"
    scalapath = "{}.Checkers.checkLoadBalancing".format(prefix)
    scalaclass = load_from_jvm(scalapath)

    dfout = _java2py(get_spark_context(), scalaclass(df._jdf, kind, numberOfElements))

    return dfout

def load(cls, sc, path): """Load a LassoModel.""" java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel.load( sc._jsc.sc(), path) weights = _java2py(sc, java_model.weights()) intercept = java_model.intercept() model = LassoModel(weights, intercept) return model
def log_likelihood(self, ts):
    """
    Returns the log likelihood of the parameters on the given time series.

    Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf
    """
    likelihood = self._jmodel.logLikelihood(_py2java(self._ctx, Vectors.dense(ts)))
    return _java2py(self._ctx, likelihood)

def to_double_rdd(self, column_index):
    """
    Returns an RDD obtained by converting the values of the given column index to doubles

    :param column_index: One column index in TransformableRDD
    :return: RDD
    """
    rdd = self._transformable_rdd.toDoubleRDD(column_index).rdd()
    return _java2py(self.spark_context, rdd)

def explodearrayofstruct(df: DataFrame, columnname: str) -> DataFrame:
    """From a nested column (array of struct), create one column per array element.

    The routine accesses the JVM under the hood, and calls the Scala routine
    explodeArrayOfStruct. Make sure you have the fink_broker jar in your
    classpath.

    Example:
    |    |-- prv_candidates: array (nullable = true)
    |    |    |-- element: struct (containsNull = true)
    |    |    |    |-- jd: double (nullable = true)
    |    |    |    |-- fid: integer (nullable = true)

    Would become:
    |-- prv_candidates_jd: array (nullable = true)
    |    |-- element: double (containsNull = true)
    |-- prv_candidates_fid: array (nullable = true)
    |    |-- element: integer (containsNull = true)

    Parameters
    ----------
    df : DataFrame
        Input nested Spark DataFrame
    columnname : str
        The name of the column to explode

    Returns
    -------
    DataFrame
        Spark DataFrame with new columns from the input column.

    Examples
    -------
    >>> df = spark.read.format("avro").load(ztf_alert_sample)

    # Candidate is nested
    >>> s = df.schema
    >>> typeOf = {i.name: i.dataType.typeName() for i in s.fields}
    >>> typeOf['prv_candidates'] == 'array'
    True

    # Flatten it
    >>> df_flat = explodearrayofstruct(df, "prv_candidates")
    >>> "prv_candidates_ra" in df_flat.schema.fieldNames()
    True

    # Each new column contains array elements cast to string
    >>> s_flat = df_flat.schema
    >>> typeOf = {i.name: i.dataType.typeName() for i in s_flat.fields}
    >>> typeOf['prv_candidates_ra'] == 'string'
    True
    """
    sc = get_spark_context()
    obj = sc._jvm.com.astrolabsoftware.fink_broker.catalogUtils
    _df = obj.explodeArrayOfStruct(df._jdf, columnname)
    df_flatten = _java2py(sc, _df)
    return df_flatten

def multiply_columns(self, first_column, second_column):
    """
    Returns an RDD which is the product of the values in @first_column and @second_column

    :param first_column: One column index
    :param second_column: Another column index
    :return: RDD
    """
    _rdd = self._transformable_rdd.multiplyColumns(first_column, second_column).rdd()
    return _java2py(self.spark_context, _rdd)

def load(cls, sc: SparkContext, path: str) -> "KMeansModel": """ Load a model from the given path. """ assert sc._jvm is not None java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel.load( sc._jsc.sc(), path) return KMeansModel(_java2py(sc, java_model.clusterCenters()))
def load(cls, sc: SparkContext, path: str) -> "LassoModel": """Load a LassoModel.""" assert sc._jvm is not None java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel.load(sc._jsc.sc(), path) weights = _java2py(sc, java_model.weights()) intercept = java_model.intercept() model = LassoModel(weights, intercept) return model
def load(cls, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel.load(
        sc._jsc.sc(), path)
    weights = _java2py(sc, java_model.weights())
    intercept = java_model.intercept()
    threshold = java_model.getThreshold().get()
    model = SVMModel(weights, intercept)
    model.setThreshold(threshold)
    return model

def log_likelihood(self, ts):
    """
    Returns the log likelihood of the parameters on the given time series.

    Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf
    """
    likelihood = self._jmodel.logLikelihood(
        _py2java(self._ctx, Vectors.dense(ts)))
    return _java2py(self._ctx, likelihood)

def knn(df: DataFrame, p: list, k: int, coordSys: str, unique: bool):
    """
    Finds the K nearest neighbors of the query object. The naive
    implementation here searches through all the objects in the DataFrame to
    get the KNN. The nearness of the objects here is decided on the basis of
    the distance between their centers.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame. Must have 3 columns corresponding to the coordinates
        (x, y, z) if cartesian or (r, theta, phi) if spherical.
    p : list of float
        Targeted point for which we want neighbors.
    k : int
        Number of neighbours
    coordSys : str
        Coordinate system: spherical or cartesian
    unique : bool
        Boolean. If true, returns only distinct objects. Default is false.

    Returns
    --------
    out : DataFrame
        DataFrame with the coordinates of the k neighbours found.

    Examples
    --------
    >>> df = spark.read.format("fits")\
            .option("hdu", 1)\
            .load("../src/test/resources/cartesian_points.fits")

    Get the 100 closest neighbours around the point [0.2, 0.2, 0.2]
    >>> K = 100
    >>> target = [0.2, 0.2, 0.2]
    >>> unique = False
    >>> neighbours = knn(df.select("x", "y", "z"), target, K, "spherical", unique)
    >>> print(neighbours.count())
    100

    You can add back the metadata
    >>> neighboursWithMeta = df.join(neighbours, ["x", "y", "z"], "left_semi")
    """
    prefix = "com.astrolabsoftware.spark3d"
    scalapath = "{}.Queries.KNN".format(prefix)
    scalaclass = load_from_jvm(scalapath)

    # To convert a Python list to a Scala list
    convpath = "{}.python.PythonClassTag.javaListtoscalaList".format(prefix)
    conv = load_from_jvm(convpath)

    out = _java2py(get_spark_context(), scalaclass(df._jdf, conv(p), k, coordSys, unique))

    return out

def _transfer_params_from_java(self):
    """
    Transforms the embedded params from the companion Java object.
    """
    sc = SparkContext._active_spark_context
    for param in self.params:
        if self._java_obj.hasParam(param.name):
            java_param = self._java_obj.getParam(param.name)
            value = _java2py(sc, self._java_obj.getOrDefault(java_param))
            self._paramMap[param] = value

def smooth(self, column_index, smoothing_method):
    """
    Returns a new RDD containing smoothed values of @column_index using @smoothing_method

    :param column_index: Index of the column
    :param smoothing_method: Smoothing method by which you want to smooth data
    :return: RDD
    """
    method = smoothing_method._get_smoothing_method(self.spark_context)
    rdd = self._transformable_rdd.smooth(column_index, method)
    return _java2py(self.spark_context, rdd.rdd())

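# A minimal usage sketch chaining the TransformableRDD helpers above. The
# `smoothing_method` argument is whatever smoothing object this API provides
# (an assumption here); the three method calls themselves are grounded in
# the snippets.
def _transformable_rdd_example(transformable_rdd, smoothing_method):
    doubles = transformable_rdd.to_double_rdd(0)
    products = transformable_rdd.multiply_columns(0, 1)
    smoothed = transformable_rdd.smooth(0, smoothing_method)
    return doubles, products, smoothed
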
def gradient(self, ts):
    """
    Find the gradient of the log likelihood with respect to the given time series.

    Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf

    Returns a 3-element array containing the gradient for the alpha, beta,
    and omega parameters.
    """
    gradient = self._jmodel.gradient(_py2java(self._ctx, Vectors.dense(ts)))
    return _java2py(self._ctx, gradient)

def flattenstruct(df: DataFrame, columnname: str) -> DataFrame:
    """
    From a nested column (struct of primitives), create one column per struct element.

    The routine accesses the JVM under the hood, and calls the Scala routine
    flattenStruct. Make sure you have the fink_broker jar in your classpath.

    Example:
    |-- candidate: struct (nullable = true)
    |    |-- jd: double (nullable = true)
    |    |-- fid: integer (nullable = true)

    Would become:
    |-- candidate_jd: double (nullable = true)
    |-- candidate_fid: integer (nullable = true)

    Parameters
    ----------
    df : DataFrame
        Nested Spark DataFrame
    columnname : str
        The name of the column to flatten.

    Returns
    -------
    DataFrame
        Spark DataFrame with new columns from the input column.

    Examples
    -------
    >>> df = spark.read.format("avro").load(ztf_alert_sample)

    # Candidate is nested
    >>> s = df.schema
    >>> typeOf = {i.name: i.dataType.typeName() for i in s.fields}
    >>> typeOf['candidate'] == 'struct'
    True

    # Flatten it
    >>> df_flat = flattenstruct(df, "candidate")
    >>> "candidate_ra" in df_flat.schema.fieldNames()
    True

    # Each new column contains the corresponding struct field
    >>> s_flat = df_flat.schema
    >>> typeOf = {i.name: i.dataType.typeName() for i in s_flat.fields}
    >>> typeOf['candidate_ra'] == 'double'
    True
    """
    sc = get_spark_context()
    obj = sc._jvm.com.astrolabsoftware.fink_broker.catalogUtils
    _df = obj.flattenStruct(df._jdf, columnname)
    df_flatten = _java2py(sc, _df)
    return df_flatten

def get(self, *outputs):
    """
    Parameters
    ----------
    outputs: string, list of strings
        Output variables as defined inside the DML script.
    """
    outs = [_java2py(self.sc, self._java_results.get(out)) for out in outputs]
    if len(outs) == 1:
        return outs[0]
    return outs

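# A minimal usage sketch for get(), assuming `results` is such a wrapper
# returned by a SystemML-style DML script execution (the variable names "w"
# and "b" are illustrative):
def _get_outputs_example(results):
    w = results.get("w")          # a single output returns the object itself
    w, b = results.get("w", "b")  # multiple outputs come back as a list
    return w, b
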
def load(cls, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel.load(
        sc._jsc.sc(), path)
    weights = _java2py(sc, java_model.weights())
    intercept = java_model.intercept()
    numFeatures = java_model.numFeatures()
    numClasses = java_model.numClasses()
    threshold = java_model.getThreshold().get()
    model = LogisticRegressionModel(weights, intercept, numFeatures, numClasses)
    model.setThreshold(threshold)
    return model

def __init__(self, c=0, coefficients=None, jmodel=None, sc=None):
    assert sc is not None, "Missing SparkContext"

    self._ctx = sc
    if jmodel is None:
        self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.ARModel(
            c, _py2java_double_array(self._ctx, coefficients))
    else:
        self._jmodel = jmodel

    self.c = self._jmodel.c()
    self.coefficients = _java2py(self._ctx, self._jmodel.coefficients())

def _transfer_param_map_from_java(self, javaParamMap):
    """
    Transforms a Java ParamMap into a Python ParamMap.
    """
    sc = SparkContext._active_spark_context
    paramMap = dict()
    for pair in javaParamMap.toList():
        param = pair.param()
        if self.hasParam(str(param.name())):
            paramMap[self.getParam(param.name())] = _java2py(sc, pair.value())
    return paramMap

def _transfer_params_from_java(self):
    """
    Transforms the embedded params from the companion Java object.
    """
    sc = SparkContext._active_spark_context
    for param in self.params:
        if self._java_obj.hasParam(param.name):
            java_param = self._java_obj.getParam(param.name)
            # SPARK-14931: Only check set params back to avoid default params mismatch.
            if self._java_obj.isSet(java_param):
                value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                self._set(**{param.name: value})

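# A sketch of how the two sync directions pair up in pyspark.ml wrappers,
# assuming `stage` is a JavaParams subclass with a companion Java object:
def _param_sync_example(stage):
    stage._transfer_params_to_java()    # push Python-side params to the JVM
    stage._transfer_params_from_java()  # pull explicitly set JVM params back
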
def remove_time_dependent_effects(self, ts):
    """
    Given a timeseries, apply inverse operations to obtain the original
    series of underlying errors.

    Parameters
    ----------
    ts:
        Time series of observations with this model's characteristics as a
        Numpy array

    returns the time series with removed time-dependent effects as a Numpy array
    """
    destts = Vectors.dense(np.array([0] * len(ts)))
    result = self._jmodel.removeTimeDependentEffects(
        _py2java(self._ctx, Vectors.dense(ts)), _py2java(self._ctx, destts))
    return _java2py(self._ctx, result.toArray())

def sample(self, n):
    """
    Sample a series of size n assuming an ARIMA(p, d, q) process.

    Parameters
    ----------
    n:
        size of sample

    Returns a series reflecting an ARIMA(p, d, q) process as a DenseVector
    """
    rg = self._ctx._jvm.org.apache.commons.math3.random.JDKRandomGenerator()
    return _java2py(self._ctx, self._jmodel.sample(n, rg))

def sample(self, n):
    """
    Samples a random time series of a given length with the properties of the model.

    Parameters
    ----------
    n:
        The length of the time series to sample.

    Returns the sampled time series.
    """
    rg = self._ctx._jvm.org.apache.commons.math3.random.JDKRandomGenerator()
    return _java2py(self._ctx, self._jmodel.sample(n, rg))

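# A minimal usage sketch for sample(), assuming `model` is one of the fitted
# time series models above with a live SparkContext attached:
def _sample_example(model):
    return model.sample(100)  # a series of 100 simulated observations
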
def __init__(self, c=0.0, coefficients=[], yMaxLag=0, xMaxLag=0, includesOriginalX=True,
             jmodel=None, sc=None):
    assert sc is not None, "Missing SparkContext"

    self._ctx = sc
    if jmodel is None:
        self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.ARXModel(
            float(c), _py2java_double_array(self._ctx, coefficients), yMaxLag, xMaxLag,
            includesOriginalX)
    else:
        self._jmodel = jmodel

    self.c = self._jmodel.c()
    self.coefficients = _java2py(self._ctx, self._jmodel.coefficients())
    self.yMaxLag = self._jmodel.yMaxLag()
    self.xMaxLag = self._jmodel.xMaxLag()

def add_time_dependent_effects(self, ts):
    """
    Given a timeseries, apply a model to it.

    Parameters
    ----------
    ts:
        Time series of i.i.d. observations as a Numpy array

    returns the time series with added time-dependent effects as a Numpy array
    """
    destts = Vectors.dense([0] * len(ts))
    result = self._jmodel.addTimeDependentEffects(
        _py2java(self._ctx, Vectors.dense(ts)), _py2java(self._ctx, destts))
    return _java2py(self._ctx, result.toArray())

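# A round-trip sketch pairing add_time_dependent_effects with its inverse,
# remove_time_dependent_effects, defined earlier (model and data illustrative):
def _round_trip_example(model):
    import numpy as np
    errors = np.random.normal(size=50)
    observed = model.add_time_dependent_effects(errors)
    return model.remove_time_dependent_effects(observed)  # ~= errors
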
def log_likelihood_css(self, y):
    """
    log likelihood based on conditional sum of squares

    Source: http://www.nuffield.ox.ac.uk/economics/papers/1997/w6/ma.pdf

    Parameters
    ----------
    y:
        time series as a DenseVector

    returns log likelihood as a double
    """
    likelihood = self._jmodel.logLikelihoodCSS(_py2java(self._ctx, y))
    return _java2py(self._ctx, likelihood)