def save(self, sc, path):
    """
    Save a IsotonicRegressionModel.

    The boundaries and predictions are turned into plain lists, passed
    through ``_py2java`` and used, together with ``self.isotonic``, to build
    a JVM ``IsotonicRegressionModel`` whose own ``save`` does the writing.

    :param sc: SparkContext giving access to the JVM gateway.
    :param path: Destination handed to the JVM model's ``save``.
    """
    boundaries_j = _py2java(sc, self.boundaries.tolist())
    predictions_j = _py2java(sc, self.predictions.tolist())
    regression_pkg = sc._jvm.org.apache.spark.mllib.regression
    jvm_model = regression_pkg.IsotonicRegressionModel(
        boundaries_j, predictions_j, self.isotonic)
    jvm_model.save(sc._jsc.sc(), path)
def save(self, sc, path):
    """
    Save this NaiveBayesModel to the given path.

    ``labels``, ``pi`` and ``theta`` are converted to lists, passed through
    ``_py2java`` and given to the JVM ``NaiveBayesModel`` constructor; the
    resulting JVM object performs the save.

    :param sc: SparkContext giving access to the JVM gateway.
    :param path: Destination handed to the JVM model's ``save``.
    """
    nb_class = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel
    jvm_model = nb_class(
        _py2java(sc, self.labels.tolist()),
        _py2java(sc, self.pi.tolist()),
        _py2java(sc, self.theta.tolist()),
    )
    jvm_model.save(sc._jsc.sc(), path)
def save(self, sc, path):
    """
    Save this model to the given path.

    Every entry of ``self.centers`` goes through ``_convert_to_vector``; the
    resulting list is passed through ``_py2java`` and used to build a JVM
    ``KMeansModel`` that performs the save.

    :param sc: SparkContext giving access to the JVM gateway.
    :param path: Destination handed to the JVM model's ``save``.
    """
    center_vectors = list(map(_convert_to_vector, self.centers))
    kmeans_class = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel
    jvm_model = kmeans_class(_py2java(sc, center_vectors))
    jvm_model.save(sc._jsc.sc(), path)
def forecast(self, ts, nfuture):
    """
    Provide fitted values for timeseries ts as 1-step ahead forecasts, based
    on current model parameters, followed by `nfuture` periods of forecast.

    AR terms prior to the start of the series are assumed equal to the
    model's intercept term (or 0.0, if fit without an intercept term), and
    MA terms prior to the start are assumed to be 0.0. If there is
    differencing, the first d terms come from the original series.

    Parameters
    ----------
    ts:
        Timeseries to use as gold-standard. Each value (i) in the returning
        series is a 1-step ahead forecast of ts(i). The difference
        ts(i) - estimate(i) is the error at time i, which is used for the
        moving average terms. Numpy array.
    nfuture:
        Periods in the future to forecast (beyond length of ts)

    Returns a series consisting of fitted 1-step ahead forecasts for
    historicals and then `nfuture` periods of forecasts. Note that in the
    future values error terms become zero and prior predictions are used for
    any AR terms.
    """
    java_series = _py2java(self._ctx, Vectors.dense(ts))
    java_forecast = self._jmodel.forecast(java_series, nfuture)
    return _java2py(self._ctx, java_forecast)
def save(self, sc, path):
    """
    Save this model to the given path.

    A JVM ``LogisticRegressionModel`` is built from the coefficients (passed
    through ``_py2java``), the intercept, the number of features and the
    number of classes; that JVM object performs the save.

    :param sc: SparkContext giving access to the JVM gateway.
    :param path: Destination handed to the JVM model's ``save``.
    """
    lr_class = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel
    constructor_args = (
        _py2java(sc, self._coeff),
        self.intercept,
        self.numFeatures,
        self.numClasses,
    )
    lr_class(*constructor_args).save(sc._jsc.sc(), path)
def save(self, sc, path):
    """
    Save this model to the given path.

    Builds a JVM ``SVMModel`` from the coefficients (passed through
    ``_py2java``) and the intercept, then delegates to its ``save``.

    :param sc: SparkContext giving access to the JVM gateway.
    :param path: Destination handed to the JVM model's ``save``.
    """
    svm_class = sc._jvm.org.apache.spark.mllib.classification.SVMModel
    jvm_model = svm_class(_py2java(sc, self._coeff), self.intercept)
    jvm_model.save(sc._jsc.sc(), path)
def _call_java(sc, java_obj, name, *args):
    """
    Invoke the attribute ``name`` of ``java_obj`` with converted arguments.

    Method copied from pyspark.ml.wrapper. Uses private Spark APIs.

    Each positional argument is passed through ``_py2java`` before the call
    and the call's result is passed through ``_java2py`` before returning.
    """
    method = getattr(java_obj, name)
    raw_result = method(*(_py2java(sc, arg) for arg in args))
    return _java2py(sc, raw_result)
def autofit(ts, maxp=5, maxd=2, maxq=5, sc=None):
    """
    Utility function to help in fitting an automatically selected ARIMA model based on approximate
    Akaike Information Criterion (AIC) values. The model search is based on the heuristic
    developed by Hyndman and Khandakar (2008) and described in
    [[http://www.jstatsoft.org/v27/i03/paper]]. In contrast to the algorithm in the paper, we use
    an approximation to the AIC, rather than an exact value. Note that if the maximum differencing
    order provided does not suffice to induce stationarity, the function returns a failure, with
    the appropriate message. Additionally, note that the heuristic only considers models that have
    parameters satisfying the stationarity/invertibility constraints. Finally, note that our
    algorithm is slightly more lenient than the original heuristic. For example, the original
    heuristic rejects models with parameters "close" to violating stationarity/invertibility. We
    only reject those that actually violate it.

    This functionality is even less mature than some of the other model fitting functions here, so
    use it with caution.

    Parameters
    ----------
    ts:
        time series to which to automatically fit an ARIMA model
    maxp:
        limit for the AR order
    maxd:
        limit for differencing order
    maxq:
        limit for the MA order
    sc:
        The SparkContext, required.

    returns an ARIMAModel

    Raises an AssertionError ("Missing SparkContext") when ``sc`` is None. The check is an
    ``assert`` statement, so it is skipped when Python runs with ``-O``.
    """
    # Fail early with a clear message instead of an AttributeError on `sc._jvm`.
    assert sc is not None, "Missing SparkContext"
    jmodel = sc._jvm.com.cloudera.sparkts.models.ARIMA.autoFit(_py2java(sc, ts), maxp, maxd, maxq)
    return ARIMAModel(jmodel=jmodel, sc=sc)
def perform_pca(matrix, row_count, nr_principal_components=2):
    """Return principal components of the input matrix.

    This function uses MLlib's ``RowMatrix`` to compute principal components.

    Args:
        matrix: An RDD[int, (int, float)] representing a sparse matrix. This
            is returned by ``center_matrix`` but it is not required to center
            the matrix first.
        row_count: The size (N) of the N x N ``matrix``.
        nr_principal_components: Number of components we want to obtain. This
            value must be less than or equal to the number of rows in the input
            square matrix.

    Returns:
        An array of ``nr_principal_components`` columns, and same number of rows
        as the input ``matrix``. This array is a ``numpy`` array.
    """
    # Look the active context up once; it is needed for both conversions and
    # for reaching the JVM RowMatrix class.
    sc = pyspark.SparkContext._active_spark_context

    # Each RDD element becomes a sparse vector of length ``row_count``.
    py_rdd = matrix.map(lambda row: linalg.Vectors.sparse(row_count, row))

    # RowMatrix is constructed from the ``.rdd()`` of the converted RDD.
    scala_rdd = mllib_common._py2java(sc, py_rdd).rdd()
    row_matrix = sc._jvm.org.apache.spark.mllib.linalg.distributed.RowMatrix(scala_rdd)

    java_pca = row_matrix.computePrincipalComponents(nr_principal_components)
    return mllib_common._java2py(sc, java_pca).toArray()
def log_likelihood(self, ts):
    """
    Returns the log likelihood of the parameters on the given time series.

    Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf

    ``ts`` is wrapped with ``Vectors.dense`` and passed through ``_py2java``
    before the Java model's ``logLikelihood`` is called; the result is passed
    through ``_java2py``.
    """
    java_series = _py2java(self._ctx, Vectors.dense(ts))
    java_value = self._jmodel.logLikelihood(java_series)
    return _java2py(self._ctx, java_value)
def _make_java_param_pair(self, param, value):
    """
    Makes a Java param pair.

    ``param`` is resolved with ``self._resolveParam``, the Java param of the
    same name is fetched from ``self._java_obj`` and its ``w`` method is
    called with ``value`` passed through ``_py2java``.
    """
    active_sc = SparkContext._active_spark_context
    resolved = self._resolveParam(param)
    jvm_param = self._java_obj.getParam(resolved.name)
    return jvm_param.w(_py2java(active_sc, value))
def add_time_dependent_effects(self, ts, destts):
    """
    Given a timeseries, apply an ARIMA(p, d, q) model to it.
    We assume that prior MA terms are 0.0 and prior AR terms are equal to the
    intercept or 0.0 if fit without an intercept

    Parameters
    ----------
    ts:
        Time series of i.i.d. observations as a DenseVector
    destts:
        Time series with added time-dependent effects as a DenseVector.

    returns the dest series, representing the application of the model to
    provided error terms, for convenience.
    """
    ctx = self._ctx
    java_ts = _py2java(ctx, ts)
    java_dest = _py2java(ctx, destts)
    return _java2py(ctx, self._jmodel.addTimeDependentEffects(java_ts, java_dest))
def remove_time_dependent_effects(self, ts, destts):
    """
    Given a timeseries, assume that it is the result of an ARIMA(p, d, q)
    process, and apply inverse operations to obtain the original series of
    underlying errors.
    To do so, we assume prior MA terms are 0.0, and prior AR are equal to the
    model's intercept or 0.0 if fit without an intercept

    Parameters
    ----------
    ts:
        Time series of observations with this model's characteristics as a DenseVector
    destts:
        Time series with removed time-dependent effects as a DenseVector.

    returns The dest series, representing remaining errors, for convenience.
    """
    ctx = self._ctx
    java_ts = _py2java(ctx, ts)
    java_dest = _py2java(ctx, destts)
    return _java2py(ctx, self._jmodel.removeTimeDependentEffects(java_ts, java_dest))
def _new_java_obj(sc, java_class, *args):
    """
    Construct a new Java object.

    ``java_class`` is a dotted name; it is resolved one segment at a time
    starting from the object returned by ``_jvm()``. The positional arguments
    are passed through ``_py2java`` before the resolved target is called.
    """
    target = _jvm()
    for segment in java_class.split("."):
        target = getattr(target, segment)
    return target(*(_py2java(sc, arg) for arg in args))
def gradient(self, ts):
    """
    Find the gradient of the log likelihood with respect to the given time series.

    Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf

    Returns a 3-element array containing the gradient for the alpha, beta, and omega parameters.
    """
    java_series = _py2java(self._ctx, Vectors.dense(ts))
    java_gradient = self._jmodel.gradient(java_series)
    return _java2py(self._ctx, java_gradient)
def _new_java_obj(java_class, *args):
    """
    Returns a new Java object.

    The dotted ``java_class`` name is resolved attribute by attribute from
    the object returned by ``_jvm()``. Arguments are passed through
    ``_py2java`` using the active SparkContext before the call.
    """
    active_sc = SparkContext._active_spark_context
    constructor = _jvm()
    for part in java_class.split("."):
        constructor = getattr(constructor, part)
    converted_args = [_py2java(active_sc, value) for value in args]
    return constructor(*converted_args)
def remove_time_dependent_effects(self, ts):
    """
    Given a timeseries, apply inverse operations to obtain the original series of underlying errors.

    Parameters
    ----------
    ts:
        Time series of observations with this model's characteristics as a Numpy array

    returns the time series with removed time-dependent effects as a Numpy array
    """
    # Zero-filled destination vector with the same length as the input.
    zero_dest = Vectors.dense(np.array([0] * len(ts)))
    java_ts = _py2java(self._ctx, Vectors.dense(ts))
    java_dest = _py2java(self._ctx, zero_dest)
    java_result = self._jmodel.removeTimeDependentEffects(java_ts, java_dest)
    return _java2py(self._ctx, java_result.toArray())
def add_time_dependent_effects(self, ts):
    """
    Given a timeseries, apply a model to it.

    Parameters
    ----------
    ts:
        Time series of i.i.d. observations as a Numpy array

    returns the time series with added time-dependent effects as a Numpy array.
    """
    # Zero-filled destination vector with the same length as the input.
    zero_dest = Vectors.dense([0] * len(ts))
    java_ts = _py2java(self._ctx, Vectors.dense(ts))
    java_dest = _py2java(self._ctx, zero_dest)
    java_result = self._jmodel.addTimeDependentEffects(java_ts, java_dest)
    return _java2py(self._ctx, java_result.toArray())
def log_likelihood_css(self, y):
    """
    log likelihood based on conditional sum of squares

    Source: http://www.nuffield.ox.ac.uk/economics/papers/1997/w6/ma.pdf

    Parameters
    ----------
    y:
        time series as a DenseVector

    returns log likelihood as a double
    """
    java_series = _py2java(self._ctx, y)
    java_value = self._jmodel.logLikelihoodCSS(java_series)
    return _java2py(self._ctx, java_value)
def approx_aic(self, ts):
    """
    Calculates an approximation to the Akaike Information Criterion (AIC). This is an approximation
    as we use the conditional likelihood, rather than the exact likelihood. Please see
    [[https://en.wikipedia.org/wiki/Akaike_information_criterion]] for more information on this
    measure.

    Parameters
    ----------
    ts:
        the timeseries to evaluate under current model

    Returns an approximation to the AIC under the current model as a double
    """
    java_series = _py2java(self._ctx, Vectors.dense(ts))
    # The Java call's result is returned as-is (no ``_java2py`` step).
    return self._jmodel.approxAIC(java_series)
def fit_model(ts, sc=None):
    """
    Fits an AR(1) + GARCH(1, 1) model to the given time series.

    Parameters
    ----------
    ts:
        the time series to which we want to fit a AR+GARCH model as a Numpy array
    sc:
        The SparkContext, required.

    Returns an ARGARCH model

    Raises an AssertionError ("Missing SparkContext") when ``sc`` is None. The check is an
    ``assert`` statement, so it is skipped when Python runs with ``-O``.
    """
    # Identity comparison is the correct way to test for None (PEP 8).
    assert sc is not None, "Missing SparkContext"

    jvm = sc._jvm
    jmodel = jvm.com.cloudera.sparkts.models.ARGARCH.fitModel(_py2java(sc, Vectors.dense(ts)))
    return ARGARCHModel(jmodel=jmodel, sc=sc)
def fit_model(ts, maxLag=1, noIntercept=False, sc=None):
    """
    Fits an autoregression model to the given time series (AR(1) with the default ``maxLag``).

    Parameters
    ----------
    ts:
        the time series to which we want to fit an autoregression model as a Numpy array
    maxLag:
        forwarded unchanged to the Java ``Autoregression.fitModel``; presumably the maximum
        lag order of the model (confirm against the Java API). Default is 1.
    noIntercept:
        forwarded unchanged to the Java ``Autoregression.fitModel``; presumably fits the model
        without an intercept term when true (confirm against the Java API). Default is False.
    sc:
        The SparkContext, required.

    Returns an ARModel

    Raises an AssertionError ("Missing SparkContext") when ``sc`` is None. The check is an
    ``assert`` statement, so it is skipped when Python runs with ``-O``.
    """
    # Identity comparison is the correct way to test for None (PEP 8).
    assert sc is not None, "Missing SparkContext"

    jvm = sc._jvm
    jmodel = jvm.com.cloudera.sparkts.models.Autoregression.fitModel(
        _py2java(sc, Vectors.dense(ts)), maxLag, noIntercept)
    return ARModel(jmodel=jmodel, sc=sc)
def fit_model(p, d, q, ts, includeIntercept=True, method="css-cgd", userInitParams=None, sc=None):
    """
    Given a time series, fit a non-seasonal ARIMA model of order (p, d, q), where p represents
    the autoregression terms, d represents the order of differencing, and q moving average error
    terms. If includeIntercept is true, the model is fitted with an intercept. In order to select
    the appropriate order of the model, users are advised to inspect ACF and PACF plots, or
    compare the values of the objective function. Finally, while the current implementation of
    `fitModel` verifies that parameters fit stationarity and invertibility requirements, there is
    currently no function to transform them if they do not. It is up to the user to make these
    changes as appropriate (or select a different model specification)

    Parameters
    ----------
    p:
        autoregressive order
    d:
        differencing order
    q:
        moving average order
    ts:
        time series to which to fit an ARIMA(p, d, q) model as a Numpy array.
    includeIntercept:
        if true the model is fit with an intercept term. Default is true
    method:
        objective function and optimization method, current options are 'css-bobyqa', and
        'css-cgd'. Both optimize the log likelihood in terms of the conditional sum of squares.
        The first uses BOBYQA for optimization, while the second uses conjugate gradient descent.
        Default is 'css-cgd'
    userInitParams:
        A set of user provided initial parameters for optimization as a float list. If None
        (default), initialized using Hannan-Rissanen algorithm. If provided, order of parameter
        should be: intercept term, AR parameters (in increasing order of lag), MA parameters (in
        increasing order of lag).
    sc:
        The SparkContext, required.

    returns an ARIMAModel

    Raises an AssertionError ("Missing SparkContext") when ``sc`` is None. The check is an
    ``assert`` statement, so it is skipped when Python runs with ``-O``.
    """
    # Identity comparison is the correct way to test for None (PEP 8).
    assert sc is not None, "Missing SparkContext"

    jvm = sc._jvm
    jmodel = jvm.com.cloudera.sparkts.models.ARIMA.fitModel(
        p, d, q,
        _py2java(sc, Vectors.dense(ts)),
        includeIntercept,
        method,
        _py2java_double_array(sc, userInitParams))
    return ARIMAModel(jmodel=jmodel, sc=sc)
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

    The Java object is created from this instance's uid, the Java form of the
    best model and an empty list passed through ``_py2java``; the evaluator,
    estimator and estimator param maps from the parent's ``_to_java_impl``
    are then set on it.

    :return: Java object equivalent to this instance.
    """
    active_sc = SparkContext._active_spark_context
    java_model = JavaParams._new_java_obj(
        "org.apache.spark.ml.tuning.CrossValidatorModel",
        self.uid,
        self.bestModel._to_java(),
        _py2java(active_sc, []))

    estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()
    for param_name, param_value in (
            ("evaluator", evaluator),
            ("estimator", estimator),
            ("estimatorParamMaps", epms)):
        java_model.set(param_name, param_value)
    return java_model
def execute(self, script):
    """
    Execute a DML / PyDML script.

    Parameters
    ----------
    script: Script instance
        Script instance defined with the appropriate input and output variables.
        When ``scriptString`` ends with ".dml" (for type "dml") or ".pydml" (for type
        "pydml") it is treated as a file path that must exist; otherwise it is passed
        to the script factory as the script text itself.

    Returns
    -------
    ml_results: MLResults
        MLResults instance.

    Raises
    ------
    ValueError
        If ``script`` is not a ``Script``, if ``script.scriptType`` is neither "dml" nor
        "pydml", or if ``scriptString`` names a script file that does not exist.
    """
    if not isinstance(script, Script):
        raise ValueError("Expected script to be an instance of Script")

    scriptString = script.scriptString
    script_type = script.scriptType
    if script_type == "dml":
        file_suffix = ".dml"
    elif script_type == "pydml":
        file_suffix = ".pydml"
    else:
        # Previously an unknown type left `script_java` unbound and surfaced later
        # as an UnboundLocalError; reject it explicitly instead.
        raise ValueError("Expected script.scriptType to be 'dml' or 'pydml', got %r" % (script_type,))

    # A string carrying the script-file suffix is a path; anything else is inline code.
    from_file = scriptString.endswith(file_suffix)
    if from_file and not os.path.exists(scriptString):
        raise ValueError("path: %s does not exist" % scriptString)

    factory = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory
    if script_type == "dml":
        script_java = factory.dmlFromFile(scriptString) if from_file else factory.dml(scriptString)
    else:
        script_java = factory.pydmlFromFile(scriptString) if from_file else factory.pydml(scriptString)

    # Register the script's inputs (passed through _py2java) and requested outputs.
    for key, val in script._input.items():
        script_java.input(key, _py2java(self._sc, val))
    for val in script._output:
        script_java.out(val)
    return MLResults(self._ml.execute(script_java), self._sc)
def fit_model(ts, sc=None):
    """
    Fits an EWMA model to a time series. Uses the first point in the time series as a starting
    value. Uses sum squared error as an objective function to optimize to find smoothing parameter
    The model for EWMA is recursively defined as S_t = (1 - a) * X_t + a * S_{t-1}, where
    a is the smoothing parameter, X is the original series, and S is the smoothed series
    Note that the optimization is performed as unbounded optimization, although in its formal
    definition the smoothing parameter is <= 1, which corresponds to an inequality bounded
    optimization. Given this, the resulting smoothing parameter should always be sanity checked
    https://en.wikipedia.org/wiki/Exponential_smoothing

    Parameters
    ----------
    ts:
        the time series to which we want to fit an EWMA model as a Numpy array
    sc:
        The SparkContext, required.

    Returns an EWMA model

    Raises an AssertionError ("Missing SparkContext") when ``sc`` is None. The check is an
    ``assert`` statement, so it is skipped when Python runs with ``-O``.
    """
    # Identity comparison is the correct way to test for None (PEP 8).
    assert sc is not None, "Missing SparkContext"

    jvm = sc._jvm
    jmodel = jvm.com.cloudera.sparkts.models.EWMA.fitModel(_py2java(sc, Vectors.dense(ts)))
    return EWMAModel(jmodel=jmodel, sc=sc)
def save(self, sc, path):
    """
    Save this RidgeRegressionModel to the given path.

    Builds a JVM ``RidgeRegressionModel`` from the coefficients (passed
    through ``_py2java``) and the intercept, then delegates to its ``save``.

    :param sc: SparkContext giving access to the JVM gateway.
    :param path: Destination handed to the JVM model's ``save``.
    """
    ridge_class = sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel
    jvm_model = ridge_class(_py2java(sc, self._coeff), self.intercept)
    jvm_model.save(sc._jsc.sc(), path)
def py2java(x):
    """
    Pass ``x`` through ``_py2java`` using the currently active SparkContext.
    """
    return _py2java(SparkContext._active_spark_context, x)
def forecast(self, ts, ts1):
    """
    Call the Java model's ``forecast`` with two series.

    Both ``ts`` and ``ts1`` are wrapped with ``Vectors.dense`` and passed
    through ``_py2java``; the Java result is passed through ``_java2py``
    before being returned.
    """
    ctx = self._ctx
    java_inputs = [_py2java(ctx, Vectors.dense(series)) for series in (ts, ts1)]
    return _java2py(ctx, self._jmodel.forecast(*java_inputs))
def save(self, sc, path):
    """
    Save a LassoModel.

    Builds a JVM ``LassoModel`` from the coefficients (passed through
    ``_py2java``) and the intercept, then delegates to its ``save``.

    :param sc: SparkContext giving access to the JVM gateway.
    :param path: Destination handed to the JVM model's ``save``.
    """
    regression_pkg = sc._jvm.org.apache.spark.mllib.regression
    jvm_model = regression_pkg.LassoModel(_py2java(sc, self._coeff), self.intercept)
    jvm_model.save(sc._jsc.sc(), path)
def _call_java(self, name, *args):
    """
    Invoke the attribute ``name`` of the wrapped Java object.

    Arguments are passed through ``_py2java`` with the active SparkContext
    and the call's result is passed through ``_java2py``.
    """
    java_method = getattr(self._java_obj, name)
    active_sc = SparkContext._active_spark_context
    raw_result = java_method(*(_py2java(active_sc, arg) for arg in args))
    return _java2py(active_sc, raw_result)
def save(self, sc, path):
    """
    Save this LogisticRegressionModel to the given path.

    Builds a JVM ``LogisticRegressionModel`` from the coefficients (passed
    through ``_py2java``) and the intercept, then delegates to its ``save``.

    :param sc: SparkContext giving access to the JVM gateway.
    :param path: Destination handed to the JVM model's ``save``.
    """
    classification_pkg = sc._jvm.org.apache.spark.mllib.classification
    jvm_model = classification_pkg.LogisticRegressionModel(
        _py2java(sc, self._coeff), self.intercept)
    jvm_model.save(sc._jsc.sc(), path)