def fixInvalidLengths(F1: Frame, mask: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Invoke the SystemDS ``fixInvalidLengths`` built-in as a multi-return operation.

    :param F1: input frame
    :param mask: input mask matrix
    :param kwargs: additional named parameters forwarded to the built-in
    :return: ``MultiReturn`` node wrapping one Frame and three Matrix outputs
    """
    params_dict = {'F1': F1, 'mask': mask, **kwargs}
    ctx = F1.sds_context
    # One placeholder node per output of the DML function; the types mirror
    # the built-in's return signature.
    outputs = [Frame(ctx, ''), Matrix(ctx, ''), Matrix(ctx, ''), Matrix(ctx, '')]
    op = MultiReturn(ctx, 'fixInvalidLengths', outputs, named_input_nodes=params_dict)
    # Wire every output back to the multi-return op so the DAG stays connected.
    for node in outputs:
        node._unnamed_input_nodes = [op]
    return op
def gridSearch(X: Matrix, y: Matrix, train: str, predict: str, params: List, paramValues: List, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Invoke the SystemDS ``gridSearch`` built-in as a multi-return operation.

    :param X: input feature matrix
    :param y: input label matrix
    :param train: name ft of the train function to call via ft(trainArgs)
    :param predict: name fp of the loss function to call via fp((predictArgs,B))
    :param params: hyper-parameter names (column vectors of values per
        hyper-parameter go in ``paramValues``)
    :param paramValues: candidate values, one list entry per hyper-parameter
    :param kwargs: additional named parameters forwarded to the built-in,
        e.g. ``cv`` (flag enabling k-fold cross validation, otherwise training
        loss), ``cvk`` (number of folds if cv=TRUE, otherwise ignored) and
        ``verbose`` (flag for verbose debug output)
    :return: ``MultiReturn`` node wrapping a Matrix (the best model, returned
        as a column-major linearized column vector) and a Frame
    """
    params_dict = {'X': X, 'y': y, 'train': train, 'predict': predict,
                   'params': params, 'paramValues': paramValues, **kwargs}
    ctx = X.sds_context
    outputs = [Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'gridSearch', outputs, named_input_nodes=params_dict)
    # Connect each output node back to the multi-return op.
    for node in outputs:
        node._unnamed_input_nodes = [op]
    return op
def hyperband(X_train: Matrix, y_train: Matrix, X_val: Matrix, y_val: Matrix, params: List, paramRanges: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Invoke the SystemDS ``hyperband`` built-in as a multi-return operation.

    :param X_train: training feature matrix
    :param y_train: training label matrix
    :param X_val: validation feature matrix
    :param y_val: validation label matrix
    :param params: hyper-parameter names
    :param paramRanges: one row per hyper-parameter; first column specifies
        min, second column max value
    :param kwargs: additional named parameters forwarded to the built-in,
        e.g. ``verbose`` (if TRUE print messages are activated)
    :return: ``MultiReturn`` node wrapping a Matrix and a Frame output
    """
    params_dict = {'X_train': X_train, 'y_train': y_train, 'X_val': X_val,
                   'y_val': y_val, 'params': params, 'paramRanges': paramRanges,
                   **kwargs}
    ctx = X_train.sds_context
    outputs = [Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'hyperband', outputs, named_input_nodes=params_dict)
    # Connect each output node back to the multi-return op.
    for node in outputs:
        node._unnamed_input_nodes = [op]
    return op
def hyperband(X_train: Matrix, y_train: Matrix, X_val: Matrix, y_val: Matrix, params: Iterable, paramRanges: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Invoke the SystemDS ``hyperband`` built-in as a multi-return operation.

    :param X_train: training feature matrix
    :param y_train: training label matrix
    :param X_val: validation feature matrix
    :param y_val: validation label matrix
    :param params: hyper-parameter names
    :param paramRanges: value ranges, one row per hyper-parameter
    :param kwargs: additional named parameters forwarded to the built-in
    :return: ``MultiReturn`` node wrapping a Matrix and a Frame output
    """
    params_dict = {'X_train': X_train, 'y_train': y_train, 'X_val': X_val,
                   'y_val': y_val, 'params': params, 'paramRanges': paramRanges,
                   **kwargs}
    ctx = X_train.sds_context
    outputs = [Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'hyperband', outputs, named_input_nodes=params_dict)
    # Connect each output node back to the multi-return op.
    for node in outputs:
        node._unnamed_input_nodes = [op]
    return op
def from_pandas(self, df: pd.DataFrame, *args: Sequence[VALID_INPUT_TYPES], **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Frame:
    """Generate a DAGNode representing a frame with data given by a pandas
    dataframe, which will be sent to SystemDS on need.

    :param df: the pandas dataframe
    :param args: unnamed parameters
    :param kwargs: named parameters
    :raises ValueError: if the dataframe has more than two dimensions
    :return: A Frame
    """
    # NOTE(review): '{file_name}' is kept verbatim (not an f-string) —
    # presumably substituted later when the data is transferred; verify.
    unnamed_params = ["'./tmp/{file_name}'"]
    ndims = len(df.shape)
    if ndims == 2:
        rows, cols = df.shape
    elif ndims == 1:
        rows, cols = df.shape[0], 1
    else:
        # TODO Support tensors.
        raise ValueError("Only two dimensional arrays supported")
    named_params = {'rows': rows, 'cols': cols}
    unnamed_params.extend(args)
    named_params["data_type"] = '"frame"'
    # Keep a handle on the local data for the lazy transfer.
    self._pd_dataframe = df
    named_params.update(kwargs)
    return Frame(self, "read", unnamed_params, named_params, local_data=df)
def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode:
    """Read a file from disk. Supported types include:
    CSV, Matrix Market (coordinate), Text (i,j,v), SystemDS Binary, etc.

    See: http://apache.github.io/systemds/site/dml-language-reference#readwrite-built-in-functions
    for more details.

    :param path: path of the file on disk; if a matching ``<path>.mtd``
        metadata file exists, its ``data_type`` overrides any passed in kwargs
    :param kwargs: named read parameters (e.g. ``data_type``, ``format``,
        ``value_type``) forwarded to the DML ``read`` call
    :return: an Operation Node, containing the read data. Depending on the
        detected data type it can be of type Matrix, Frame, Scalar or List.
    """
    # Fix: accept genuine os.PathLike inputs (e.g. pathlib.Path) — the
    # previous `path + ".mtd"` concatenation raised TypeError for them.
    path = os.fspath(path)
    mtd_filepath = path + ".mtd"
    if os.path.exists(mtd_filepath):
        with open(mtd_filepath) as jspec_file:
            mtd = json.load(jspec_file)
            kwargs["data_type"] = mtd["data_type"]
    data_type = kwargs.get("data_type", None)
    file_format = kwargs.get("format", None)
    if data_type == "matrix":
        kwargs["data_type"] = f'"{data_type}"'
        return Matrix(self, "read", [f'"{path}"'], named_input_nodes=kwargs)
    elif data_type == "frame":
        kwargs["data_type"] = f'"{data_type}"'
        # Only quote the format when it was supplied as a plain string.
        if isinstance(file_format, str):
            kwargs["format"] = f'"{kwargs["format"]}"'
        return Frame(self, "read", [f'"{path}"'], named_input_nodes=kwargs)
    elif data_type == "scalar":
        kwargs["data_type"] = f'"{data_type}"'
        output_type = OutputType.from_str(kwargs.get("value_type", None))
        kwargs["value_type"] = f'"{output_type.name}"'
        return Scalar(self, "read", [f'"{path}"'], named_input_nodes=kwargs, output_type=output_type)
    elif data_type == "list":
        # Reading a list have no extra arguments.
        return List(self, "read", [f'"{path}"'])
    # Unknown type: fall back to a generic OperationNode read.
    kwargs["data_type"] = None
    print("WARNING: Unknown type read please add a mtd file, or specify in arguments")
    return OperationNode(self, "read", [f'"{path}"'], named_input_nodes=kwargs)
def correctTypos(strings: Frame, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Invoke the SystemDS ``correctTypos`` built-in as a multi-return operation.

    :param strings: input frame of strings
    :param kwargs: additional named parameters forwarded to the built-in, e.g.
        ``frequency_threshold`` (strings that occur above this frequency level
        will not be corrected), ``distance_threshold`` (max distance at which
        strings are considered similar) and ``is_verbose`` (print debug
        information)
    :return: ``MultiReturn`` node wrapping the built-in's five outputs
        (Frame, Scalar, Scalar, Matrix, Frame)
    """
    params_dict = {'strings': strings, **kwargs}
    ctx = strings.sds_context
    # One placeholder node per output; types mirror the DML return signature.
    outputs = [Frame(ctx, ''), Scalar(ctx, ''), Scalar(ctx, ''),
               Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'correctTypos', outputs, named_input_nodes=params_dict)
    for node in outputs:
        node._unnamed_input_nodes = [op]
    return op
def mice(X: Matrix, cMask: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Invoke the SystemDS ``mice`` built-in (multiple imputation) as a
    multi-return operation.

    :param X: input data matrix
    :param cMask: mask marking categorical columns
    :param kwargs: additional named parameters forwarded to the built-in,
        e.g. ``iter`` (number of iterations for multiple imputations),
        ``threshold`` (confidence value [0, 1] for robust imputation; values
        will only be imputed if their probability exceeds it — categorical
        data only) and ``verbose`` (boolean flag)
    :return: ``MultiReturn`` node wrapping the built-in's five outputs
        (Matrix, Matrix, Scalar, Frame, List)
    """
    params_dict = {'X': X, 'cMask': cMask, **kwargs}
    ctx = X.sds_context
    # One placeholder node per output; types mirror the DML return signature.
    outputs = [Matrix(ctx, ''), Matrix(ctx, ''), Scalar(ctx, ''),
               Frame(ctx, ''), List(ctx, '')]
    op = MultiReturn(ctx, 'mice', outputs, named_input_nodes=params_dict)
    for node in outputs:
        node._unnamed_input_nodes = [op]
    return op
def bandit(X_train: Matrix, Y_train: Matrix, X_test: Matrix, Y_test: Matrix, metaList: List, evaluationFunc: str, evalFunHp: Matrix, lp: Frame, lpHp: Matrix, primitives: Frame, param: Frame, baseLineScore: float, cv: bool, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Invoke the SystemDS ``bandit`` built-in as a multi-return operation.

    :param X_train: training feature matrix
    :param Y_train: training label matrix
    :param X_test: test feature matrix
    :param Y_test: test label matrix
    :param metaList: list of metadata inputs
    :param evaluationFunc: name of the evaluation function
    :param evalFunHp: hyper-parameters for the evaluation function
    :param lp: logical pipeline frame
    :param lpHp: logical pipeline hyper-parameters
    :param primitives: frame of primitive operations
    :param param: frame of parameters
    :param baseLineScore: baseline score to compare against
    :param cv: flag enabling cross validation
    :param kwargs: additional named parameters forwarded to the built-in
    :return: ``MultiReturn`` node wrapping the built-in's four outputs
        (Frame, Matrix, Matrix, Frame)
    """
    params_dict = {'X_train': X_train, 'Y_train': Y_train, 'X_test': X_test,
                   'Y_test': Y_test, 'metaList': metaList,
                   'evaluationFunc': evaluationFunc, 'evalFunHp': evalFunHp,
                   'lp': lp, 'lpHp': lpHp, 'primitives': primitives,
                   'param': param, 'baseLineScore': baseLineScore, 'cv': cv,
                   **kwargs}
    ctx = X_train.sds_context
    outputs = [Frame(ctx, ''), Matrix(ctx, ''), Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'bandit', outputs, named_input_nodes=params_dict)
    for node in outputs:
        node._unnamed_input_nodes = [op]
    return op
def topk_cleaning(dataTrain: Frame, primitives: Frame, parameters: Frame, evaluationFunc: str, evalFunHp: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Invoke the SystemDS ``topk_cleaning`` built-in as a multi-return
    operation.

    :param dataTrain: training data frame
    :param primitives: frame of primitive operations
    :param parameters: frame of parameters
    :param evaluationFunc: name of the evaluation function
    :param evalFunHp: hyper-parameters for the evaluation function
    :param kwargs: additional named parameters forwarded to the built-in
    :return: ``MultiReturn`` node wrapping the built-in's six outputs
        (Frame, Matrix, Matrix, Scalar, Matrix, Frame)
    """
    params_dict = {'dataTrain': dataTrain, 'primitives': primitives,
                   'parameters': parameters, 'evaluationFunc': evaluationFunc,
                   'evalFunHp': evalFunHp, **kwargs}
    ctx = dataTrain.sds_context
    # One placeholder node per output; types mirror the DML return signature.
    outputs = [Frame(ctx, ''), Matrix(ctx, ''), Matrix(ctx, ''),
               Scalar(ctx, ''), Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'topk_cleaning', outputs, named_input_nodes=params_dict)
    for node in outputs:
        node._unnamed_input_nodes = [op]
    return op
def bandit(X_train: Matrix, Y_train: Matrix, metaList: Iterable, targetList: Iterable, lp: Frame, primitives: Frame, param: Frame, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Invoke the SystemDS ``bandit`` built-in as a multi-return operation.

    :param X_train: training feature matrix
    :param Y_train: training label matrix
    :param metaList: list of metadata inputs
    :param targetList: list of target inputs
    :param lp: logical pipeline frame
    :param primitives: frame of primitive operations
    :param param: frame of parameters
    :param kwargs: additional named parameters forwarded to the built-in
    :return: ``MultiReturn`` node wrapping the built-in's four outputs
        (Frame, Matrix, Matrix, Frame)
    """
    params_dict = {'X_train': X_train, 'Y_train': Y_train,
                   'metaList': metaList, 'targetList': targetList, 'lp': lp,
                   'primitives': primitives, 'param': param, **kwargs}
    ctx = X_train.sds_context
    outputs = [Frame(ctx, ''), Matrix(ctx, ''), Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'bandit', outputs, named_input_nodes=params_dict)
    for node in outputs:
        node._unnamed_input_nodes = [op]
    return op
def as_frame(self) -> Frame:
    """Wrap the list entry at this object's key in an ``as.frame`` operation.

    :return: a Frame node produced from the list entry
    """
    entry = self._list_source[self._key]
    frame_node = Frame(self.sds_context, "as.frame", [entry])
    # Register the result with the list source under the same key.
    self._list_source._outputs[self._key] = frame_node
    return frame_node