예제 #1
0
def fixInvalidLengths(F1: Frame, mask: Matrix,
                      **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Build the multi-output operation node for the 'fixInvalidLengths' builtin.

    :param F1: frame forwarded as the named input 'F1'; its ``sds_context`` is
        used for every node created here
    :param mask: matrix forwarded as the named input 'mask'
    :param kwargs: additional named inputs, merged after 'F1' and 'mask'
    :return: the ``MultiReturn`` node, whose outputs are (in order) one Frame
        placeholder followed by three Matrix placeholders
    """
    context = F1.sds_context
    named_inputs = {'F1': F1, 'mask': mask, **kwargs}

    # Placeholder outputs; each one is wired back to the operation below.
    results = [
        Frame(context, ''),
        Matrix(context, ''),
        Matrix(context, ''),
        Matrix(context, ''),
    ]

    multi_op = MultiReturn(context,
                           'fixInvalidLengths',
                           results,
                           named_input_nodes=named_inputs)

    # Every output gets its own single-element list pointing at the operation.
    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
예제 #2
0
def gridSearch(X: Matrix,
               y: Matrix,
               train: str,
               predict: str,
               params: List,
               paramValues: List,
               **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Build the multi-output operation node for the 'gridSearch' builtin.

    The parameter notes below are carried over from the previous generated
    docstring, parts of which were truncated; treat them as hints and confirm
    against the builtin's own documentation.

    :param X: matrix forwarded as the named input 'X'; its ``sds_context`` is
        used for every node created here
    :param y: matrix forwarded as the named input 'y'
    :param train: Name ft of the train function to call via ft(trainArgs)
    :param predict: Name fp of the loss function to call via fp((predictArgs,B))
    :param params: forwarded as the named input 'params' (presumably the
        hyper-parameter names -- confirm)
    :param paramValues: forwarded as the named input 'paramValues' (presumably
        the candidate values per hyper-parameter -- confirm)
    :param kwargs: additional named inputs. The previous docstring listed
        these optional entries:
        numB -- maximum number of parameters in model B;
        cv -- flag enabling k-fold cross validation, otherwise training loss;
        cvk -- if cv=TRUE, specifies the number of folds, otherwise ignored;
        verbose -- flag for verbose debug output
    :return: the ``MultiReturn`` node, whose outputs are (in order) a Matrix
        placeholder and a Frame placeholder
    """
    context = X.sds_context
    named_inputs = {
        'X': X,
        'y': y,
        'train': train,
        'predict': predict,
        'params': params,
        'paramValues': paramValues,
        **kwargs,
    }

    # Placeholder outputs; each one is wired back to the operation below.
    results = [
        Matrix(context, ''),
        Frame(context, ''),
    ]

    multi_op = MultiReturn(context, 'gridSearch', results,
                           named_input_nodes=named_inputs)

    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
예제 #3
0
def hyperband(X_train: Matrix,
              y_train: Matrix,
              X_val: Matrix,
              y_val: Matrix,
              params: List,
              paramRanges: Matrix,
              **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Build the multi-output operation node for the 'hyperband' builtin.

    :param X_train: matrix forwarded as the named input 'X_train'; its
        ``sds_context`` is used for every node created here
    :param y_train: matrix forwarded as the named input 'y_train'
    :param X_val: matrix forwarded as the named input 'X_val'
    :param y_val: matrix forwarded as the named input 'y_val'
    :param params: forwarded as the named input 'params'
    :param paramRanges: forwarded as the named input 'paramRanges'. The
        previous docstring described (under a garbled name) one entry per
        hyper parameter, first column specifies min, second column max value
        -- presumably this argument; confirm
    :param kwargs: additional named inputs. The previous docstring listed:
        verbose -- If TRUE print messages are activated
    :return: the ``MultiReturn`` node, whose outputs are (in order) a Matrix
        placeholder and a Frame placeholder
    """
    context = X_train.sds_context
    named_inputs = {
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val,
        'params': params,
        'paramRanges': paramRanges,
        **kwargs,
    }

    # Placeholder outputs; each one is wired back to the operation below.
    results = [
        Matrix(context, ''),
        Frame(context, ''),
    ]

    multi_op = MultiReturn(context, 'hyperband', results,
                           named_input_nodes=named_inputs)

    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
예제 #4
0
def hyperband(X_train: Matrix, y_train: Matrix, X_val: Matrix, y_val: Matrix,
              params: Iterable, paramRanges: Matrix,
              **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Build the multi-output operation node for the 'hyperband' builtin.

    :param X_train: matrix forwarded as the named input 'X_train'; its
        ``sds_context`` is used for every node created here
    :param y_train: matrix forwarded as the named input 'y_train'
    :param X_val: matrix forwarded as the named input 'X_val'
    :param y_val: matrix forwarded as the named input 'y_val'
    :param params: iterable forwarded as the named input 'params'
    :param paramRanges: matrix forwarded as the named input 'paramRanges'
    :param kwargs: additional named inputs, merged after the ones above
    :return: the ``MultiReturn`` node, whose outputs are (in order) a Matrix
        placeholder and a Frame placeholder
    """
    context = X_train.sds_context
    named_inputs = dict(
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        params=params,
        paramRanges=paramRanges,
    )
    named_inputs.update(kwargs)

    # Placeholder outputs; each one is wired back to the operation below.
    matrix_out = Matrix(context, '')
    frame_out = Frame(context, '')
    results = [matrix_out, frame_out]

    multi_op = MultiReturn(context, 'hyperband', results,
                           named_input_nodes=named_inputs)

    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
예제 #5
0
    def from_pandas(self, df: pd.DataFrame,
                    *args: Sequence[VALID_INPUT_TYPES], **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Frame:
        """Generate a DAGNode representing a frame whose data comes from a pandas dataframe.

        The node is a "read" operation on a placeholder path; ``df`` is handed
        to the Frame as ``local_data`` and is also stored on this object as
        ``_pd_dataframe``. According to the original documentation the data is
        sent to SystemDS on need.

        :param df: the pandas dataframe; ``df.shape`` must have one or two entries
        :param args: unnamed parameters, appended after the placeholder path
        :param kwargs: named parameters; applied last, so they override the
            derived 'rows', 'cols' and 'data_type' entries
        :return: A Frame
        :raises ValueError: if ``df.shape`` has neither one nor two entries
        """
        dims = df.shape
        if len(dims) == 2:
            row_count, col_count = dims[0], dims[1]
        elif len(dims) == 1:
            # One-dimensional input is treated as a single column.
            row_count, col_count = dims[0], 1
        else:
            # TODO Support tensors.
            raise ValueError("Only two dimensional arrays supported")

        positional = ["'./tmp/{file_name}'", *args]
        named = {'rows': row_count, 'cols': col_count, 'data_type': '"frame"'}

        self._pd_dataframe = df

        named.update(kwargs)
        return Frame(self, "read", positional, named, local_data=df)
예제 #6
0
    def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode:
        """ Read a file from disk. Supported types include:
        CSV, Matrix Market(coordinate), Text(i,j,v), SystemDS Binary, etc.
        See: http://apache.github.io/systemds/site/dml-language-reference#readwrite-built-in-functions for more details

        If a metadata file ``<path>.mtd`` exists, its "data_type" entry replaces
        any ``data_type`` given in ``kwargs``. The resulting data type selects
        the kind of node that is returned.

        :param path: location of the file; str and other ``os.PathLike``
            objects (e.g. ``pathlib.Path``) are accepted
        :param kwargs: named arguments of the read operation, e.g. ``data_type``,
            ``format`` and (for scalars) ``value_type``
        :return: an Operation Node, containing the read data the operationNode read can be of types, Matrix, Frame or Scalar.
        """
        # Normalize to a plain string first: path objects such as pathlib.Path
        # do not support `+ ".mtd"`. A str input is returned unchanged.
        path = os.fspath(path)
        mdt_filepath = path + ".mtd"
        if os.path.exists(mdt_filepath):
            with open(mdt_filepath) as jspec_file:
                mtd = json.load(jspec_file)
                kwargs["data_type"] = mtd["data_type"]

        data_type = kwargs.get("data_type", None)
        file_format = kwargs.get("format", None)
        if data_type == "matrix":
            kwargs["data_type"] = f'"{data_type}"'
            return Matrix(self, "read", [f'"{path}"'], named_input_nodes=kwargs)
        elif data_type == "frame":
            kwargs["data_type"] = f'"{data_type}"'
            # Only string formats are wrapped in double quotes.
            if isinstance(file_format, str):
                kwargs["format"] = f'"{kwargs["format"]}"'
            return Frame(self, "read", [f'"{path}"'], named_input_nodes=kwargs)
        elif data_type == "scalar":
            kwargs["data_type"] = f'"{data_type}"'
            output_type = OutputType.from_str(kwargs.get("value_type", None))
            kwargs["value_type"] = f'"{output_type.name}"'
            return Scalar(self, "read", [f'"{path}"'], named_input_nodes=kwargs, output_type=output_type)
        elif data_type == "list":
            # Reading a list takes no extra arguments, so kwargs are not forwarded.
            return List(self, "read", [f'"{path}"'])

        # Unknown or missing data type: fall back to a generic node and warn.
        kwargs["data_type"] = None
        print("WARNING: Unknown type read please add a mtd file, or specify in arguments")
        return OperationNode(self, "read", [f'"{path}"'], named_input_nodes=kwargs)
예제 #7
0
def correctTypos(strings: Frame, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Build the multi-output operation node for the 'correctTypos' builtin.

    :param strings: frame forwarded as the named input 'strings'; its
        ``sds_context`` is used for every node created here
    :param kwargs: additional named inputs. The previous docstring listed
        these optional entries:
        frequency_threshold -- Strings that occur above this frequency level
        will not be corrected;
        distance_threshold -- Max distance at which strings are considered
        similar;
        is_verbose -- Print debug information
    :return: the ``MultiReturn`` node, whose outputs are (in order) Frame,
        Scalar, Scalar, Matrix and Frame placeholders
    """
    context = strings.sds_context
    named_inputs = {'strings': strings, **kwargs}

    # Placeholder outputs; each one is wired back to the operation below.
    results = [
        Frame(context, ''),
        Scalar(context, ''),
        Scalar(context, ''),
        Matrix(context, ''),
        Frame(context, ''),
    ]

    multi_op = MultiReturn(context,
                           'correctTypos',
                           results,
                           named_input_nodes=named_inputs)

    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
예제 #8
0
파일: mice.py 프로젝트: A-Postl/systemds
def mice(X: Matrix, cMask: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Build the multi-output operation node for the 'mice' builtin.

    The optional-parameter notes are carried over from the previous generated
    docstring, whose line breaks were garbled; confirm the wording against the
    builtin's own documentation.

    :param X: matrix forwarded as the named input 'X'; its ``sds_context`` is
        used for every node created here
    :param cMask: matrix forwarded as the named input 'cMask'
    :param kwargs: additional named inputs. The previous docstring listed
        these optional entries:
        iter -- Number of iteration for multiple imputations;
        threshold -- confidence value [0, 1] for robust imputation; values
        will only be imputed if the value has probability greater than
        threshold, only categorical data (reconstructed wording -- confirm);
        verbose -- Boolean value
    :return: the ``MultiReturn`` node, whose outputs are (in order) Matrix,
        Matrix, Scalar, Frame and List placeholders
    """
    context = X.sds_context
    named_inputs = {'X': X, 'cMask': cMask, **kwargs}

    # Placeholder outputs; each one is wired back to the operation below.
    results = [
        Matrix(context, ''),
        Matrix(context, ''),
        Scalar(context, ''),
        Frame(context, ''),
        List(context, ''),
    ]

    multi_op = MultiReturn(context,
                           'mice',
                           results,
                           named_input_nodes=named_inputs)

    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
예제 #9
0
def bandit(X_train: Matrix, Y_train: Matrix, X_test: Matrix, Y_test: Matrix,
           metaList: List, evaluationFunc: str, evalFunHp: Matrix, lp: Frame,
           lpHp: Matrix, primitives: Frame, param: Frame, baseLineScore: float,
           cv: bool, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Build the multi-output operation node for the 'bandit' builtin.

    Every explicit argument is forwarded unchanged as a named input under its
    own parameter name; ``X_train.sds_context`` is used for every node created
    here.

    :param kwargs: additional named inputs, merged after the explicit ones
    :return: the ``MultiReturn`` node, whose outputs are (in order) Frame,
        Matrix, Matrix and Frame placeholders
    """
    context = X_train.sds_context
    named_inputs = dict(
        X_train=X_train,
        Y_train=Y_train,
        X_test=X_test,
        Y_test=Y_test,
        metaList=metaList,
        evaluationFunc=evaluationFunc,
        evalFunHp=evalFunHp,
        lp=lp,
        lpHp=lpHp,
        primitives=primitives,
        param=param,
        baseLineScore=baseLineScore,
        cv=cv,
    )
    named_inputs.update(kwargs)

    # Placeholder outputs; each one is wired back to the operation below.
    results = [
        Frame(context, ''),
        Matrix(context, ''),
        Matrix(context, ''),
        Frame(context, ''),
    ]

    multi_op = MultiReturn(context,
                           'bandit',
                           results,
                           named_input_nodes=named_inputs)

    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
예제 #10
0
def topk_cleaning(dataTrain: Frame, primitives: Frame, parameters: Frame,
                  evaluationFunc: str, evalFunHp: Matrix,
                  **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Build the multi-output operation node for the 'topk_cleaning' builtin.

    Every explicit argument is forwarded unchanged as a named input under its
    own parameter name; ``dataTrain.sds_context`` is used for every node
    created here.

    :param kwargs: additional named inputs, merged after the explicit ones
    :return: the ``MultiReturn`` node, whose outputs are (in order) Frame,
        Matrix, Matrix, Scalar, Matrix and Frame placeholders
    """
    context = dataTrain.sds_context
    named_inputs = {
        'dataTrain': dataTrain,
        'primitives': primitives,
        'parameters': parameters,
        'evaluationFunc': evaluationFunc,
        'evalFunHp': evalFunHp,
        **kwargs,
    }

    # Placeholder outputs; each one is wired back to the operation below.
    results = [
        Frame(context, ''),
        Matrix(context, ''),
        Matrix(context, ''),
        Scalar(context, ''),
        Matrix(context, ''),
        Frame(context, ''),
    ]

    multi_op = MultiReturn(context,
                           'topk_cleaning',
                           results,
                           named_input_nodes=named_inputs)

    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
예제 #11
0
def bandit(X_train: Matrix, Y_train: Matrix, metaList: Iterable,
           targetList: Iterable, lp: Frame, primitives: Frame, param: Frame,
           **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """Build the multi-output operation node for the 'bandit' builtin.

    Every explicit argument is forwarded unchanged as a named input under its
    own parameter name; ``X_train.sds_context`` is used for every node created
    here.

    :param kwargs: additional named inputs, merged after the explicit ones
    :return: the ``MultiReturn`` node, whose outputs are (in order) Frame,
        Matrix, Matrix and Frame placeholders
    """
    context = X_train.sds_context
    named_inputs = {
        'X_train': X_train,
        'Y_train': Y_train,
        'metaList': metaList,
        'targetList': targetList,
        'lp': lp,
        'primitives': primitives,
        'param': param,
        **kwargs,
    }

    # Placeholder outputs; each one is wired back to the operation below.
    results = [
        Frame(context, ''),
        Matrix(context, ''),
        Matrix(context, ''),
        Frame(context, ''),
    ]

    multi_op = MultiReturn(context,
                           'bandit',
                           results,
                           named_input_nodes=named_inputs)

    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
예제 #12
0
 def as_frame(self) -> Frame:
     """Wrap this entry of the source list in an ``as.frame`` Frame node.

     The new node is also stored in the source list's ``_outputs`` under this
     entry's key, replacing whatever was stored there.

     :return: the newly created Frame node
     """
     source = self._list_source
     key = self._key
     entry = source[key]
     frame_node = Frame(self.sds_context, "as.frame", [entry])
     # Register the converted node so the source list hands it out for this key.
     source._outputs[key] = frame_node
     return frame_node