Пример #1
0
def correctTypos(strings: Frame, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'correctTypos' operation.

    :param strings: Input frame, forwarded as named input 'strings'; its
        ``sds_context`` is used for every node created here
    :param frequency_threshold: Strings that occur above this frequency level will not be corrected
    :param distance_threshold: Max distance at which strings are considered similar
    :param is_verbose: Print debug information
    :return: 'OperationNode' (the MultiReturn created here) containing five
        output nodes, in order: Frame, Scalar, Scalar, Matrix, Frame
    """
    context = strings.sds_context
    # The required input comes first, followed by any optional named arguments.
    named_inputs = {'strings': strings, **kwargs}

    # One placeholder node per value returned by the operation.
    results = (
        Frame(context, ''),
        Scalar(context, ''),
        Scalar(context, ''),
        Matrix(context, ''),
        Frame(context, ''),
    )

    op = MultiReturn(context,
                     'correctTypos',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #2
0
def executePipeline(pipeline: Frame, Xtrain: Matrix, Ytrain: Matrix,
                    Xtest: Matrix, Ytest: Matrix, metaList: List,
                    hyperParameters: Matrix, flagsCount: int, verbose: bool,
                    **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'executePipeline' operation.

    Every positional argument is forwarded to the operation as a named input
    carrying the same name; extra keyword arguments are appended afterwards.

    :param pipeline: Forwarded as named input 'pipeline'; its ``sds_context``
        is used for every node created here
    :param Xtrain: Forwarded as named input 'Xtrain'
    :param Ytrain: Forwarded as named input 'Ytrain'
    :param Xtest: Forwarded as named input 'Xtest'
    :param Ytest: Forwarded as named input 'Ytest'
    :param metaList: Forwarded as named input 'metaList'
    :param hyperParameters: Forwarded as named input 'hyperParameters'
    :param flagsCount: Forwarded as named input 'flagsCount' (no description
        was available in the original documentation)
    :param verbose: Forwarded as named input 'verbose'
    :param test: Optional keyword argument (no description was available in
        the original documentation)
    :return: 'OperationNode' (the MultiReturn created here) containing nine
        output nodes, in order: Matrix, Matrix, Matrix, Matrix, Scalar,
        Matrix, Matrix, Scalar, List
    """
    context = pipeline.sds_context

    named_inputs = dict(
        pipeline=pipeline,
        Xtrain=Xtrain,
        Ytrain=Ytrain,
        Xtest=Xtest,
        Ytest=Ytest,
        metaList=metaList,
        hyperParameters=hyperParameters,
        flagsCount=flagsCount,
        verbose=verbose,
    )
    named_inputs.update(kwargs)

    # Node classes of the nine returned values, in return order.
    result_types = (Matrix, Matrix, Matrix, Matrix, Scalar,
                    Matrix, Matrix, Scalar, List)
    results = tuple(node_type(context, '') for node_type in result_types)

    op = MultiReturn(context,
                     'executePipeline',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #3
0
def garch(X: Matrix, kmax: int, momentum: float, start_stepsize: float,
          end_stepsize: float, start_vicinity: float, end_vicinity: float,
          sim_seed: int, verbose: bool):
    """
    Create the multi-return operation node for the 'garch' operation.

    :param X: The input Matrix the operation is applied to; its
        ``sds_context`` is used for every node created here
    :param kmax: Number of iterations
    :param momentum: Momentum for momentum-gradient descent (set to 0 to deactivate)
    :param start_stepsize: Initial gradient-descent stepsize
    :param end_stepsize: gradient-descent stepsize at end (linear descent)
    :param start_vicinity: proportion of randomness of restart-location for gradient descent at beginning
    :param end_vicinity: same at end (linear decay)
    :param sim_seed: seed for simulation of process on fitted coefficients
    :param verbose: verbosity, comments during fitting
    :return: 'OperationNode' (the MultiReturn created here) containing five
        output nodes, in order: Matrix, Matrix, Scalar, Scalar, Scalar.
        The original documentation lists them as: simulated garch(1,1)
        process on fitted coefficients, variances of simulated fitted
        process, constant term of fitted process, 1-st arch-coefficient of
        fitted process, 1-st garch-coefficient of fitted process
        (presumably in that order -- confirm against the operation's own
        documentation). Documented drawback: slow convergence of
        optimization (sort of simulated annealing/gradient descent).
    """
    context = X.sds_context

    named_inputs = dict(
        X=X,
        kmax=kmax,
        momentum=momentum,
        start_stepsize=start_stepsize,
        end_stepsize=end_stepsize,
        start_vicinity=start_vicinity,
        end_vicinity=end_vicinity,
        sim_seed=sim_seed,
        verbose=verbose,
    )

    # One placeholder node per value returned by the operation.
    results = (
        Matrix(context, ''),
        Matrix(context, ''),
        Scalar(context, ''),
        Scalar(context, ''),
        Scalar(context, ''),
    )

    op = MultiReturn(context,
                     'garch',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #4
0
def multiLogRegPredict(X: Matrix, B: Matrix, Y: Matrix,
                       **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'multiLogRegPredict' operation.

    :param X: Data Matrix X; its ``sds_context`` is used for every node
        created here
    :param B: Regression parameters betas
    :param Y: Response vector Y
    :param verbose: Optional keyword argument (no description was available
        in the original documentation)
    :return: 'OperationNode' (the MultiReturn created here) containing three
        output nodes: matrix m of predicted means/probabilities (Matrix),
        predicted response vector (Matrix), scalar value of accuracy (Scalar)
    """
    context = X.sds_context
    # Required inputs first, followed by any optional named arguments.
    named_inputs = {'X': X, 'B': B, 'Y': Y, **kwargs}

    # Node classes of the three returned values, in return order.
    results = tuple(node_type(context, '')
                    for node_type in (Matrix, Matrix, Scalar))

    op = MultiReturn(context,
                     'multiLogRegPredict',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #5
0
def multiLogRegPredict(X: Matrix, B: Matrix, Y: Matrix,
                       **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'multiLogRegPredict' operation.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for
        every node created here
    :param B: Forwarded as named input 'B'
    :param Y: Forwarded as named input 'Y'
    :param verbose: flag specifying if logging information should be printed
    :return: 'OperationNode' (the MultiReturn created here) containing three
        output nodes, in order: Matrix, Matrix, Scalar (the original
        documentation only mentions a "value of accuracy")
    """
    context = X.sds_context

    named_inputs = dict(X=X, B=B, Y=Y)
    named_inputs.update(kwargs)

    # One placeholder node per value returned by the operation.
    results = (
        Matrix(context, ''),
        Matrix(context, ''),
        Scalar(context, ''),
    )

    op = MultiReturn(context,
                     'multiLogRegPredict',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #6
0
def gmm(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'gmm' operation.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for
        every node created here
    :param n_components: Number of n_components in the Gaussian mixture model
    :param model: "VVV": unequal variance (full), each component has its own
        general covariance matrix (other values are not listed in the
        original documentation)
    :param init_param: initialize weights with "kmeans" or "random"
    :param iterations: Number of iterations
    :param reg_covar: regularization parameter for covariance matrix
    :param tol: tolerance value for convergence
    :return: 'OperationNode' (the MultiReturn created here) containing seven
        output nodes, in order: Matrix, Matrix, Scalar, Scalar, Matrix,
        Matrix, Matrix. The original documentation mentions estimated
        parameters, an information criterion for the best iteration and the
        kth class; NOTE(review): that text was incomplete, so the mapping to
        individual outputs needs confirming.
    """
    context = X.sds_context
    # The required input comes first, followed by any optional named arguments.
    named_inputs = {'X': X, **kwargs}

    # Node classes of the seven returned values, in return order.
    result_types = (Matrix, Matrix, Scalar, Scalar, Matrix, Matrix, Matrix)
    results = tuple(node_type(context, '') for node_type in result_types)

    op = MultiReturn(context,
                     'gmm',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #7
0
def outlierByIQR(X: Matrix, k: float, max_iterations: int,
                 **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'outlierByIQR' operation.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for
        every node created here
    :param k: a constant used to discern outliers k*IQR
    :param isIterative: iterative repair or single repair
    :param repairMethod: values: 0 = delete rows having outliers (further
        values were cut off in the original documentation)
    :param max_iterations: values: 0 = arbitrary number of iterations until
        all outliers are removed (further values were cut off in the
        original documentation)
    :param verbose: flag specifying if logging information should be printed
    :return: 'OperationNode' (the MultiReturn created here) containing six
        output nodes, in order: Matrix, Matrix, Matrix, Matrix, Scalar,
        Scalar. The original documentation mentions a "matrix x with no
        outliers"; NOTE(review): which output that is needs confirming.
    """
    context = X.sds_context

    named_inputs = dict(X=X, k=k, max_iterations=max_iterations)
    named_inputs.update(kwargs)

    # One placeholder node per value returned by the operation.
    results = (
        Matrix(context, ''),
        Matrix(context, ''),
        Matrix(context, ''),
        Matrix(context, ''),
        Scalar(context, ''),
        Scalar(context, ''),
    )

    op = MultiReturn(context,
                     'outlierByIQR',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #8
0
def gmm(X: Matrix, verbose: bool, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'gmm' operation.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for
        every node created here
    :param verbose: Forwarded as named input 'verbose'
    :param kwargs: Additional named inputs, forwarded unchanged after 'X'
        and 'verbose'
    :return: 'OperationNode' (the MultiReturn created here) containing seven
        output nodes, in order: Matrix, Matrix, Scalar, Scalar, Matrix,
        Matrix, Matrix
    """
    context = X.sds_context
    # Required inputs first, followed by any optional named arguments.
    named_inputs = {'X': X, 'verbose': verbose, **kwargs}

    # One placeholder node per value returned by the operation.
    results = (
        Matrix(context, ''),
        Matrix(context, ''),
        Scalar(context, ''),
        Scalar(context, ''),
        Matrix(context, ''),
        Matrix(context, ''),
        Matrix(context, ''),
    )

    op = MultiReturn(context,
                     'gmm',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #9
0
def outlierBySd(X: Matrix, max_iterations: int,
                **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'outlierBySd' operation.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for
        every node created here
    :param k: threshold values 1, 2, 3 for 68%, 95%, 99.7% respectively (3-sigma rule)
    :param repairMethod: values: 0 = delete rows having outliers, 1 = replace outliers as zeros
    :param max_iterations: values: 0 = arbitrary number of iterations until
        all outliers are removed (further values were cut off in the
        original documentation)
    :return: 'OperationNode' (the MultiReturn created here) containing five
        output nodes, in order: Matrix, Matrix, Matrix, Scalar, Scalar
    """
    context = X.sds_context
    # Required inputs first, followed by any optional named arguments.
    named_inputs = {'X': X, 'max_iterations': max_iterations, **kwargs}

    # Node classes of the five returned values, in return order.
    result_types = (Matrix, Matrix, Matrix, Scalar, Scalar)
    results = tuple(node_type(context, '') for node_type in result_types)

    op = MultiReturn(context,
                     'outlierBySd',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #10
0
def topk_cleaning(dataTrain: Frame, primitives: Frame, parameters: Frame,
                  evaluationFunc: str, evalFunHp: Matrix,
                  **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'topk_cleaning' operation.

    Every positional argument is forwarded to the operation as a named input
    carrying the same name; extra keyword arguments are appended afterwards.

    :param dataTrain: Forwarded as named input 'dataTrain'; its
        ``sds_context`` is used for every node created here
    :param primitives: Forwarded as named input 'primitives'
    :param parameters: Forwarded as named input 'parameters'
    :param evaluationFunc: Forwarded as named input 'evaluationFunc'
    :param evalFunHp: Forwarded as named input 'evalFunHp'
    :return: 'OperationNode' (the MultiReturn created here) containing six
        output nodes, in order: Frame, Matrix, Matrix, Scalar, Matrix, Frame
    """
    context = dataTrain.sds_context

    named_inputs = dict(
        dataTrain=dataTrain,
        primitives=primitives,
        parameters=parameters,
        evaluationFunc=evaluationFunc,
        evalFunHp=evalFunHp,
    )
    named_inputs.update(kwargs)

    # Node classes of the six returned values, in return order.
    result_types = (Frame, Matrix, Matrix, Scalar, Matrix, Frame)
    results = tuple(node_type(context, '') for node_type in result_types)

    op = MultiReturn(context,
                     'topk_cleaning',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #11
0
def dbscan(X: Matrix,
           **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'dbscan' operation.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for
        every node created here
    :param eps: Maximum distance between two points for one to be considered reachable for the other.
    :param minPts: Number of points in a neighborhood for a point to be considered as a core point
    :return: 'OperationNode' (the MultiReturn created here) containing three
        output nodes, in order: Matrix, Matrix, Scalar
    """
    context = X.sds_context

    named_inputs = dict(X=X)
    named_inputs.update(kwargs)

    # One placeholder node per value returned by the operation.
    results = (
        Matrix(context, ''),
        Matrix(context, ''),
        Scalar(context, ''),
    )

    op = MultiReturn(context,
                     'dbscan',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op
Пример #12
0
def mice(X: Matrix, cMask: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-return operation node for the 'mice' operation.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for
        every node created here
    :param cMask: Forwarded as named input 'cMask'
    :param iter: Number of iteration for multiple imputations
    :param threshold: confidence value [0, 1] for robust imputation; values
        will only be imputed if the value has probability greater than
        threshold. NOTE(review): the original text was split across bogus
        parameter entries and also mentioned "only categorical data" --
        confirm the exact wording against the operation's own documentation.
    :param verbose: Boolean value.
    :return: 'OperationNode' (the MultiReturn created here) containing five
        output nodes, in order: Matrix, Matrix, Scalar, Frame, List.
        NOTE(review): the original return description was garbled and is not
        reproduced here; confirm the meaning of each output.
    """
    context = X.sds_context
    # Required inputs first, followed by any optional named arguments.
    named_inputs = {'X': X, 'cMask': cMask, **kwargs}

    # Node classes of the five returned values, in return order.
    result_types = (Matrix, Matrix, Scalar, Frame, List)
    results = tuple(node_type(context, '') for node_type in result_types)

    op = MultiReturn(context,
                     'mice',
                     list(results),
                     named_input_nodes=named_inputs)

    # Each output node points back at the shared multi-return operation.
    for node in results:
        node._unnamed_input_nodes = [op]

    return op