Example #1
    def create_exprs(_input_col, _buckets, _func):
        def count_exprs(_exprs):
            return F.sum(F.when(_exprs, 1).otherwise(0))

        _exprs = []
        for i, b in enumerate(_buckets):
            lower = b["lower"]
            upper = b["upper"]

            if is_numeric(lower):
                lower = round(lower, 2)

            if is_numeric(upper):
                upper = round(upper, 2)

            if len(_buckets) == 1:
                # A single bucket degenerates into an exact match
                count = count_exprs(_func(_input_col) == lower)
            elif i == len(_buckets) - 1:
                # Close the last bucket on both ends so the maximum value
                # still falls into a bucket
                count = count_exprs((_func(_input_col) >= lower)
                                    & (_func(_input_col) <= upper))
            else:
                # Interior buckets are half-open: [lower, upper)
                count = count_exprs((_func(_input_col) >= lower)
                                    & (_func(_input_col) < upper))

            info = F.create_map(F.lit("count"), count.cast("int"),
                                F.lit("lower"), F.lit(lower), F.lit("upper"),
                                F.lit(upper)).alias("hist_agg" + "_" +
                                                    _input_col + "_" +
                                                    str(b["bucket"]))
            _exprs.append(info)
        _exprs = F.array(*_exprs).alias("hist" + _input_col)
        return _exprs
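
`create_exprs` returns a single array column whose elements are per-bucket maps holding a count and the bucket bounds. A minimal usage sketch, not part of the original source: it assumes `create_exprs` and the `is_numeric` helper are in scope, that `F` is `pyspark.sql.functions`, and that each bucket is a dict with "lower", "upper" and "bucket" keys, as the function expects.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.5,), (4.0,), (9.9,)], ["price"])

# Two adjacent buckets covering [0, 5) and [5, 10]
buckets = [
    {"lower": 0.0, "upper": 5.0, "bucket": 0},
    {"lower": 5.0, "upper": 10.0, "bucket": 1},
]

# F.col resolves the column name into a Column expression
hist = create_exprs("price", buckets, F.col)

# Produces one "histprice" array column with a map per bucket
df.agg(hist).show(truncate=False)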
Example #2
    def __init__(self, df, col_name, threshold, relative_error=RELATIVE_ERROR):
        """

        :param df:
        :param col_name:
        :param threshold:
        """
        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_numeric(threshold):
            raise TypeError("Numeric expected")

        if not is_numeric(relative_error):
            raise TypeError("Numeric expected")

        self.df = df
        self.threshold = threshold
        self.relative_error = relative_error

        self.col_name = one_list_to_val(parse_columns(df, col_name))
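
A hypothetical usage sketch follows; the class name `Outliers` stands in for the enclosing class (the snippet above only shows its `__init__`), and `RELATIVE_ERROR` is the module-level default referenced in the signature.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (100,)], ["value"])

# relative_error falls back to the RELATIVE_ERROR default
detector = Outliers(df, "value", threshold=3)

# Each guard raises early with a clear message
try:
    Outliers("not a dataframe", "value", threshold=3)
except TypeError as e:
    print(e)  # Spark Dataframe expected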
Example #3
    def __init__(self, df, col_name, threshold):
        """

        :param df: Spark Dataframe
        :param col_name:
        """

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        self.df = df

        if not is_numeric(threshold):
            raise TypeError("Numeric expected")
        self.threshold = threshold

        self.col_name = one_list_to_val(parse_columns(df, col_name))
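
Both constructors finish with the same `one_list_to_val(parse_columns(...))` idiom. A rough sketch of what those helpers do, using simplified stand-ins (the real Optimus implementations accept more input shapes, such as indices and regexes):

def parse_columns(df, cols):
    # Normalize a name or list of names into a validated list
    cols = [cols] if isinstance(cols, str) else list(cols)
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError("Columns not found: {}".format(missing))
    return cols

def one_list_to_val(val):
    # Collapse a one-element list to its bare value
    return val[0] if isinstance(val, list) and len(val) == 1 else val

# one_list_to_val(parse_columns(df, "value")) -> "value"
# one_list_to_val(parse_columns(df, ["a", "b"])) -> ["a", "b"]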
Example #4
def parse_col_names_funcs_to_keys(data):
    """
    Helper function that returns a formatted json with function:value pairs inside columns. Transforms

    {'max_antiguedad_anos': 15,
    'max_m2_superficie_construida': 1800000,
    'min_antiguedad_anos': 2,
    'min_m2_superficie_construida': 20}

    into

    {'m2_superficie_construida': {'min': 20, 'max': 1800000}, 'antiguedad_anos': {'min': 2, 'max': 15}}

    :param data: json data
    :return: json
    """
    from optimus.helpers.check import is_numeric, is_nan
    functions_array = [
        "range", "count_uniques", "count_na", "min", "max", "stddev",
        "kurtosis", "mean", "skewness", "sum", "variance",
        "approx_count_distinct", "countDistinct", "na", "zeros", "percentile",
        "count", "hist"
    ]

    _result = {}
    for k, v in data[0].items():
        for f in functions_array:

            temp_func_name = f + "_"
            if k.startswith(temp_func_name):
                _col_name = k[len(temp_func_name):]
                if is_nan(v):
                    logger.print(
                        "'{FUNCTION}' function in '{COL_NAME}' column is returning 'nan'. Is that what you expected? "
                        "It seems that '{COL_NAME}' has 'nan' values".format(
                            FUNCTION=f, COL_NAME=_col_name))
                # If the value is numeric, keep only 5 decimals
                elif is_numeric(v):
                    v = round(v, 5)
                _result.setdefault(_col_name, {})[f] = v

                break

    return _result
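
Driving the helper with the docstring's own sample input shows the reshaping (this assumes the `optimus` check helpers and `logger` are importable):

data = [{
    "max_antiguedad_anos": 15,
    "max_m2_superficie_construida": 1800000,
    "min_antiguedad_anos": 2,
    "min_m2_superficie_construida": 20,
}]

print(parse_col_names_funcs_to_keys(data))
# {'antiguedad_anos': {'max': 15, 'min': 2},
#  'm2_superficie_construida': {'max': 1800000, 'min': 20}}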
Example #5
    def create(self,
               obj,
               method,
               suffix=None,
               output="df",
               additional_method=None,
               *args,
               **kwargs):
        """
        This is a helper function that output python tests for Spark Dataframes.
        :param obj: Object to be tested
        :param method: Method to be tested
        :param suffix: The test name will be create using the method param. suffix will add a string in case you want
        to customize the test name.
        :param output: can be a 'df' or a 'json'
        :param additional_method:
        :param args: Arguments to be used in the method
        :param kwargs: Keyword arguments to be used in the functions
        :return:
        """

        buffer = []

        def add_buffer(value):
            buffer.append("\t" + value)

        # Create name
        name = []

        if method is not None:
            name.append(method.replace(".", "_"))

        if additional_method is not None:
            name.append(additional_method)

        if suffix is not None:
            name.append(suffix)

        test_name = "_".join(name)

        if output == "dict":
            func_test_name = "test_" + test_name + "(self)"
        else:
            add_buffer("@staticmethod\n")
            func_test_name = "test_" + test_name + "()"

        print("Creating {test} test function...".format(test=func_test_name))
        logger.print(func_test_name)

        filename = test_name + ".test"

        add_buffer("def " + func_test_name + ":\n")

        source = "source_df"
        if obj is None:
            # Use the main df
            df_func = self.df
        elif isinstance(obj, pyspark.sql.dataframe.DataFrame):
            source_df = "\tsource_df = op.create.df(" + obj.export() + ")\n"
            df_func = obj
            add_buffer(source_df)
        else:
            source = get_var_name(obj)
            df_func = obj

        # Process simple arguments
        _args = []
        for v in args:
            if is_str(v):
                _args.append("'" + v + "'")
            elif is_numeric(v):
                _args.append(str(v))

            elif is_list(v):
                if is_list_of_strings(v):
                    lst = ["'" + x + "'" for x in v]
                elif is_list_of_numeric(v) or is_list_of_tuples(v):
                    lst = [str(x) for x in v]
                else:
                    # Fallback so lst is always defined
                    lst = [str(x) for x in v]

                _args.append('[' + ','.join(lst) + ']')
            elif is_function(v):
                _args.append(v.__qualname__)

            else:
                _args.append(get_var_name(v))

        _args = ','.join(_args)
        _kwargs = []

        # Process keywords arguments
        for k, v in kwargs.items():
            if is_str(v):
                v = "'" + v + "'"
            _kwargs.append(k + "=" + str(v))

        # Separator if we have positional and keyword arguments
        separator = ""
        if (not is_list_empty(args)) and (not is_list_empty(kwargs)):
            separator = ","

        if method is None:
            add_buffer("\tactual_df = source_df\n")
        else:
            am = ""
            if additional_method:
                am = "." + additional_method + "()"

            add_buffer("\tactual_df =" + source + "." + method + "(" + _args +
                       separator + ','.join(_kwargs) + ")" + am + "\n")

        # Apply function to the dataframe
        if method is None:
            df_result = self.op.create.df(*args, **kwargs)
        else:
            # Here we construct the method to be applied to the source object
            for f in method.split("."):
                df_func = getattr(df_func, f)

            df_result = df_func(*args, **kwargs)

        # Additional Methods
        if additional_method is not None:
            df_result = getattr(df_result, additional_method)()

        if output == "df":

            df_result.table()
            expected = "\texpected_df = op.create.df(" + df_result.export(
            ) + ")\n"
        elif output == "json":
            print(df_result)

            if is_str(df_result):
                df_result = "'" + df_result + "'"
            else:
                df_result = str(df_result)
            add_buffer("\tactual_df =json_enconding(actual_df)\n")

            expected = "\texpected_value =json_enconding(" + df_result + ")\n"
        elif output == "dict":
            print(df_result)

            expected = "\texpected_value =" + df_result + "\n"
        else:
            expected = "\t\n"

        add_buffer(expected)

        # Output
        if output == "df":
            add_buffer(
                "\tassert (expected_df.collect() == actual_df.collect())\n")
        elif output == "json":
            add_buffer("\tassert(expected_value == actual_df)\n")
        elif output == "dict":
            add_buffer(
                "\tself.assertDictEqual(deep_sort(expected_value),  deep_sort(actual_df))\n"
            )

        filename = os.path.join(self.path, filename)
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        # Write the generated test to disk
        with open(filename, 'w', encoding='utf-8') as test_file:
            for b in buffer:
                test_file.write(b)
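
A hypothetical invocation, assuming `t` is an instance of the enclosing test-creator class initialized elsewhere with a dataframe and an output path; positional values after `additional_method` are forwarded as `*args`:

# Writes cols_min.test containing a test_cols_min() function
t.create(None, "cols.min", None, "json", None, "height")

# Roughly the file this produces (the expected value is illustrative):
#
#     @staticmethod
#     def test_cols_min():
#         actual_df = source_df.cols.min('height')
#         actual_df = json_enconding(actual_df)
#         expected_value = json_enconding(1.88)
#         assert(expected_value == actual_df)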