Example #1
    def append(rows):
        """
        Append rows at the end of a dataframe
        :param rows: List of tuples, a Spark DataFrame or a list of Spark DataFrames to be appended
        :return: Spark DataFrame
        """
        df = self

        if is_list_of_tuples(rows):
            # Name the new columns by position so the union lines up with the source dataframe
            columns = [str(i) for i in range(df.cols.count())]
            new_row = op.Create.df(columns, rows)
            df_result = df.union(new_row)

        elif is_list_of_dataframes(rows) or is_dataframe(rows):
            row = val_to_list(rows)
            row.insert(0, df)
            df_result = append_df(row, like="rows")
        else:
            RaiseIt.type_error(rows, ["list of tuples", "list of dataframes"])

        df_result = df_result.preserve_meta(self, Actions.NEST.value,
                                            df.cols.names())

        return df_result
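
A minimal usage sketch for Example #1 (hedged: it assumes the optimuspyspark package exposes `Optimus` at the top level, that `op.create.df` builds a dataframe from column names and rows as the snippet itself does, and that `append` is attached to Spark dataframes by the library, as the `self` reference suggests):

from optimus import Optimus

op = Optimus()
source_df = op.create.df(["name", "age"], [("Alice", 30), ("Bob", 25)])

# Append new rows passed as a list of tuples
result_df = source_df.append([("Carol", 41)])

# Append another dataframe (row-wise union)
other_df = op.create.df(["name", "age"], [("Dan", 19)])
result_df = source_df.append(other_df)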
Example #2
    def data_frame(cols=None, rows=None, infer_schema=True, pdf=None):
        """
        Helper to create a Spark dataframe
        :param cols: List of tuples with the column name, data type and a flag to accept nulls
        :param rows: List of tuples with the same length and types as cols
        :param infer_schema: Try to infer the data type of each column from the first row
        :param pdf: A pandas dataframe
        :return: Spark DataFrame
        """
        if is_(pdf, pd.DataFrame):
            df = Spark.instance.spark.createDataFrame(pdf)
        else:

            specs = []
            # Process the rows
            if not is_list_of_tuples(rows):
                rows = [(i, ) for i in rows]

            # Process the columns
            for c, r in zip(cols, rows[0]):
                # Get columns name

                if is_one_element(c):
                    col_name = c

                    if infer_schema is True:
                        var_type = infer(r)
                    else:
                        var_type = StringType()
                    nullable = True

                elif is_tuple(c):

                    # Get columns data type
                    col_name = c[0]
                    var_type = parse_spark_class_dtypes(c[1])

                    count = len(c)
                    if count == 2:
                        nullable = True
                    elif count == 3:
                        nullable = c[2]

                # If the tuple does not have a third element, nullable defaults to True so the column accepts nulls
                specs.append([col_name, var_type, nullable])

            struct_fields = list(map(lambda x: StructField(*x), specs))

            df = Spark.instance.spark.createDataFrame(
                rows, StructType(struct_fields))
            df = df.columns_meta(df.cols.names())
        return df
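
A usage sketch of the three column-spec forms the helper above accepts (hedged: it assumes the helper is exposed as `op.create.df`, as the other examples call it, and that `parse_spark_class_dtypes` accepts short names such as "str" and "int"):

# Plain column names: the data type of each column is inferred from the first row
df1 = op.create.df(cols=["name", "age"], rows=[("Alice", 30), ("Bob", 25)])

# (name, dtype) tuples, optionally with a third nullable flag
df2 = op.create.df(cols=[("name", "str"), ("age", "int", True)],
                   rows=[("Alice", 30), ("Bob", 25)])

# Build directly from a pandas dataframe
import pandas as pd
df3 = op.create.df(pdf=pd.DataFrame({"name": ["Alice"], "age": [30]}))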
Example #3
    def append(row):
        """
        Append a row at the end of a dataframe
        :param row: List of values or tuples to be appended
        :return: Spark DataFrame
        """
        df = self
        columns = [str(i) for i in range(df.cols.count())]
        if not is_list_of_tuples(row):
            row = [tuple(row)]

        new_row = op.Create.df(columns, row)
        return df.union(new_row)
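
Unlike Example #1, this variant also wraps a plain list of values into a single tuple, so one row of loose values can be appended directly (sketch, reusing the `source_df` assumed above):

# A plain list of values becomes one new row
df_one_more = source_df.append(["Carol", 41])

# A list of tuples appends several rows at once
df_two_more = source_df.append([("Dan", 19), ("Eve", 52)])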
Example #4
def validate_columns_names(df, col_names, index=0):
    """
    Check if a string or list of strings are valid dataframe columns
    :param df: Dataframe to be analyzed
    :param col_names: Column names to be checked
    :param index: When col_names is a list of tuples, position of the tuple element that holds the column name
    :return: True if all the columns are present
    """

    columns = val_to_list(col_names)

    if is_list_of_tuples(columns):
        columns = [c[index] for c in columns]

    # Remove duplicates in the list
    if is_list_of_strings(columns):
        columns = OrderedSet(columns)

    check_for_missing_columns(df, columns)

    return True
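
A short usage sketch (hedged: it assumes `check_for_missing_columns` raises when a column is absent; the snippet only shows it being called):

# Plain column names: returns True, or raises if any column is missing
validate_columns_names(df, ["name", "age"])

# Column names packed inside tuples; index selects which tuple element is the name
validate_columns_names(df, [("name", "str"), ("age", "int")], index=0)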
Example #5
def parse_columns(df,
                  cols_args,
                  get_args=False,
                  is_regex=None,
                  filter_by_column_dtypes=None,
                  accepts_missing_cols=False,
                  invert=False):
    """
    Return a list of columns after checking that they exist in the dataframe.
    Accepts '*' as a parameter, in which case a list of all the columns in the dataframe is returned.
    Also accepts a regex.
    If a list of tuples is passed, the first element of each tuple is taken as the column name and the rest as
    params. These params can be used to create custom transformation functions. You can find an example in
    cols().cast().
    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts '*' as a param to return all the columns in the dataframe
    :param get_args: If True, return the column names together with their params
    :param is_regex: Use True if cols_args is a regex
    :param filter_by_column_dtypes: Data type(s) used to filter the resulting column list
    :param accepts_missing_cols: If True, do not check that the columns exist in the dataframe
    :param invert: Invert the final selection. For example, if you want to select the non-integer columns

    :return: A list of column string names
    """

    attrs = None

    # If a regex was passed, select the columns that match it
    if is_regex is True:
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))

    # If the columns value is '*' or None, get all the dataframe columns
    elif cols_args == "*" or cols_args is None:
        cols = df.columns

    # In case we have a list of tuples, the first element of each tuple is taken as the column name
    # and the rest as params. We can use the params in a custom function as follows:
    # def func(attrs):  # attrs will be (1, 2) and (3, 4)
    #     return attrs[0] + 1
    # df.cols().apply([('col_1', 1, 2), ('cols_2', 3, 4)], func)

    # Verify if we have a list with tuples
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # The first element of each tuple is the column name, the rest are the params
        cols = [i[0] for i in cols_args]
        attrs = [i[1:] for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    if filter_by_column_dtypes is not None:
        filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)

    columns_residual = None

    # If necessary filter the columns by data type
    if filter_by_column_dtypes:
        # Get columns for every data type

        columns_filtered = filter_col_name_by_dtypes(df,
                                                     filter_by_column_dtypes)

        # Intersect the columns filtered per data type from the whole dataframe with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns did not match the filtered data types
        columns_residual = list(
            OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols

    # Return cols, or cols and params
    cols_params = []

    if invert:
        final_columns = list(OrderedSet(cols) - OrderedSet(final_columns))

    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    if columns_residual:
        logger.print("%s %s %s", ",".join(escape_columns(columns_residual)),
                     "column(s) was not processed because is/are not",
                     ",".join(filter_by_column_dtypes))

    return cols_params
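
A sketch of the main call patterns, derived from the branches above (hedged: the dtype names accepted by `filter_by_column_dtypes` depend on `filter_col_name_by_dtypes`, which is not shown):

# All the columns in the dataframe
all_cols = parse_columns(df, "*")

# Only the columns whose names match a regex
name_cols = parse_columns(df, ["^name_.*"], is_regex=True)

# Tuples carry extra params; get_args=True returns the params alongside the names
cols, attrs = parse_columns(df, [("col_1", 1, 2), ("col_2", 3, 4)], get_args=True)

# Keep only the columns of a given data type
str_cols = parse_columns(df, "*", filter_by_column_dtypes="string")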
Example #6
    def create(self,
               obj,
               method,
               suffix=None,
               output="df",
               additional_method=None,
               *args,
               **kwargs):
        """
        This is a helper function that outputs Python tests for Spark DataFrames.
        :param obj: Object to be tested
        :param method: Method to be tested
        :param suffix: The test name is created from the method param; suffix appends a string in case you want
        to customize the test name.
        :param output: Can be 'df', 'json' or 'dict'
        :param additional_method: Extra method chained after the tested method
        :param args: Arguments to be used in the method
        :param kwargs: Keyword arguments to be used in the method
        :return: None. The generated test is written to a file under self.path
        """

        buffer = []

        def add_buffer(value):
            buffer.append("\t" + value)

        # Create name
        name = []

        if method is not None:
            name.append(method.replace(".", "_"))

        if additional_method is not None:
            name.append(additional_method)

        if suffix is not None:
            name.append(suffix)

        test_name = "_".join(name)

        func_test_name = "test_" + test_name + "()"

        print("Creating {test} test function...".format(test=func_test_name))
        logger.print(func_test_name)

        if not output == "dict":
            add_buffer("@staticmethod\n")
            func_test_name = "test_" + test_name + "()"
        else:
            func_test_name = "test_" + test_name + "(self)"

        filename = test_name + ".test"

        add_buffer("def " + func_test_name + ":\n")

        source = "source_df"
        if obj is None:
            # Use the main df
            df_func = self.df
        elif isinstance(obj, pyspark.sql.dataframe.DataFrame):

            source_df = "\tsource_df=op.create.df(" + obj.export() + ")\n"
            df_func = obj
            add_buffer(source_df)
        else:
            source = get_var_name(obj)
            df_func = obj

        # Process simple arguments
        _args = []
        for v in args:
            if is_str(v):
                _args.append("'" + v + "'")
            elif is_numeric(v):
                _args.append(str(v))

            elif is_list(v):
                if is_list_of_strings(v):
                    lst = ["'" + x + "'" for x in v]
                elif is_list_of_numeric(v):
                    lst = [str(x) for x in v]
                elif is_list_of_tuples(v):
                    lst = [str(x) for x in v]

                _args.append('[' + ','.join(lst) + ']')
            elif is_function(v):
                _args.append(v.__qualname__)

            else:
                _args.append(get_var_name(v))

            # else:
            #     import marshal
            #     code_string = marshal.dumps(v.__code__)
            #     add_buffer("\tfunction = '" + code_string + "'\n")
            # import marshal, types
            #
            # code = marshal.loads(code_string)
            # func = types.FunctionType(code, globals(), "some_func_name")

        _args = ','.join(_args)
        _kwargs = []

        # print(_args)
        # Process keywords arguments
        for k, v in kwargs.items():
            if is_str(v):
                v = "'" + v + "'"
            _kwargs.append(k + "=" + str(v))

        # Separator if we have positional and keyword arguments
        separator = ""
        if (not is_list_empty(args)) & (not is_list_empty(kwargs)):
            separator = ","

        if method is None:
            add_buffer("\tactual_df = source_df\n")
        else:
            am = ""
            if additional_method:
                am = "." + additional_method + "()"

            add_buffer("\tactual_df =" + source + "." + method + "(" + _args +
                       separator + ','.join(_kwargs) + ")" + am + "\n")

        # Apply function to the dataframe
        if method is None:
            df_result = self.op.create.df(*args, **kwargs)
        else:
            # Here we construct the method to be applied to the source object
            for f in method.split("."):
                df_func = getattr(df_func, f)

            df_result = df_func(*args, **kwargs)

        # Additional Methods
        if additional_method is not None:
            df_result = getattr(df_result, additional_method)()

        if output == "df":

            df_result.table()
            expected = "\texpected_df = op.create.df(" + df_result.export(
            ) + ")\n"
        elif output == "json":
            print(df_result)

            if is_str(df_result):
                df_result = "'" + df_result + "'"
            else:
                df_result = str(df_result)
            add_buffer("\tactual_df =json_enconding(actual_df)\n")

            expected = "\texpected_value =json_enconding(" + df_result + ")\n"
        elif output == "dict":
            print(df_result)

            expected = "\texpected_value =" + df_result + "\n"
        else:
            expected = "\t\n"

        add_buffer(expected)

        # Output
        if output == "df":
            add_buffer(
                "\tassert (expected_df.collect() == actual_df.collect())\n")
        elif output == "json":
            add_buffer("\tassert(expected_value == actual_df)\n")
        elif output == "dict":
            add_buffer(
                "\tself.assertDictEqual(deep_sort(expected_value),  deep_sort(actual_df))\n"
            )

        filename = self.path + "//" + filename
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        # Write the generated test to disk
        with open(filename, 'w', encoding='utf-8') as test_file:
            for b in buffer:
                test_file.write(b)
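
A hypothetical driver for the generator above (hedged: `TestCreator` and its constructor are assumptions; only the `create(obj, method, suffix, output, additional_method, *args, **kwargs)` signature comes from the snippet, which also relies on `self.path`, `self.df` and `self.op`):

t = TestCreator(op=op, df=source_df, path="tests")  # assumed wrapper holding op, df and the output path

# Generates test_cols_upper_all(), asserting expected_df.collect() == actual_df.collect()
t.create(source_df, "cols.upper", "all", "df", None, "*")

# Generates test_cols_min_age(), comparing json-encoded values
t.create(source_df, "cols.min", "age", "json", None, "age")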