Example #1
File: outliers.py  Project: polya20/Optimus
    def z_score(df, columns, threshold=None):
        """
        Delete outlier using z score
        :param df:
        :param columns:
        :param threshold:
        :return:
        """

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_int(threshold):
            raise TypeError("Integer expected")

        columns = parse_columns(df, columns)

        for c in columns:
            # the z-score column is always named "z_col_" plus the original column name
            z_col = "z_col_" + c

            df = df.cols.z_score(c) \
                .rows.drop(F.col(z_col) > threshold) \
                .cols.drop(z_col)

        return df
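
For comparison, the same z-score filter can be written with plain PySpark, without the Optimus cols/rows accessors. This is a minimal sketch, assuming a hypothetical DataFrame df with a numeric column named "value":

from pyspark.sql import functions as F

def drop_z_score_outliers(df, column, threshold):
    # Column mean and standard deviation, collected to the driver
    stats = df.agg(F.mean(column).alias("mean"),
                   F.stddev(column).alias("std")).first()
    # Absolute z-score of each row for this column
    z = F.abs((F.col(column) - stats["mean"]) / stats["std"])
    # Keep only rows whose |z-score| does not exceed the threshold
    return df.filter(z <= threshold)

# clean_df = drop_z_score_outliers(df, "value", threshold=3)
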
Example #2
    def rename(columns_old_new=None, func=None):
        """"
        Changes the name of a column(s) dataFrame.
        :param columns_old_new: List of tuples. Each tuple has de following form: (oldColumnName, newColumnName).
        :param func: can be lower, upper or any string transformation function
        """

        df = self

        # Apply a transformation function
        if is_function(func):
            exprs = [F.col(c).alias(func(c)) for c in df.columns]
            df = df.select(exprs)

        elif is_list_of_tuples(columns_old_new):
            # Check that the first element in each tuple is a valid column name or index

            validate_columns_names(self, columns_old_new)
            for c in columns_old_new:
                old_col_name = c[0]
                if is_str(old_col_name):
                    df = df.withColumnRenamed(old_col_name, c[1])
                elif is_int(old_col_name):
                    df = df.withColumnRenamed(self.schema.names[old_col_name], c[1])

        return df
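
A minimal sketch of the two rename paths on a plain Spark DataFrame (df and the column names below are hypothetical): the function path rebuilds the projection with aliased columns, while the tuple path calls withColumnRenamed once per pair.

from pyspark.sql import functions as F

# Function path: apply a string transformation to every column name
df = df.select([F.col(c).alias(str.lower(c)) for c in df.columns])

# Tuple path: rename specific columns given (old_name, new_name) pairs
for old_name, new_name in [("firstName", "first_name"), ("lastName", "last_name")]:
    df = df.withColumnRenamed(old_name, new_name)
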
Example #3
    def _mad(self, action):
        """

               :type action:
               :return:
               """

        df = self.df
        columns = self.columns
        threshold = self.threshold

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_int(threshold):
            raise TypeError("Integer expected")

        columns = parse_columns(df, columns)
        for c in columns:
            mad_value = df.cols.mad(c, more=True)
            lower_bound = mad_value["median"] - threshold * mad_value["mad"]
            upper_bound = mad_value["median"] + threshold * mad_value["mad"]

            if action == "select":
                df = df.rows.select((F.col(c) > upper_bound)
                                    | (F.col(c) < lower_bound))
            elif action == "drop":
                df = df.rows.drop((F.col(c) > upper_bound)
                                  | (F.col(c) < lower_bound))
        return df
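
The cols.mad(c, more=True) call returns the column's median and MAD. A rough plain-PySpark equivalent of the bounds computation, using approxQuantile for the medians, might look like this sketch (the column name and threshold are hypothetical):

from pyspark.sql import functions as F

def mad_bounds(df, column, threshold):
    # Approximate median of the column
    median = df.approxQuantile(column, [0.5], 0.01)[0]
    # MAD: median of the absolute deviations from the median
    deviations = df.select(F.abs(F.col(column) - median).alias("dev"))
    mad = deviations.approxQuantile("dev", [0.5], 0.01)[0]
    return median - threshold * mad, median + threshold * mad

# lower, upper = mad_bounds(df, "value", threshold=3)
# outliers = df.filter((F.col("value") > upper) | (F.col("value") < lower))
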
Example #4
    def length_error(var1, var2):
        """
        Raise a ValueError exception
        :param var1:
        :param var2:
        :return:
        """
        from optimus.helpers.functions import get_var_name

        if is_int(var2):
            length_var2 = str(var2)
        else:
            length_var2 = str(len(var2))

        raise ValueError(
            "'{var2_name}' must be length '{var1_length}', received '{var2_length}'"
            .format(var2_name=get_var_name(var2),
                    var1_length=str(len(var1)),
                    var2_length=length_var2))
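
A typical call site is a guard that checks two sequences match in length before pairing them up; the helper below is only a hypothetical illustration of how length_error would be invoked:

def validate_same_length(columns, new_names):
    # Hypothetical guard: both lists must have the same number of elements
    if len(columns) != len(new_names):
        length_error(columns, new_names)
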
Example #5
File: outliers.py  Project: polya20/Optimus
    def mad(df, columns, threshold=None):
        """
        Delete outlier using mad
        :param df:
        :param columns:
        :param threshold:
        :return:
        """

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_int(threshold):
            raise TypeError("Integer expected")

        columns = parse_columns(df, columns)
        for c in columns:
            mad_value = df.cols.mad(c, more=True)
            lower_bound = mad_value["median"] - threshold * mad_value["mad"]
            upper_bound = mad_value["median"] + threshold * mad_value["mad"]

            df = df.rows.drop((F.col(c) > upper_bound)
                              | (F.col(c) < lower_bound))
        return df
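
As a quick numeric check of the bounds, with no Spark involved: for the hypothetical sample below the median is 4 and the MAD is 1, so with threshold=2 anything outside [2, 6] counts as an outlier.

from statistics import median

values = [1, 3, 4, 4, 5, 100]                              # hypothetical sample
med = median(values)                                       # 4
mad = median(abs(x - med) for x in values)                 # 1
lower, upper = med - 2 * mad, med + 2 * mad                # (2, 6)
outliers = [x for x in values if x < lower or x > upper]   # [1, 100]
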
Example #6
    def unnest(columns, mark=None, splits=None, index=None):
        """
        Split an array or string in different columns
        :param columns: Columns to be un-nested
        :param mark: If column is string.
        :param splits: Number of rows to un-nested. Because we can not know beforehand the number of splits
        :param index:
        :return: Spark DataFrame
        """

        # If the number of splits was not given, try to infer it from the first element
        infer_splits = None
        if splits is None:
            infer_splits = True

        columns = parse_columns(self, columns)

        df = self

        for col_name in columns:
            # Determine the column data type to decide how to unnest it

            col_dtype = self.schema[col_name].dataType

            # Array
            if is_(col_dtype, ArrayType):

                expr = F.col(col_name)
                # Try to infer the array length using the first row
                if infer_splits is True:
                    splits = len(self.cols.cell(col_name))

                for i in builtins.range(splits):
                    df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

            # String
            elif is_(col_dtype, StringType):
                expr = F.split(F.col(col_name), mark)
                # Try to infer the array length using the first row
                if infer_splits is True:
                    splits = len(self.cols.cell(col_name).split(mark))

                if is_int(index):
                    r = builtins.range(index, index + 1)
                else:
                    r = builtins.range(0, splits)

                for i in r:
                    df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

            # Vector
            elif is_(col_dtype, VectorUDT):

                def _unnest(row):
                    _dict = row.asDict()

                    # Get the column we want to unnest
                    _list = _dict[col_name]

                    # Ensure values are Python floats, not numpy floats
                    if index is None:
                        _list = [float(x) for x in _list]
                    else:
                        # Keep only the requested vector position
                        _list = [float(_list[index])]

                    return row + tuple(_list)

                df = df.rdd.map(_unnest).toDF(df.columns)

        return df
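
The string branch reduces to F.split plus getItem. A standalone sketch of the same idea (the DataFrame df, column name "full_name", separator and number of pieces are hypothetical):

from pyspark.sql import functions as F

# Split the string column on a space and expose each piece as its own column
parts = F.split(F.col("full_name"), " ")
for i in range(2):                     # assuming two pieces per value
    df = df.withColumn("full_name_" + str(i), parts.getItem(i))
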
Example #7
    def unnest(columns, mark=None, n=None, index=None):
        """
        Split array or string in different columns
        :param columns: Columns to be un-nested
        :param mark: is column is string
        :param n: Number of rows to un-nested
        :param index:
        :return: Spark DataFrame
        """

        # If the number of splits was not given, try to infer it from the first element
        infer_n = None
        if n is None:
            infer_n = True

        columns = parse_columns(self, columns)

        df = self

        for col_name in columns:
            # Determine the column data type to decide how to unnest it
            expr = None

            col_dtype = self.schema[col_name].dataType

            # Array
            if is_(col_dtype, ArrayType):

                expr = F.col(col_name)
                # Try to infer the array length using the first row
                if infer_n is True:
                    n = len(self.cols.cell(col_name))

                for i in builtins.range(n):
                    df = df.withColumn(col_name + "_" + str(i),
                                       expr.getItem(i))

            # String
            elif is_(col_dtype, StringType):
                expr = F.split(F.col(col_name), mark)
                # Try to infer the array length using the first row
                if infer_n is True:
                    n = len(self.cols.cell(col_name).split(mark))

                if is_int(index):
                    r = builtins.range(index, index + 1)
                else:
                    r = builtins.range(0, n)

                for i in r:
                    df = df.withColumn(col_name + "_" + str(i),
                                       expr.getItem(i))

            # Vector
            elif is_(col_dtype, VectorUDT):

                def extract(row):
                    return row + tuple(row.vector.toArray().tolist())

                df = df.rdd.map(extract).toDF(df.columns)

        return df
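
The vector branch goes through the RDD because Spark SQL has no column expression for indexing an ml Vector. A rough sketch of the same idea for a hypothetical DataFrame with a dense vector column named "features":

def extract(row):
    # Append every vector component to the row as an extra Python float
    return tuple(row) + tuple(row["features"].toArray().tolist())

n = len(df.first()["features"])
new_cols = df.columns + ["features_" + str(i) for i in range(n)]
df = df.rdd.map(extract).toDF(new_cols)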