예제 #1
0
    def replace(columns, search_and_replace=None, value=None, regex=None):
        """
        Replace a value or a list of values by a specified string
        :param columns: '*', list of columns names or a single column name.
        Only the columns returned by parse_columns with
        filter_by_column_dtypes="string" are processed.
        :param search_and_replace: values to look at to be replaced. Accepts a
        list of (search, replace) tuples, a list of values or a single value.
        When regex is truthy it is used unchanged as the pattern.
        :param value: new value to replace the old one. Not used when
        search_and_replace is a list of tuples and regex is falsy.
        :param regex: if truthy, F.regexp_replace is used instead of the
        dataframe replace method.
        :return: the dataframe with the replacements applied
        """
        search_values = None
        new_values = None

        if regex:
            # Regex mode takes the pattern and the replacement as they come,
            # so the list/tuple classification below is not needed.
            search_values = search_and_replace
            new_values = value

        elif is_list_of_tuples(search_and_replace):
            # [(old, new), ...] -> [old, ...] and [new, ...]
            params = list(zip(*search_and_replace))
            search_values = list(params[0])
            new_values = list(params[1])

        elif is_list(search_and_replace):
            search_values = search_and_replace
            new_values = value

        elif is_one_element(search_and_replace):
            search_values = val_to_list(search_and_replace)
            new_values = value

        # if regex or normal replace we use regexp or replace functions
        # TODO check if .contains can be used instead of regexp
        def func_regex(_df, _col_name, _search, _replace):
            # Write to the column received as argument; do not rely on the
            # loop variable of the enclosing function.
            return _df.withColumn(
                _col_name, F.regexp_replace(_col_name, _search, _replace))

        def func_replace(_df, _col_name, _search, _replace):
            # Convert every searched value with the PYTHON_TYPES_ entry of the
            # column data type before handing it to the dataframe replace.
            data_type = self.cols.dtypes(_col_name)
            _search = [PYTHON_TYPES_[data_type](s) for s in _search]
            return _df.replace(_search, _replace, _col_name)

        func = func_regex if regex else func_replace

        df = self

        columns = parse_columns(self,
                                columns,
                                filter_by_column_dtypes="string")
        for col_name in columns:
            df = func(df, col_name, search_values, new_values)

        return df
예제 #2
0
    def data_frame(cols=None, rows=None, infer_schema=True, pdf=None):
        """
        Helper to create a Spark dataframe:
        :param cols: List of column names, or of tuples with name, data type
        and an optional third item used as the flag to accept null (True when
        it is omitted).
        :param rows: List of Tuples with the same number and types that cols.
        If it is not a list of tuples every item is wrapped in a 1-tuple.
        :param infer_schema: When True, the data type of a column given only
        by name is inferred from the value in the first row; otherwise the
        column is created as StringType.
        :param pdf: a pandas dataframe. When it passes the is_ check against
        pd.DataFrame, it is converted directly and cols/rows are not read.
        :return: Dataframe
        :raises TypeError: if an item in cols is neither a single element nor
        a tuple.
        """
        if is_(pdf, pd.DataFrame):
            return Spark.instance.spark.createDataFrame(pdf)

        # Process the rows
        if not is_list_of_tuples(rows):
            rows = [(i, ) for i in rows]

        specs = []
        # Process the columns. Each one is paired with the value in the first
        # row, which is only used when the data type has to be inferred.
        for c, r in zip(cols, rows[0]):
            if is_one_element(c):
                # Only the column name was given
                col_name = c
                if infer_schema is True:
                    var_type = infer(r)
                else:
                    var_type = StringType()
                nullable = True

            elif is_tuple(c):
                # (name, data type[, nullable])
                col_name = c[0]
                var_type = get_spark_dtypes_object(c[1])
                # If tuple has not the third param we put it to true to
                # accept Null in columns
                nullable = c[2] if len(c) > 2 else True

            else:
                # Without this branch the values of the previous column would
                # be appended again (or be undefined for the first column).
                raise TypeError(
                    "Unsupported column specification: {!r}".format(c))

            specs.append([col_name, var_type, nullable])

        struct_fields = [StructField(*spec) for spec in specs]

        return Spark.instance.spark.createDataFrame(
            rows, StructType(struct_fields))