Example #1
def melt(self,
         id_vars,
         value_vars,
         var_name="variable",
         value_name="value",
         data_type="str"):
    """
    Convert DataFrame from wide to long format.
    :param self: Spark DataFrame
    :param id_vars: Column(s) with unique values that identify each row
    :param value_vars: Column names that are going to be converted into values of the variable column
    :param var_name: Column name for the variable names
    :param value_name: Column name for the values
    :param data_type: All columns are cast to this data type, since the melted values must share a single type
    :return: Spark DataFrame in long format
    """

    df = self
    id_vars = val_to_list(id_vars)
    # Cast all columns to the same type
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [
        F.struct(F.lit(c).alias(var_name),
                 F.col(c).alias(value_name)) for c in value_vars
    ]

    # Add to the DataFrame and explode
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [
        F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]
    ]

    return df.select(*cols)
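
A minimal usage sketch for melt(), assuming it has been attached to the Spark DataFrame as Optimus does; the column names below are made up for illustration.

# Hypothetical call; "id", "jan", "feb" are illustrative column names
long_df = df.melt(id_vars="id",
                  value_vars=["jan", "feb"],
                  var_name="month",
                  value_name="sales")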
Example #2
File: jdbc.py  Project: schatzr/Optimus
    def table_to_df(self, table_name, columns="*", limit=None):
        """
        Return the requested columns of a specific table as a Spark dataframe
        :param table_name: name of the table to read
        :param columns: '*' or a list of column names to retrieve
        :param limit: how many rows will be retrieved
        """

        db_table = "public." + table_name
        if self._limit(limit) == "":
            query = "SELECT COUNT(*) FROM " + db_table
            # Count the rows first so we can warn the user how long bringing the whole table may take
            count = self.execute(query, "all").to_json()[0]["count"]

            print(str(count) + " rows")

        if columns == "*":
            columns_sql = "*"
        else:
            columns = val_to_list(columns)
            columns_sql = ",".join(columns)

        query = "SELECT " + columns_sql + " FROM " + db_table
        logger.print(query)
        df = self.execute(query, limit)

        # Bring the data to the local machine; otherwise every action we call would
        # fetch it from the remote server again
        df = df.run()
        return df
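
A hedged usage sketch, assuming `db` is the JDBC connection object this method belongs to; the table and column names are illustrative.

# Hypothetical table and columns
df = db.table_to_df("customers", columns=["name", "age"])
df.table(title="customers")  # preview, as done in the show() example below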
Example #3
def melt(self,
         id_vars,
         value_vars,
         var_name="variable",
         value_name="value",
         data_type="str"):
    """
    Convert DataFrame from wide to long format.
    :param self: Spark Dataframe
    :param id_vars: column with unique values
    :param value_vars: Column names that are going to be converted into values of the variable column
    :param var_name: Column name for vars
    :param value_name: Column name for values
    :param data_type: All columns must have the same type, so every column is cast to this data type.
    :return: Spark DataFrame in long format
    """

    df = self
    id_vars = val_to_list(id_vars)
    # Cast all columns to the same type
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [
        F.struct(F.lit(c).alias(var_name),
                 F.col(c).alias(value_name)) for c in value_vars
    ]

    # Add to the DataFrame and explode
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [
        F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]
    ]

    return df.select(*cols)
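
A self-contained PySpark sketch of the mechanic melt() relies on: pack each value column into a struct, collect the structs in an array, explode the array into rows, then pull the struct fields out as columns. The data and column names are made up for illustration.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1, 2), ("b", 3, 4)], ["id", "jan", "feb"])

# One struct per value column: (column name, column value)
vars_and_vals = [
    F.struct(F.lit(c).alias("variable"), F.col(c).alias("value"))
    for c in ["jan", "feb"]
]

# Explode the array of structs into one row per (id, variable) pair
long_df = (df
           .withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))
           .select("id", "vars_and_vals.variable", "vars_and_vals.value"))
long_df.show()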
Example #4
    def append(rows):
        """
        Append rows at the end of a dataframe
        :param rows: List of tuples, a DataFrame or a list of DataFrames to be appended
        :return: Spark DataFrame
        """
        df = self

        if is_list_of_tuples(rows):
            columns = [str(i) for i in range(df.cols.count())]
            new_row = Create().df(columns, rows)
            df_result = df.union(new_row)

        elif is_list_of_dataframes(rows) or is_dataframe(rows):
            row = val_to_list(rows)
            row.insert(0, df)
            df_result = append_df(row, like="rows")
        else:
            RaiseIt.type_error(rows, ["list of tuples", "list of dataframes"])

        df_result = df_result.preserve_meta(self, Actions.NEST.value,
                                            df.cols.names())

        return df_result
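
A hypothetical call, assuming this closure is exposed as df.rows.append (Optimus groups row operations under the .rows accessor); the values are illustrative.

# Append two rows given as tuples
df2 = df.rows.append([(10, "new"), (11, "another")])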
Example #5
    def _exprs(funcs, columns):
        """
        Helper function to apply multiple aggregation expressions to multiple columns
        :param funcs: Aggregation functions from Apache Spark
        :param columns: List or string of column names
        :return:
        """
        def parse_col_names_funcs_to_keys(data):
            """
            Helper function that returns a formatted JSON with function:value pairs grouped by column. Transforms from
            {'max_antiguedad_anos': 15,
            'max_m2_superficie_construida': 1800000,
            'min_antiguedad_anos': 2,
            'min_m2_superficie_construida': 20}

            to

            {'m2_superficie_construida': {'min': 20, 'max': 1800000}, 'antiguedad_anos': {'min': 2, 'max': 15}}

            :param data: json data
            :return: json
            """
            functions_array = [
                "min", "max", "stddev", "kurtosis", "mean", "skewness", "sum",
                "variance", "approx_count_distinct", "na", "zeros",
                "percentile"
            ]
            result = {}
            if is_dict(data):
                for k, v in data.items():
                    for f in functions_array:
                        temp_func_name = f + "_"
                        if k.startswith(temp_func_name):
                            _col_name = k[len(temp_func_name):]
                            result.setdefault(_col_name, {})[f] = v
                return result
            else:
                return data

        columns = parse_columns(self, columns)

        # Ensure that funcs is a list
        funcs = val_to_list(funcs)

        df = self

        # Cast the columns to float. Spark can run some aggregations on string columns,
        # which gives unexpected results
        # df = df.cols.cast(columns, "float")

        # Create a Column Expression for every column
        exprs = []
        for col_name in columns:
            for func in funcs:
                exprs.append(
                    func(col_name).alias(func.__name__ + "_" + col_name))

        return (parse_col_names_funcs_to_keys(
            format_dict(df.agg(*exprs).to_json())))
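
A standalone sketch of the key reshaping that parse_col_names_funcs_to_keys performs, using the sample data from the docstring above.

flat = {"max_antiguedad_anos": 15,
        "max_m2_superficie_construida": 1800000,
        "min_antiguedad_anos": 2,
        "min_m2_superficie_construida": 20}

nested = {}
for key, value in flat.items():
    # The real helper matches the prefix against a whitelist of function names,
    # since some of them (e.g. "approx_count_distinct") contain underscores;
    # partition("_") is a simplification that works for this sample.
    func, _, col_name = key.partition("_")
    nested.setdefault(col_name, {})[func] = value

# nested == {'antiguedad_anos': {'max': 15, 'min': 2},
#            'm2_superficie_construida': {'max': 1800000, 'min': 20}}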
Example #6
    def replace(columns, search_and_replace=None, value=None, regex=None):
        """
        Replace a value or a list of values with a specified string
        :param columns: '*', list of column names or a single column name.
        :param search_and_replace: Values to search for, or a list of (search, replace) tuples
        :param value: New value to replace the old one with
        :param regex: If set, search_and_replace is treated as a regular expression
        :return:
        """
        replace = None
        search = None

        if is_list_of_tuples(search_and_replace):
            params = list(zip(*search_and_replace))
            search = list(params[0])
            replace = list(params[1])

        elif is_list(search_and_replace):
            search = search_and_replace
            replace = value

        elif is_one_element(search_and_replace):
            search = val_to_list(search_and_replace)
            replace = value

        if regex:
            search = search_and_replace
            replace = value

        # Depending on the regex flag, use regexp_replace or the DataFrame replace function
        # TODO check if .contains can be used instead of regexp
        def func_regex(_df, _col_name, _search, _replace):
            return _df.withColumn(
                _col_name, F.regexp_replace(_col_name, _search, _replace))

        def func_replace(_df, _col_name, _search, _replace):
            data_type = self.cols.dtypes(_col_name)
            _search = [PYTHON_TYPES_[data_type](s) for s in _search]
            _df = _df.replace(_search, _replace, _col_name)
            return _df

        if regex:
            func = func_regex
        else:
            func = func_replace

        df = self

        columns = parse_columns(self,
                                columns,
                                filter_by_column_dtypes="string")
        for c in columns:
            df = func(df, c, search, replace)

        return df
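
Hypothetical calls, assuming replace() is exposed as df.cols.replace; the column name and values are illustrative.

# Pairwise replacement: a list of (search, replace) tuples
df = df.cols.replace("city", [("NYC", "New York"), ("SF", "San Francisco")])
# Replace several search values with one new value
df = df.cols.replace("city", ["N.Y.", "NY"], "New York")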
Example #7
def columns_meta(self, value):
    """
    Shortcut to append the columns touched by a transformation to the dataframe metadata
    :param self: Spark DataFrame
    :param value: Column name or list of column names
    :return: Spark DataFrame
    """
    df = self
    value = val_to_list(value)
    for v in value:
        df = df.update_meta("transformations.columns", v, list)
    return df
Example #8
def action_meta(self, key, value):
    """
    Shortcut to append a transformation action to the dataframe metadata
    :param self: Spark DataFrame
    :param key: Action name used as the metadata key
    :param value: Column name or list of column names affected by the action
    :return: Spark DataFrame
    """
    df = self
    value = val_to_list(value)
    for v in value:
        df = df.update_meta("transformations.actions." + key, v, list)
    return df
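
Hypothetical calls showing how the two metadata shortcuts above (columns_meta and action_meta) would be used; the key and column names are illustrative.

df = df.columns_meta(["price", "qty"])         # record which columns were touched
df = df.action_meta("cast", ["price", "qty"])  # record a named action on those columns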
Example #9
File: jdbc.py  Project: schatzr/Optimus
    def show(self, table_names="*", limit="all"):
        """
        Print a preview of the given tables (or of every table) in the database
        :param table_names: '*' or a list of table names
        :param limit: how many rows to show per table
        """
        db = self.db

        if table_names == "*":
            table_names = db.tables_names_to_json()
        else:
            table_names = val_to_list(table_names)

        print("Total Tables:" + str(len(table_names)))

        for table_name in table_names:
            db.table_to_df(table_name, "*", limit) \
                .table(title=table_name)
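
A hedged usage sketch, assuming `db` is the same JDBC connection object as in the table_to_df example; the table names are illustrative.

db.show()                         # preview every table in the database
db.show(["customers", "orders"])  # preview only the listed tables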
Example #10
    def is_in(columns, values):
        """
        Filter rows whose column value matches any of the given values
        :param columns: Column name to compare against
        :param values: Value or list of values to match
        :return: Spark DataFrame
        """

        # Ensure that we have a list
        values = val_to_list(values)

        # Create column/value expression
        column_expr = [(F.col(columns) == v) for v in values]

        # Combine the expressions with a logical OR
        expr = reduce(lambda a, b: a | b, column_expr)

        return self.rows.select(expr)
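
A self-contained PySpark sketch of the reduce-OR pattern used above; the data, column name and values are made up for illustration.

from functools import reduce
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "open"), (2, "closed"), (3, "pending")],
                           ["id", "status"])

values = ["open", "pending"]
column_expr = [(F.col("status") == v) for v in values]
expr = reduce(lambda a, b: a | b, column_expr)  # (status = 'open') OR (status = 'pending')

df.filter(expr).show()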
Example #11
    def is_in(input_cols, values):
        """
        Filter rows whose column value matches any of the given values
        :param input_cols: Column name to compare against
        :param values: Value or list of values to match
        :return: Spark DataFrame
        """
        df = self

        # Ensure that we have a list
        values = val_to_list(values)

        # Create column/value expression
        column_expr = [(F.col(input_cols) == v) for v in values]

        # Combine the expressions with a logical OR
        expr = reduce(lambda a, b: a | b, column_expr)
        df = df.rows.select(expr)
        df = df.preserve_meta(self, Actions.DROP_ROW.value, input_cols)
        return df
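
A hypothetical call, assuming is_in() is exposed as df.rows.is_in; the column and values are illustrative.

df_active = df.rows.is_in("status", ["active", "pending"])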
Example #12
    def impute(input_cols, output_cols, strategy="mean"):
        """
        Imputes missing data from specified columns using the mean or median.
        :param input_cols: List of columns to be analyzed.
        :param output_cols: List of output columns with missing values imputed.
        :param strategy: String that specifies the way of computing missing data. Can be "mean" or "median"
        :return: Dataframe object (DF with columns that have the imputed values).
        """

        input_cols = parse_columns(self, input_cols)
        output_cols = val_to_list(output_cols)

        imputer = Imputer(inputCols=input_cols, outputCols=output_cols)

        df = self
        model = imputer.setStrategy(strategy).fit(df)
        df = model.transform(df)

        return df
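
A hypothetical call, assuming impute() is exposed as df.cols.impute; the column names are illustrative. As the code above shows, it wraps pyspark.ml.feature.Imputer.

df = df.cols.impute(["age", "income"],
                    ["age_imputed", "income_imputed"],
                    strategy="median")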
Example #13
    def value_error(var, data_values):
        """
        Raise a ValueError exception
        :param var: Variable whose value is not accepted
        :param data_values: List of accepted values
        :return:
        """
        from optimus.helpers.functions import get_var_name, val_to_list

        data_values = val_to_list(data_values)

        divisor = ""
        if len(data_values) == 2:
            divisor = " or "
        elif len(data_values) > 2:
            divisor = ", "

        raise ValueError(
            "'{var_name}' must be {type}, received '{var_type}'".format(
                var_name=get_var_name(var),
                type=divisor.join(map(lambda x: "'" + x + "'", data_values)),
                var_type=var))
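
A sketch of how this helper would be called (as RaiseIt.value_error, the class used elsewhere in these examples) and the message it produces; the variable name and values are illustrative.

strategy = "average"
RaiseIt.value_error(strategy, ["mean", "median"])
# ValueError: 'strategy' must be 'mean' or 'median', received 'average'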
Example #14
    def __init__(self,
                 session=None,
                 master="local[*]",
                 app_name="optimus",
                 checkpoint=False,
                 path=None,
                 file_system="local",
                 verbose=False,
                 dl=False,
                 server=False,
                 repositories=None,
                 packages=None,
                 jars=None,
                 options=None,
                 additional_options=None,
                 enricher_host="localhost",
                 enricher_port=27017,
                 queue_url=None,
                 queue_exchange=None,
                 queue_routing_key="optimus"):
        """
        Transform and roll out
        :param master: 'local', 'local[*]' or the URL/IP address of the cluster master
        :param app_name: Spark app name
        :param path: path to the checkpoint folder
        :param checkpoint: If True create a checkpoint folder
        :param file_system: 'local' or 'hadoop'
        :param additional_options:


        :param options: Configuration options that are passed to spark-submit.
            See `the list of possible options
            <https://spark.apache.org/docs/2.1.0/configuration.html#available-properties>`_.
            Note that any options set already through PYSPARK_SUBMIT_ARGS will override
            these.
        :type options: (dict[str,str])
        :param repositories: List of additional maven repositories for package lookup.
        :type repositories: (list[str])

        :param packages: Spark packages that should be installed.
        :type packages: (list[str])

        :param jars: Full paths to jar files that we want to include to the session.
        :type jars: (list[str])

        """
        if session is None:
            # print("Creating Spark Session...")
            # If a Spark session is not passed as an argument, create it

            self.master = master
            self.app_name = app_name

            if options is None:
                options = {}

            self.options = options

            if packages is None:
                packages = []
            else:
                packages = val_to_list(packages)

            self.packages = packages
            self.repositories = repositories

            if jars is None:
                jars = []

            self.jars = jars
            self.additional_options = additional_options

            self.verbose(verbose)

            # Load Avro.
            # TODO: if Spark 2.4 is going to be used this is not necessary.
            #  Maybe we can check a priori which version of Spark is going to be used
            # self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

            if dl is True:
                self._add_spark_packages(
                    ["databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11"])

                self._start_session()

                from optimus.dl.models import DL
                self.dl = DL()
            else:
                self._start_session()

            if path is None:
                path = os.getcwd()

            if checkpoint is True:
                self._set_check_point_folder(path, file_system)

        else:
            # If a session is passed as an argument, just save the reference
            Spark.instance = session

        # Initialize Spark
        logger.print("""
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              """)

        logger.print(STARTING_OPTIMUS)

        if server:
            logger.print("Starting Optimus Server...")
            s = Server()
            s.start()
            self.server_instance = s

        logger.print(SUCCESS)

        self.create = Create()
        self.load = Load()
        self.read = self.spark.read
        self.profiler = Profiler(queue_url=queue_url,
                                 queue_exchange=queue_exchange,
                                 queue_routing_key=queue_routing_key)
        self.ml = ML()
        self.enricher = Enricher(
            op=self,
            host=enricher_host,
            port=enricher_port,
        )
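
A minimal usage sketch of the constructor above; the argument values and the small DataFrame are illustrative, and create.df() is the same helper seen in the append() example.

op = Optimus(master="local[*]", app_name="optimus", verbose=True)
df = op.create.df(["name", "age"], [("Alice", 30), ("Bob", 25)])
df.table(title="people")  # preview helper, as used in the show() example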