def table_to_df(self, table_name, columns="*", limit=None):
    """
    Return a specific table from the database as a Spark dataframe
    :param table_name: name of the table to be retrieved
    :param columns: column names to select, or "*" for all columns
    :param limit: how many rows will be retrieved
    """
    db_table = "public." + table_name

    if self._limit(limit) == "":
        # Count the rows first to warn the user how long it can take to bring the whole table
        query = "SELECT COUNT(*) FROM " + db_table
        count = self.execute(query, "all").to_json()[0]["count"]
        print(str(count) + " rows")

    if columns == "*":
        columns_sql = "*"
    else:
        columns = val_to_list(columns)
        columns_sql = ",".join(columns)

    query = "SELECT " + columns_sql + " FROM " + db_table
    logger.print(query)
    df = self.execute(query, limit)

    # Bring the data to the local machine so that it is not retrieved from the
    # remote server every time an action is called
    df = df.run()
    return df
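# Usage sketch for table_to_df. It assumes an Optimus JDBC-style connection
# object exposes this method (as self.execute and self._limit suggest); the
# connect() call, its parameters and the "clients" table are hypothetical
# placeholders, not taken from the source.
from optimus import Optimus

op = Optimus()
db = op.connect(driver="postgresql", host="localhost", database="store",
                user="user", password="****")
clients_df = db.table_to_df("clients", columns=["id", "name"], limit="100")
clients_df.table(title="clients")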
def melt(self, id_vars, value_vars, var_name="variable", value_name="value",
         data_type="str"):
    """
    Convert a DataFrame from wide to long format.
    :param self: Spark DataFrame
    :param id_vars: column(s) with unique values, kept as identifiers
    :param value_vars: column names that are going to be unpivoted into rows
    :param var_name: column name for the variable names
    :param value_name: column name for the values
    :param data_type: all columns must have the same type, so every column is cast to this data type
    :return: Spark DataFrame in long format
    """
    df = self
    id_vars = val_to_list(id_vars)

    # Cast all columns to the same type
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name))
                     for c in value_vars]

    # Add the struct array to the DataFrame and explode it into one row per value
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]]

    return df.select(*cols)
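# Minimal sketch of melt on a toy dataframe. It assumes melt() is attached to
# Spark DataFrames by Optimus and that op.create.df builds a dataframe from
# column names and row tuples; the column and value names are illustrative.
from optimus import Optimus

op = Optimus()
wide_df = op.create.df(["id", "2018", "2019"],
                       [("a", 10, 20), ("b", 30, 40)])

# One row per (id, year) pair; every value is cast to string by default
long_df = wide_df.melt(id_vars="id", value_vars=["2018", "2019"],
                       var_name="year", value_name="sales")
long_df.table()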
def append(rows):
    """
    Append rows at the end of a dataframe
    :param rows: list of values or tuples to be appended, or dataframe(s) to be unioned
    :return: Spark DataFrame
    """
    df = self

    if is_list_of_dataframes(rows) or is_dataframe(rows):
        # One or more dataframes are concatenated row-wise
        rows = val_to_list(rows)
        rows.insert(0, df)
        df_result = append_df(rows, like="rows")

    elif is_list(rows):
        columns = [str(i) for i in range(df.cols.count())]
        if not is_list_of_tuples(rows):
            # A plain list of values is treated as a single row
            rows = [tuple(rows)]
        # Build a throwaway dataframe with the new rows and union it
        new_row = Create().df(columns, rows)
        df_result = df.union(new_row)

    else:
        RaiseIt.type_error(rows, ["list of tuples", "list of dataframes"])

    df_result = df_result.preserve_meta(self, Actions.NEST.value, df.cols.names())

    return df_result
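# Sketch of the two append paths above, assuming the function is exposed as
# df.rows.append (it closes over self, so the attachment point is an
# assumption); data is illustrative.
from optimus import Optimus

op = Optimus()
df = op.create.df(["0", "1"], [("a", 1), ("b", 2)])

# A list of tuples is turned into a throwaway dataframe and unioned row-wise
df = df.rows.append([("c", 3)])

# A dataframe (or a list of them) is concatenated through append_df instead
other = op.create.df(["0", "1"], [("d", 4)])
df = df.rows.append(other)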
def _exprs(funcs, columns):
    """
    Helper function to apply multiple column expressions to multiple columns
    :param funcs: aggregation functions from Apache Spark
    :param columns: list or string of column names
    :return:
    """

    def parse_col_names_funcs_to_keys(data):
        """
        Helper function that returns a formatted json with function:value pairs inside columns.
        Transform from
        {'max_antiguedad_anos': 15,
         'max_m2_superficie_construida': 1800000,
         'min_antiguedad_anos': 2,
         'min_m2_superficie_construida': 20}
        to
        {'m2_superficie_construida': {'min': 20, 'max': 1800000},
         'antiguedad_anos': {'min': 2, 'max': 15}}
        :param data: json data
        :return: json
        """
        functions_array = ["min", "max", "stddev", "kurtosis", "mean", "skewness", "sum",
                           "variance", "approx_count_distinct", "na", "zeros", "percentile"]
        result = {}
        if is_dict(data):
            for k, v in data.items():
                for f in functions_array:
                    temp_func_name = f + "_"
                    if k.startswith(temp_func_name):
                        _col_name = k[len(temp_func_name):]
                        result.setdefault(_col_name, {})[f] = v
            return result
        else:
            return data

    columns = parse_columns(self, columns)

    # Ensure that it is a list
    funcs = val_to_list(funcs)

    df = self

    # Parse the columns to float. It seems that Spark can handle some aggregations
    # with string columns, giving unexpected results
    # df = df.cols.cast(columns, "float")

    # Create a column expression for every column
    exprs = []
    for col_name in columns:
        for func in funcs:
            exprs.append(func(col_name).alias(func.__name__ + "_" + col_name))

    return parse_col_names_funcs_to_keys(format_dict(df.agg(*exprs).to_json()))
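# Conceptual sketch of how _exprs is driven. Public aggregations such as
# df.cols.min / df.cols.max plausibly delegate to it with Spark aggregation
# functions; _exprs closes over self, so this direct call is illustrative only.
from pyspark.sql import functions as F

# Builds F.min("age"), F.max("age"), F.min("salary"), F.max("salary"),
# aggregates them in a single pass, then regroups the flat keys per column:
# {'age': {'min': ..., 'max': ...}, 'salary': {'min': ..., 'max': ...}}
stats = _exprs([F.min, F.max], ["age", "salary"])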
def replace(columns, search_and_replace=None, value=None, regex=None):
    """
    Replace a value or a list of values with a specified string
    :param columns: '*', list of column names or a single column name
    :param search_and_replace: values to look for to be replaced
    :param value: new value to replace the old one
    :param regex: when set, search_and_replace is treated as a regular expression pattern
    :return:
    """
    replace = None
    search = None

    if is_list_of_tuples(search_and_replace):
        params = list(zip(*search_and_replace))
        search = list(params[0])
        replace = list(params[1])

    elif is_list(search_and_replace):
        search = search_and_replace
        replace = value

    elif is_one_element(search_and_replace):
        search = val_to_list(search_and_replace)
        replace = value

    if regex:
        search = search_and_replace
        replace = value

    # If regex or normal replace we use regexp or replace functions
    # TODO check if .contains can be used instead of regexp
    def func_regex(_df, _col_name, _search, _replace):
        return _df.withColumn(_col_name, F.regexp_replace(_col_name, _search, _replace))

    def func_replace(_df, _col_name, _search, _replace):
        data_type = self.cols.dtypes(_col_name)
        _search = [PYTHON_TYPES_[data_type](s) for s in _search]
        _df = _df.replace(_search, _replace, _col_name)
        return _df

    if regex:
        func = func_regex
    else:
        func = func_replace

    df = self

    columns = parse_columns(self, columns, filter_by_column_dtypes="string")
    for c in columns:
        df = func(df, c, search, replace)

    return df
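# Sketch of both replace modes, assuming the function is exposed as
# df.cols.replace and works on string columns; data is illustrative.
from optimus import Optimus

op = Optimus()
df = op.create.df(["name"], [("optimus",), ("bumblebee",)])

# Plain replace driven by a list of (search, replace) tuples
df = df.cols.replace("name", [("optimus", "prime")])

# Regex replace: the pattern goes in search_and_replace, the substitution in value
df = df.cols.replace("name", "b.+e", "bee", regex=True)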
def columns_meta(self, value):
    """
    Shortcut to append the names of transformed columns to the dataframe metadata
    :param self: Spark DataFrame
    :param value: column name or list of column names to register
    :return: Spark DataFrame
    """
    df = self
    value = val_to_list(value)
    for v in value:
        df = df.update_meta("transformations.columns", v, list)
    return df
def action_meta(self, key, value):
    """
    Shortcut to append an action applied to a dataframe to its metadata
    :param self: Spark DataFrame
    :param key: name of the action
    :param value: column name or list of column names affected by the action
    :return: Spark DataFrame
    """
    df = self
    value = val_to_list(value)
    for v in value:
        df = df.update_meta("transformations.actions." + key, v, list)
    return df
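# Sketch of both metadata shortcuts above, assuming they are attached to the
# dataframe like the other helpers; the action key "cast" and the column
# names are illustrative.
from optimus import Optimus

op = Optimus()
df = op.create.df(["name", "age"], [("a", 1)])

df = df.columns_meta("age")            # record that "age" was transformed
df = df.action_meta("cast", "age")     # record which action touched it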
def show(self, table_names="*", limit="all"):
    """
    Print the contents of the requested tables
    :param table_names: "*" for every table in the database or a list of table names
    :param limit: how many rows will be retrieved per table
    """
    db = self.db

    if table_names == "*":
        table_names = db.tables_names_to_json()
    else:
        table_names = val_to_list(table_names)

    print("Total Tables:" + str(len(table_names)))

    for table_name in table_names:
        db.table_to_df(table_name, "*", limit) \
            .table(title=table_name)
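# Usage sketch for show. The object carrying it is not visible in the source,
# so the `tables` attribute below is an assumption, as are the connection
# parameters.
from optimus import Optimus

op = Optimus()
db = op.connect(driver="postgresql", host="localhost", database="store",
                user="user", password="****")
db.tables.show("*", limit="all")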
def is_in(input_cols, values):
    """
    Filter rows whose column values match any of the given values
    :param input_cols: column to be checked
    :param values: value or list of values to match against
    :return: Spark DataFrame
    """
    df = self

    # Ensure that we have a list
    values = val_to_list(values)

    # Create a column/value expression for every value
    column_expr = [(F.col(input_cols) == v) for v in values]

    # Concatenate the expressions with a logical or
    expr = reduce(lambda a, b: a | b, column_expr)

    df = df.rows.select(expr)
    df = df.preserve_meta(self, Actions.DROP_ROW.value, input_cols)
    return df
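# Sketch for is_in, assuming it is exposed as df.rows.is_in; data is
# illustrative.
from optimus import Optimus

op = Optimus()
df = op.create.df(["age"], [(10,), (20,), (30,)])

# Keeps only the rows where age is 10 or 30
df = df.rows.is_in("age", [10, 30])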
def impute(input_cols, output_cols, strategy="mean"):
    """
    Imputes missing data in the specified columns using the mean or median.
    :param input_cols: list of columns to be analyzed
    :param output_cols: list of output columns with the missing values imputed
    :param strategy: string that specifies how to compute the missing data. Can be "mean" or "median"
    :return: Dataframe object (DF with the imputed values in the output columns)
    """
    input_cols = parse_columns(self, input_cols)
    output_cols = val_to_list(output_cols)

    imputer = Imputer(inputCols=input_cols, outputCols=output_cols)

    df = self
    model = imputer.setStrategy(strategy).fit(df)
    df = model.transform(df)

    return df
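# Sketch for impute, assuming it is exposed as df.cols.impute. Spark ML's
# Imputer works on numeric columns, so the toy column is a float one; whether
# create.df infers a schema around the None is an assumption here.
from optimus import Optimus

op = Optimus()
df = op.create.df(["height"], [(1.80,), (None,), (1.60,)])

# The missing value is filled with the median of the remaining ones
df = df.cols.impute(["height"], ["height_imputed"], strategy="median")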
def value_error(var, data_values):
    """
    Raise a ValueError exception
    :param var: variable holding the offending value
    :param data_values: value or list of values accepted
    :return:
    """
    from optimus.helpers.functions import get_var_name, val_to_list

    data_values = val_to_list(data_values)

    divisor = ""
    if len(data_values) == 2:
        divisor = " or "
    elif len(data_values) > 2:
        divisor = ", "

    raise ValueError("'{var_name}' must be {type}, received '{var_type}'".format(
        var_name=get_var_name(var),
        type=divisor.join(map(lambda x: "'" + str(x) + "'", data_values)),
        var_type=var))
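# Sketch for value_error, assuming it is reachable as RaiseIt.value_error
# (the call sites above use RaiseIt.type_error the same way). For "avg" this
# raises something like:
# ValueError: 'strategy' must be 'mean' or 'median', received 'avg'
from optimus.helpers.raiseit import RaiseIt  # import path is an assumption

strategy = "avg"
if strategy not in ("mean", "median"):
    RaiseIt.value_error(strategy, ["mean", "median"])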
def __init__(self, session=None, master="local[*]", app_name="optimus", checkpoint=False, path=None,
             file_system="local", verbose=False, dl=False, server=False,
             repositories=None, packages=None, jars=None, options=None,
             additional_options=None, enricher_host="localhost", enricher_port=27017,
             queue_url=None, queue_exchange=None, queue_routing_key="optimus"):
    """
    Transform and roll out
    :param master: 'Master', 'local' or ip address to a cluster
    :param app_name: Spark app name
    :param path: path to the checkpoint folder
    :param checkpoint: If True create a checkpoint folder
    :param file_system: 'local' or 'hadoop'
    :param additional_options:
    :param options: Configuration options that are passed to spark-submit.
                    See `the list of possible options
                    <https://spark.apache.org/docs/2.1.0/configuration.html#available-properties>`_.
                    Note that any options set already through PYSPARK_SUBMIT_ARGS will override these.
    :type options: (dict[str,str])
    :param repositories: List of additional maven repositories for package lookup.
    :type repositories: (list[str])
    :param packages: Spark packages that should be installed.
    :type packages: (list[str])
    :param jars: Full paths to jar files that we want to include to the session.
    :type jars: (list[str])
    """
    if session is None:
        # If a Spark session is not passed as an argument, create it
        self.master = master
        self.app_name = app_name

        if options is None:
            options = {}
        self.options = options

        if packages is None:
            packages = []
        else:
            packages = val_to_list(packages)
        self.packages = packages

        self.repositories = repositories

        if jars is None:
            jars = []
        self.jars = jars

        self.additional_options = additional_options

        self.verbose(verbose)

        # Load Avro.
        # TODO: if Spark 2.4 is going to be used this is not necessary.
        # Maybe we can check a priori which version of Spark is going to be used
        # self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

        if dl is True:
            self._add_spark_packages(["databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11"])
            self._start_session()
            from optimus.dl.models import DL
            self.dl = DL()
        else:
            self._start_session()

        if path is None:
            path = os.getcwd()

        if checkpoint is True:
            self._set_check_point_folder(path, file_system)

    else:
        # If a session is passed by argument just save the reference
        Spark.instance = session

    # Initialize Spark
    logger.print("""
           ____        __  _
          / __ \____  / /_(_)___ ___  __  _______
         / / / / __ \/ __/ / __ `__ \/ / / / ___/
        / /_/ / /_/ / /_/ / / / / / / /_/ (__  )
        \____/ .___/\__/_/_/ /_/ /_/\__,_/____/
            /_/
            """)

    logger.print(STARTING_OPTIMUS)

    if server:
        logger.print("Starting Optimus Server...")
        s = Server()
        s.start()
        self.server_instance = s

    logger.print(SUCCESS)

    self.create = Create()
    self.load = Load()
    self.read = self.spark.read

    self.profiler = Profiler(queue_url=queue_url, queue_exchange=queue_exchange,
                             queue_routing_key=queue_routing_key)
    self.ml = ML()

    self.enricher = Enricher(op=self, host=enricher_host, port=enricher_port)
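# Minimal construction sketch for Optimus. Only master/app_name/verbose are
# exercised; packages and jars would follow the types documented above.
from optimus import Optimus

op = Optimus(master="local[*]", app_name="optimus", verbose=True)
df = op.create.df(["name"], [("optimus",)])
df.table()

# An existing SparkSession can be handed over instead of letting Optimus
# build one:
# op = Optimus(session=my_existing_spark_session)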