def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
    It takes parameter p, which specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p: p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """
    # The columns argument must be a string or a list
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    # The p argument must be numeric
    if not is_(p, [float, int]):
        RaiseIt.type_error(p, [float, int])

    df = df.cols.cast(input_cols, "vector")

    normal = [Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
              for column in list(set(input_cols))]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
def count_na(columns):
    """
    Return the NaN and Null count in a Column
    :param columns: '*', list of columns names or a single column name.
    :return:
    """
    columns = parse_columns(self, columns)
    df = self

    expr = []
    for col_name in columns:
        # If the column type is Struct or Boolean cast it to String. isnan/isNull can not handle Struct/Boolean
        if is_(df.cols.schema_dtype(col_name), (StructType, BooleanType)):
            df = df.cols.cast(col_name, "string")

        if is_(df.cols.schema_dtype(col_name), (float, int)):
            expr.append(F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)).alias(col_name))
        elif is_(df.cols.schema_dtype(col_name), NullType):
            expr.append(F.count(col_name).alias(col_name))
        else:
            expr.append(F.count(F.when(F.col(col_name).isNull(), col_name)).alias(col_name))

    result = format_dict(df.select(*expr).to_json())

    return result
def traverse(obj, path=None, callback=None):
    """
    Traverse a deeply nested python structure
    :param obj: object to traverse
    :param path: list of keys visited so far
    :param callback: Function used to transform a value
    :return:
    """
    if path is None:
        path = []

    if is_(obj, dict):
        value = {k: traverse(v, path + [k], callback)
                 for k, v in obj.items()}
    elif is_(obj, list):
        value = [traverse(elem, path + [[]], callback)
                 for elem in obj]
    elif is_(obj, tuple):
        value = tuple(traverse(elem, path + [[]], callback)
                      for elem in obj)
    elif is_(obj, DenseVector):
        value = DenseVector([traverse(elem, path + [[]], callback) for elem in obj])
    else:
        value = obj

    if callback is None:
        # No callback provided, return the value unchanged
        return value
    else:
        # A callback is provided, call it to get the new value
        return callback(path, value)
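# A minimal usage sketch for traverse(), assuming the function above and its
# helpers (is_, DenseVector) are importable. The callback receives the path of
# keys visited plus the current value and returns the transformed value; the
# upper_strings helper below is hypothetical, just for illustration.
def upper_strings(path, value):
    return value.upper() if isinstance(value, str) else value

nested = {"name": "optimus", "tags": ["spark", "etl"], "meta": {"owner": "data team"}}
# Expected: {'name': 'OPTIMUS', 'tags': ['SPARK', 'ETL'], 'meta': {'owner': 'DATA TEAM'}}
print(traverse(nested, callback=upper_strings))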
def send(self, df):
    """
    Send the dataframe to the mongo collection
    :param df: dataframe to be sent to the enricher
    :return:
    """
    if is_(df, pd.DataFrame):
        self.get_collection(self.collection_name).insert_many(df.to_dict("records"))
    elif is_(df, DataFrame):
        df.save.mongo(self.host, self.port, self.db_name, self.collection_name)
    else:
        raise Exception("df must be a Spark Dataframe or a Pandas Dataframe")
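# Hypothetical usage sketch for send(), assuming an Enricher-like object that
# exposes this method and a reachable MongoDB instance; the constructor
# arguments shown here are illustrative, not a confirmed API.
import pandas as pd

pdf = pd.DataFrame({"url": ["http://example.com/a", "http://example.com/b"]})
enricher = Enricher(host="localhost", port=27017, db_name="enricher_db",
                    collection_name="urls")
enricher.send(pdf)  # pandas frames are inserted with insert_many(...)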
def append(col_name=None, value=None):
    """
    Append a column to a Dataframe
    :param col_name: Name of the new column
    :param value: List of data values
    :return:
    """

    def lit_array(_value):
        temp = []
        for v in _value:
            temp.append(F.lit(v))
        return F.array(temp)

    df = self

    if is_num_or_str(value):
        value = F.lit(value)
    elif is_list(value):
        value = lit_array(value)
    elif is_tuple(value):
        value = lit_array(list(value))

    if is_(value, F.Column):
        df = df.withColumn(col_name, value)

    return df
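# Minimal usage sketch for append(), assuming it is exposed on the DataFrame
# through Optimus' cols accessor (df.cols.append); column names and values are
# illustrative only.
df = df.cols.append("status", "active")    # scalar -> F.lit, same value in every row
df = df.cols.append("scores", [1, 2, 3])   # list -> F.array of literal values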
def nest(input_cols, output_col, shape="string", separator=""): """ Concat multiple columns to one with the format specified :param input_cols: columns to be nested :param output_col: final column with the nested content :param separator: char to be used as separator at the concat time :param shape: final data type, 'array', 'string' or 'vector' :return: Spark DataFrame """ df = self if has_(input_cols, F.Column): # Transform non Column data to lit columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols] else: columns = parse_columns(self, input_cols) if shape is "vector": columns = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES) vector_assembler = VectorAssembler( inputCols=columns, outputCol=output_col) df = vector_assembler.transform(df) elif shape is "array": df = apply_expr(output_col, F.array(*columns)) elif shape is "string": df = apply_expr(output_col, F.concat_ws(separator, *columns)) else: RaiseIt.value_error(shape, ["vector", "array", "string"]) return df
def apply_expr(columns, func=None, args=None, filter_col_by_dtypes=None, verbose=True):
    """
    Apply an expression to a column.
    :param columns: Columns in which the function is going to be applied
    :param func: function to be applied
    :type func: A plain expression or a function
    :param args: Argument passed to the function
    :param filter_col_by_dtypes: Only apply the function to specific types of value: integer, float, string or bool
    :param verbose: Print additional information about the transformation
    :return: Dataframe
    """

    # Handle the case where the func param is a plain expression instead of a function returning an expression
    def func_col_exp(col_name, attr):
        return func

    if is_(func, F.Column):
        _func = func_col_exp
    else:
        _func = func

    columns = parse_columns(self, columns,
                            filter_by_column_dtypes=filter_col_by_dtypes,
                            accepts_missing_cols=True)

    df = self
    for col_name in columns:
        df = df.withColumn(col_name, audf(col_name, _func, attrs=args,
                                          func_type="column_exp", verbose=verbose))
    return df
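# Minimal usage sketch for apply_expr(), assuming it is exposed as
# df.cols.apply_expr on an Optimus-patched DataFrame. Only the plain-expression
# form is shown; the exact callback signature for the function form depends on
# audf(), which is not reproduced here.
from pyspark.sql import functions as F

# Plain expression: double the value of the column in place
df = df.cols.apply_expr("price", F.col("price") * 2)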
def count_na(columns):
    """
    Return the NaN and Null count in a Column
    :param columns: '*', list of columns names or a single column name.
    :return:
    """
    columns = parse_columns(self, columns)
    df = self

    expr = []
    for col_name in columns:
        # If the column type is Struct or Boolean cast it to String. isnan/isNull can not handle Struct columns
        if is_(df.cols.schema_dtypes(col_name), (StructType, BooleanType)):
            df = df.cols.cast(col_name, "string")

        expr.append(F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)).alias(col_name))

    result = format_dict(collect_as_dict(df.select(*expr).collect()))

    return result
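# Minimal usage sketch for count_na(), assuming it is exposed as
# df.cols.count_na on an Optimus-patched DataFrame; column names are illustrative.
df.cols.count_na("*")              # NaN/Null counts for every column
df.cols.count_na("age")            # a single column
df.cols.count_na(["age", "name"])  # a subset of columns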
def data_frame(cols=None, rows=None, infer_schema=True, pdf=None):
    """
    Helper to create a Spark dataframe
    :param cols: List of Tuples with name, data type and a flag to accept null
    :param rows: List of Tuples with the same number and types as cols
    :param infer_schema: Try to infer the schema data type.
    :param pdf: a pandas dataframe
    :return: Dataframe
    """
    if is_(pdf, pd.DataFrame):
        result = Spark.instance.spark.createDataFrame(pdf)
    else:
        specs = []
        # Process the rows
        if not is_list_of_tuples(rows):
            rows = [(i,) for i in rows]

        # Process the columns
        for c, r in zip(cols, rows[0]):
            # Get the column name
            if is_one_element(c):
                col_name = c

                if infer_schema is True:
                    var_type = infer(r)
                else:
                    var_type = StringType()
                nullable = True

            elif is_tuple(c):
                # Get the column data type
                col_name = c[0]
                var_type = get_spark_dtypes_object(c[1])

                count = len(c)
                if count == 2:
                    # If the tuple does not have the third param, default to True so the column accepts Nulls
                    nullable = True
                elif count == 3:
                    nullable = c[2]

            specs.append([col_name, var_type, nullable])

        struct_fields = list(map(lambda x: StructField(*x), specs))

        result = Spark.instance.spark.createDataFrame(rows, StructType(struct_fields))

    return result
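# Minimal usage sketch for data_frame(), based on the tuple handling above:
# each column spec is (name, dtype) or (name, dtype, nullable); a bare name is
# also accepted, in which case the type is inferred from the first row. The
# short dtype names are an assumption about what get_spark_dtypes_object accepts.
df = data_frame(
    cols=[("name", "string"), ("age", "int", True)],
    rows=[("Tom", 35), ("Ana", 28)],
)
df.printSchema()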
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
    It takes parameter p, which specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p: p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """
    # The columns argument must be a string or a list
    assert isinstance(input_cols, (str, list)), \
        "Error: %s argument must be a string or a list." % "input_cols"

    if isinstance(input_cols, str):
        input_cols = [input_cols]

    assert isinstance(p, (float, int)), "Error: p argument must be a numeric value."

    # Convert ArrayType() column to DenseVector
    def arr_to_vec(arr_column):
        """
        :param arr_column: Column name
        :return: Returns DenseVector by converting an ArrayType() column
        """
        return DenseVector(arr_column)

    # User-Defined function
    # TODO: use apply() to use Pyarrow
    udf_arr_to_vec = F.udf(arr_to_vec, VectorUDT())

    # Check for columns which are not DenseVector types and convert them into DenseVector
    for col in input_cols:
        if not is_(df[col], DenseVector):
            df = df.withColumn(col, udf_arr_to_vec(df[col]))

    normal = [Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
              for column in list(set(input_cols))]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
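# Minimal usage sketch for normalizer(), assuming a SparkSession named spark is
# available; the data and column names are illustrative.
data = [(1, [1.0, 2.0, 2.0]), (2, [4.0, 0.0, 3.0])]
df = spark.createDataFrame(data, ["id", "features"])

# L2-normalize the array column; the result lands in "features_normalized"
df = normalizer(df, input_cols=["features"], p=2.0)
df.show(truncate=False)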
def unnest(columns, mark=None, splits=None, index=None):
    """
    Split an array or string into different columns
    :param columns: Columns to be un-nested
    :param mark: Separator to split on if the column is a string.
    :param splits: Number of splits to un-nest into. If None it is inferred from the first row,
        because we can not know the number of splits beforehand.
    :param index: Return only the split at this position
    :return: Spark DataFrame
    """
    # If the number of splits was not defined, try to infer the length from the first element
    infer_splits = None
    if splits is None:
        infer_splits = True

    columns = parse_columns(self, columns)

    df = self

    for col_name in columns:
        col_dtype = self.schema[col_name].dataType

        # Array
        if is_(col_dtype, ArrayType):
            expr = F.col(col_name)
            # Try to infer the array length using the first row
            if infer_splits is True:
                splits = len(self.cols.cell(col_name))

            for i in builtins.range(splits):
                df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

        # String
        elif is_(col_dtype, StringType):
            expr = F.split(F.col(col_name), mark)
            # Try to infer the array length using the first row
            if infer_splits is True:
                splits = len(self.cols.cell(col_name).split(mark))

            if is_int(index):
                r = builtins.range(index, index + 1)
            else:
                r = builtins.range(0, splits)

            for i in r:
                df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

        # Vector
        elif is_(col_dtype, VectorUDT):

            def _unnest(row):
                _dict = row.asDict()

                # Get the column we want to unnest
                _list = _dict[col_name]

                # Ensure that floats are python floats and not numpy floats
                if index is None:
                    _list = [float(x) for x in _list]
                else:
                    _list = [float(_list[index])]

                return row + tuple(_list)

            df = df.rdd.map(_unnest).toDF(df.columns)

    return df
def run(self, df, collection_name=None, func_request=None, func_response=None, return_type="json",
        calls=60, period=60, max_tries=8):
    """
    Read the url key from a mongo collection and make a request to a service
    :param df: Dataframe to be loaded to the enricher collection.
    :param collection_name: Custom collection to save the data.
    :param func_request: helps to create a custom request
    :param func_response: helps to create a custom response
    :param return_type:
    :param calls: how many calls can be made
    :param period: in which period of time the calls can be made
    :param max_tries: how many retries should we do
    :return:
    """

    if is_(df, DataFrame):
        df = df.create_id(COL_ID)

    # Load the dataframe data in the enricher
    self.send(df)

    if collection_name is None:
        collection_name = self.collection_name
    collection = self.get_collection(collection_name)

    # Get data that is not yet enriched
    cursor = collection.find({COL_RESULTS: {"$exists": False}})

    total_docs = cursor.count(True)

    if func_request is None:
        func_request = requests.get

    collection = self.get_collection(collection_name)

    @on_exception(expo, RateLimitException, max_tries=max_tries)
    @limits(calls=calls, period=period)
    def _func_request(v):
        return func_request(v)

    if total_docs > 0:
        for c in tqdm_notebook(cursor, total=total_docs, desc='Processing...'):

            # Send request to the API
            response = _func_request(c)

            mongo_id = c["_id"]

            if response.status_code == 200:
                if return_type == "json":
                    response = json.loads(response.text)
                elif return_type == "text":
                    response = response.text

                # Process the result with an external function
                if is_function(func_response):
                    response = func_response(response)

                # Update the mongo document with the result
                collection.find_and_modify(query={"_id": mongo_id},
                                           update={"$set": {COL_RESULTS: response}},
                                           upsert=False, full_response=True)
            else:
                # The response key will remain blank so we can filter it to retry in a future request
                logging.info(response.status_code)

        # Append the enrichment data to the dataframe
        logging.info("Appending collection info into the dataframe")

        # TODO: An elegant way to handle pickling?
        # Take plain values out of self so the UDF does not have to pickle the enricher object
        host = self.host
        port = self.port
        db_name = self.db_name

        @pandas_udf('string', PandasUDFType.SCALAR)
        def func(value):
            # The client is created inside the UDF so it is not pickled to the workers
            from pymongo import MongoClient
            _client = MongoClient(host, port)
            _db = _client[db_name]
            _collection = _db[collection_name]

            def func_serie(serie):
                _cursor = _collection.find_one({COL_ID: serie},
                                               projection={"_id": 0, COL_RESULTS: 1})
                return _cursor[COL_RESULTS]

            return value.apply(func_serie)

        df = df.withColumn(COL_RESULTS, func(df[COL_ID])).cols.drop(COL_ID).run()

        # If the process is finished, flush the Mongo collection
        self.flush()
        return df
    else:
        print("No records available to process")
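# Hypothetical end-to-end sketch for run(), assuming an Enricher-like object and
# that the stored documents carry a url key to call; my_request and my_response
# below are illustrative callbacks, not a confirmed API.
import requests

def my_request(doc):
    # Build the HTTP call from the stored document
    return requests.get(doc["url"])

def my_response(payload):
    # Keep only the field we care about from the JSON payload
    return {"score": payload.get("score")}

enriched_df = enricher.run(df,
                           func_request=my_request,
                           func_response=my_response,
                           return_type="json",
                           calls=60, period=60)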
def unnest(columns, mark=None, n=None, index=None):
    """
    Split an array or string into different columns
    :param columns: Columns to be un-nested
    :param mark: Separator to split on if the column is a string
    :param n: Number of splits to un-nest into
    :param index: Return only the split at this position
    :return: Spark DataFrame
    """
    # If the number of splits was not defined, try to infer the length from the first element
    infer_n = None
    if n is None:
        infer_n = True

    columns = parse_columns(self, columns)

    df = self

    for col_name in columns:
        expr = None
        col_dtype = self.schema[col_name].dataType

        # Array
        if is_(col_dtype, ArrayType):
            expr = F.col(col_name)
            # Try to infer the array length using the first row
            if infer_n is True:
                n = len(self.cols.cell(col_name))

            for i in builtins.range(n):
                df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

        # String
        elif is_(col_dtype, StringType):
            expr = F.split(F.col(col_name), mark)
            # Try to infer the array length using the first row
            if infer_n is True:
                n = len(self.cols.cell(col_name).split(mark))

            if is_int(index):
                r = builtins.range(index, index + 1)
            else:
                r = builtins.range(0, n)

            for i in r:
                df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

        # Vector
        elif is_(col_dtype, VectorUDT):
            def extract(row):
                return row + tuple(row.vector.toArray().tolist())

            df = df.rdd.map(extract).toDF(df.columns)

    return df
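# Minimal usage sketch for unnest(), assuming it is exposed as df.cols.unnest on
# an Optimus-patched DataFrame; column names are illustrative.
# Split a space-separated string column into two new columns: full_name_0, full_name_1
df = df.cols.unnest("full_name", mark=" ", n=2)

# Keep only the split at position 1 (e.g. the last name)
df = df.cols.unnest("full_name", mark=" ", index=1)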