def traverse(obj, path=None, callback=None):
    """
    Traverse a deep nested python structure
    :param obj: object to traverse
    :param path: list of keys accumulated so far; leave as None on the initial call
    :param callback: Function used to transform a value
    :return: the structure with the callback applied to every node, or the structure unchanged if no callback is given
    """
    if path is None:
        path = []

    if is_(obj, dict):
        value = {k: traverse(v, path + [k], callback) for k, v in obj.items()}

    elif is_(obj, list):
        value = [traverse(elem, path + [[]], callback) for elem in obj]

    elif is_(obj, tuple):
        value = tuple(traverse(elem, path + [[]], callback) for elem in obj)

    elif is_(obj, DenseVector):
        value = DenseVector([traverse(elem, path + [[]], callback) for elem in obj])

    else:
        value = obj

    if callback is None:
        return value
    else:
        # if a callback is provided, call it to get the new value
        return callback(path, value)
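A minimal usage sketch (assuming traverse and the is_ helper are importable from this module); the callback receives the accumulated path and the already-traversed value, and whatever it returns replaces that node:

    # Hypothetical example: upper-case every string leaf in a nested structure.
    data = {"name": "alice", "tags": ["a", "b"], "meta": {"city": "bogota"}}

    def upper_strings(path, value):
        # path is the list of dict keys leading here; list/tuple levels contribute [] markers
        return value.upper() if isinstance(value, str) else value

    result = traverse(data, callback=upper_strings)
    # {'name': 'ALICE', 'tags': ['A', 'B'], 'meta': {'city': 'BOGOTA'}}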
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
    It takes parameter p, which specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p: p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """
    # The columns argument must be a string or a list
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    if is_(input_cols, [float, int]):
        RaiseIt.type_error(input_cols, [float, int])

    df = df.cols.cast(input_cols, "vector")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [Normalizer(inputCol=col_name, outputCol=name_col(col_name, "normalized"), p=p)
              for col_name in list(set(input_cols))]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
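An illustrative call; the column name and data are assumptions, Spark.instance.spark is the session accessor used elsewhere in this module, and the output column name comes from name_col, whose exact format is defined elsewhere in the library:

    from pyspark.ml.linalg import Vectors

    spark = Spark.instance.spark  # assumed session accessor, as used in data_frame()
    df = spark.createDataFrame([(Vectors.dense([1.0, 2.0, 3.0]),),
                                (Vectors.dense([4.0, 5.0, 6.0]),)],
                               ["features"])

    # df must expose the .cols accessor used inside normalizer (Optimus-extended DataFrame)
    df = normalizer(df, input_cols="features", p=2.0)
    df.show()  # adds a unit-norm column named by name_col("features", "normalized")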
def load(self, df):
    """
    Load the dataframe to the mongo collection
    :param df: dataframe to be sent to the enricher
    :return:
    """
    if is_(df, pd.DataFrame):
        self.get_collection(self.collection_name).insert_many(df.to_dict("records"))
    elif is_(df, DataFrame):
        df.save.mongo(self.host, self.port, self.db_name, self.collection_name)
    else:
        raise Exception("df must be a Spark Dataframe or Pandas Dataframe")
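A hypothetical call, assuming load() belongs to an enricher object that already carries the Mongo connection settings (host, port, db_name, collection_name) and a get_collection() helper; the enricher and spark_df names below are placeholders, not real constructors from the library:

    import pandas as pd

    pdf = pd.DataFrame({"name": ["alice", "bob"], "age": [30, 25]})
    enricher.load(pdf)        # pandas path: documents are written with insert_many
    enricher.load(spark_df)   # Spark path: delegates to df.save.mongo(...)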
def data_frame(cols=None, rows=None, infer_schema=True, pdf=None):
    """
    Helper to create a Spark dataframe
    :param cols: List of tuples with name, data type and a flag to accept null
    :param rows: List of tuples with the same number and types as cols
    :param infer_schema: Try to infer the schema data type.
    :param pdf: a pandas dataframe
    :return: Dataframe
    """
    if is_(pdf, pd.DataFrame):
        df = Spark.instance.spark.createDataFrame(pdf)
    else:
        specs = []
        # Process the rows
        if not is_list_of_tuples(rows):
            rows = [(i,) for i in rows]

        # Process the columns
        for c, r in zip(cols, rows[0]):
            # Get the column name
            if is_one_element(c):
                col_name = c

                if infer_schema is True:
                    var_type = infer(r)
                else:
                    var_type = StringType()
                nullable = True

            elif is_tuple(c):
                # Get the column name and data type
                col_name = c[0]
                var_type = parse_spark_class_dtypes(c[1])

                count = len(c)
                # If the tuple does not have the third param, set it to True to accept nulls in the column
                if count == 2:
                    nullable = True
                elif count == 3:
                    nullable = c[2]

            specs.append([col_name, var_type, nullable])

        struct_fields = list(map(lambda x: StructField(*x), specs))

        df = Spark.instance.spark.createDataFrame(rows, StructType(struct_fields))

    df = df.columns_meta(df.cols.names())
    return df
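A sketch of building a dataframe from column specs; the dtype strings are illustrative and must match whatever aliases parse_spark_class_dtypes accepts:

    # Each column is a (name, dtype) or (name, dtype, nullable) tuple; nullable defaults to True
    df = data_frame(cols=[("name", "str"), ("age", "int", False)],
                    rows=[("alice", 30), ("bob", 25)])
    df.printSchema()
    df.show()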