def n_gram_fingerprint(df, input_cols, n_size=2):
    """
    Calculate the n-gram fingerprint for a string column
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: Size of the n-gram
    :return: Dataframe with the n-gram fingerprint columns
    """

    def remote_white_spaces_remove_sort_join(value, args):
        # remove white spaces
        value = [x.replace(" ", "") for x in value]

        # sort and remove duplicates
        value = sorted(set(value))

        # join the tokens back together
        value = "".join(value)

        return value

    input_cols = parse_columns(df, input_cols)

    for input_col in input_cols:
        ngram_col = name_col(input_col, NGRAM_COL)
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        df = (df
              .cols.copy(input_col, ngram_col)
              .cols.lower(ngram_col)
              .cols.remove_white_spaces(ngram_col)
              .cols.remove_special_chars(ngram_col)
              .cols.remove_accents(ngram_col)
              # To create n-grams we need an Array type column
              .cols.nest(input_cols=ngram_col, output_col=ngram_col, shape='array'))

        if Optimus.cache:
            df = df.cache()

        n_gram = NGram(n=n_size, inputCol=ngram_col, outputCol=ngram_fingerprint_col)
        df = n_gram.transform(df)
        df = df.cols.apply(ngram_fingerprint_col, remote_white_spaces_remove_sort_join, "string")

    return df
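# Usage sketch (illustrative, not part of the library): assumes an Optimus 2.x
# session where op.create.df builds a Spark DataFrame from (name, type, nullable)
# tuples; the column "name" and the sample rows are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("name", "string", True)],
    [("John Doe",), ("john doe ",), ("Jhon Doe",)])

# Near-duplicate strings end up sharing the same n-gram fingerprint column
df_fp = n_gram_fingerprint(df, "name", n_size=2)
df_fp.table()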
def info(self):
    self.tmp_col = name_col(self.col_name, "z_score")
    df = self.z_score()
    max_z_score = df.rows.select(F.col(self.tmp_col) > self.threshold).cols.max(self.tmp_col)

    return {"count_outliers": self.count(),
            "count_non_outliers": self.non_outliers_count(),
            "max_z_score": max_z_score}
def info(self):
    m_z_col_name = name_col(self.col_name, "modified_z_score")
    df = self._m_z_score()
    max_m_z_score = df.rows.select(F.col(m_z_col_name) > self.threshold).cols.max(m_z_col_name)

    return {"count_outliers": self.count(),
            "count_non_outliers": self.non_outliers_count(),
            "max_m_z_score": max_m_z_score}
def drop(self):
    col_name = self.col_name
    z_col_name = name_col(col_name, "z_score")
    threshold = self.threshold

    return self.df.cols.z_score(col_name, z_col_name) \
        .rows.drop(F.col(z_col_name) > threshold) \
        .cols.drop(z_col_name)
def _m_z_score(self):
    df = self.df
    col_name = self.col_name

    mad = df.cols.mad(col_name, self.relative_error, True)
    m_z_col_name = name_col(col_name, "modified_z_score")

    return df.withColumn(m_z_col_name,
                         F.abs(0.6745 * (F.col(col_name) - mad["median"]) / mad["mad"]))
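# Worked example of the modified z-score computed above (Iglewicz-Hoaglin):
#     M_i = 0.6745 * (x_i - median) / MAD,  with MAD = median(|x_i - median|)
# The numbers below are illustrative; plain Python is used instead of Spark.
import statistics

values = [1.0, 2.0, 3.0, 4.0, 100.0]
median = statistics.median(values)                           # 3.0
mad = statistics.median([abs(v - median) for v in values])   # 1.0
m_z = [0.6745 * (v - median) / mad for v in values]
# The outlier 100.0 scores ~65.4, far above the commonly used 3.5 threshold.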
def correlation(self, input_cols, method="pearson", output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column to float where necessary and impute
    missing values
    :param self:
    :param input_cols: Columns to be processed
    :param method: Method used to calculate the correlation
    :param output: array or json
    :return:
    """
    df = self

    # Values in columns can not be null. Warn user
    input_cols = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

    # Try to parse the selected columns to float and create a vector
    # print(self.cols.count_na(input_cols))

    # Input is not a vector, transform it to a vector
    output_col = name_col(input_cols, "correlation")
    if len(input_cols) > 1:
        for col_name in input_cols:
            df = df.cols.cast(col_name, "float")
            logger.print("Casting {col_name} to float...".format(col_name=col_name))

        df = df.cols.nest(input_cols, "vector", output_cols=output_col)

    corr = Correlation.corr(df, output_col, method).head()[0].toArray()

    if output == "array":
        result = corr

    elif output == "json":
        # Parse result to json
        col_pair = []
        for col_name in input_cols:
            for col_name_2 in input_cols:
                col_pair.append({"between": col_name, "an": col_name_2})

        # flat array
        values = corr.flatten('F').tolist()

        result = []
        for n, v in zip(col_pair, values):
            # Remove correlation between the same column
            if n["between"] != n["an"]:
                n["value"] = v
                result.append(n)

        result = sorted(result, key=lambda k: k['value'], reverse=True)

    return {"cols": input_cols, "data": result}
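# Usage sketch (illustrative): correlation() is written to be monkey-patched as a
# DataFrame method (note the `self` parameter), but it can also be called directly
# with a DataFrame, as below. Column names and values are made up; the helpers it
# uses (parse_columns, Correlation, logger) are assumed importable from Optimus.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("x", "float", True), ("y", "float", True)],
    [(1.0, 2.0), (2.0, 4.1), (3.0, 5.9), (4.0, 8.2)])

# Pearson correlation between every pair of distinct numeric columns, sorted by value
result = correlation(df, ["x", "y"], method="pearson", output="json")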
def info(self):
    col_name = self.col_name
    z_col_name = name_col(col_name, "z_score")

    max_z_score = self.df.cols.z_score(col_name, z_col_name) \
        .cols.max(z_col_name)

    return {
        "count_outliers": self.count(),
        "count_non_outliers": self.non_outliers_count(),
        "max_z_score": max_z_score
    }
def info(self, output="dict"):
    self.tmp_col = name_col(self.col_name, "z_score")
    # df = self.z_score()
    df = self.df
    max_z_score = df.cols.max()

    return {
        "count_outliers": self.count(),
        "count_non_outliers": self.non_outliers_count(),
        "max_z_score": max_z_score
    }
def __init__(self, df, col_name, prefix):
    """
    :param df: Spark Dataframe
    :param col_name: column name
    :param prefix: prefix used to name the temporary column
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    self.df = df
    self.col_name = one_list_to_val(parse_columns(df, col_name))
    self.tmp_col = name_col(self.col_name, prefix)
def info(self, output: str = "dict"):
    m_z_col_name = name_col(self.col_name, "modified_z_score")
    df = self.df

    z_score = self.z_score
    max_m_z_score = df.rows.select(z_score > self.threshold).cols.max()

    return {
        "count_outliers": self.count(),
        "count_non_outliers": self.non_outliers_count(),
        "max_m_z_score": format_dict(max_m_z_score)
    }
def levenshtein_cluster(df, input_col):
    """
    Return a dataframe with the cluster of strings related to each string
    :param df:
    :param input_col:
    :return:
    """
    # Prepare a group so we don't need to apply the fingerprint to the whole data set
    df = df.select(input_col).groupby(input_col).agg(F.count(input_col).alias("count"))
    df = keycollision.fingerprint(df, input_col)

    count_col = name_col(input_col, COUNT_COL)
    cluster_col = name_col(input_col, CLUSTER_COL)
    recommended_col = name_col(input_col, RECOMMENDED_COL)
    cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)

    df_t = df.groupby(fingerprint_col).agg(
        F.collect_list(input_col).alias(cluster_col),
        F.size(F.collect_list(input_col)).alias(cluster_size_col),
        F.first(input_col).alias(recommended_col),
        F.sum("count").alias(count_col)).repartition(1)

    if Optimus.cache:
        df_t = df_t.cache()

    # Filter nearest string
    df_l = levenshtein_filter(df, input_col).repartition(1)

    if Optimus.cache:
        df_l = df_l.cache()

    # Create Cluster
    df_l = df_l.join(df_t, (df_l[input_col + "_FROM"] == df_t[fingerprint_col]), how="left") \
        .cols.drop(fingerprint_col) \
        .cols.drop([input_col + "_FROM", input_col + "_TO", input_col + "_LEVENSHTEIN_DISTANCE"])

    return df_l
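# Usage sketch (illustrative): clusters near-duplicate values of a column using the
# fingerprint + Levenshtein pipeline above. Assumes an Optimus 2.x session and that
# keycollision and levenshtein_filter are importable; sample data is made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("city", "string", True)],
    [("New York",), ("new york",), ("NewYork",), ("Chicago",)])

# One row per recommended value, with the related strings, cluster size and counts
clusters = levenshtein_cluster(df, "city")
clusters.table()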
def levenshtein_matrix(df, input_col):
    """
    Create columns with all the string combinations and their Levenshtein distance
    :param df:
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)

    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    df = df.select(fingerprint_col).distinct().select(F.col(fingerprint_col).alias(temp_col_1),
                                                      F.col(fingerprint_col).alias(temp_col_2))

    # Create all the combinations between the strings to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    return df
def base_clustering_function(df, input_cols, output, func=None, args=None):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :return:
    """
    # df = self.df
    input_cols = parse_columns(df, input_cols)
    result = {}

    for input_col in input_cols:
        output_col = name_col(input_col, FINGERPRINT_COL)
        count_col = name_col(input_col, COUNT_COL)

        # Instead of applying the fingerprint to the whole data set we group by names
        df = (df.groupBy(input_col).agg(
            F.count(input_col).alias(count_col)).select(count_col, input_col)).h_repartition(1)

        # Calculate the fingerprint
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        cluster_sum_col = name_col(input_col, CLUSTER_SUM_COL)
        cluster_count_col = name_col(input_col, CLUSTER_COUNT_COL)

        # Apply the clustering function
        df = func(df, *args)

        df = df.withColumn(cluster_col, F.create_map([input_col, count_col]))

        df = df.groupBy(output_col).agg(
            F.max(F.struct(F.col(count_col), F.col(input_col))).alias(recommended_col),
            F.collect_list(cluster_col).alias(cluster_col),
            F.count(output_col).alias(cluster_count_col),
            F.sum(count_col).alias(cluster_sum_col))

        df = df.select(recommended_col + "." + input_col, cluster_col, cluster_count_col, cluster_sum_col)

        for row in df.collect():
            _row = list(row.asDict().values())
            # Flatten the list of dicts into a single dict
            flatted_dict = {k: v for element in _row[1] for k, v in element.items()}
            result[_row[0]] = {"similar": flatted_dict, "count": _row[2], "sum": _row[3]}

    if output == "json":
        result = dump_json(result)

    return result
def __init__(self, df, col_name, threshold):
    """
    :param df:
    :param col_name:
    :param threshold:
    """
    if not is_numeric(threshold):
        raise TypeError("Numeric expected")

    self.df = df
    self.threshold = threshold
    self.col_name = one_list_to_val(parse_columns(df, col_name))
    self.tmp_col = name_col(col_name, "z_score")
    self.z_score = self.df[self.col_name].cols.z_score()
    # print("self.df_score", self.df_score)
    # print("self.df", self.df)
    super().__init__()
def n_gram_fingerprint(df, input_cols, n_size=2):
    """
    Calculate the n-gram fingerprint for a string column
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: Size of the n-gram
    :return:
    """

    def calculate_ngrams(value, args):
        # split the string into n-grams
        ngram = list(ngrams(value, n_size))

        # sort and remove duplicates
        ngram = sorted(set(ngram))

        _result = ""
        for item in ngram:
            for i in item:
                _result = _result + i

        # join the tokens back together
        _result = "".join(_result)

        return _result

    input_cols = parse_columns(df, input_cols)

    for input_col in input_cols:
        ngram_fingerprint_col = name_col(input_col, FINGERPRINT_COL)
        # ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        df = (df
              .cols.copy(input_col, ngram_fingerprint_col)
              .cols.lower(ngram_fingerprint_col)
              .cols.remove_white_spaces(ngram_fingerprint_col)
              .cols.apply(ngram_fingerprint_col, calculate_ngrams, "string",
                          output_cols=ngram_fingerprint_col)
              .cols.remove_special_chars(ngram_fingerprint_col)
              .cols.normalize_chars(ngram_fingerprint_col)
              )

    return df
def __init__(self, df, col_name, threshold):
    """
    :param df:
    :param col_name:
    :param threshold:
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_numeric(threshold):
        raise TypeError("Numeric expected")

    self.df = df
    self.threshold = threshold
    self.col_name = one_list_to_val(parse_columns(df, col_name))
    self.tmp_col = name_col(col_name, "z_score")
    self.df_score = self.z_score()
    super().__init__(self.df_score, col_name, "z_score")
def one_hot_encoder(df, input_cols, **kargs):
    """
    Maps a column of label indices to a column of binary vectors, with at most a single one-value.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be encoded
    :return: Dataframe with encoded columns
    """
    input_cols = parse_columns(df, input_cols)

    encode = [OneHotEncoder(inputCol=column, outputCol=name_col(column, "_encoded"), **kargs)
              for column in list(set(input_cols))]

    pipeline = Pipeline(stages=encode)
    df = pipeline.fit(df).transform(df)

    return df
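# Usage sketch (illustrative): OneHotEncoder expects a column of label indices
# (non-negative doubles), so a pre-indexed column is used here. Assumes an Optimus
# 2.x session; column name and values are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("color_index", "double", True)],
    [(0.0,), (1.0,), (2.0,), (1.0,)])

# Adds a sparse-vector column with the one-hot encoding of color_index
df_encoded = one_hot_encoder(df, "color_index")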
def vector_assembler(df, input_cols, output_col=None):
    """
    Combines a given list of columns into a single vector column.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be assembled
    :param output_col: Column where the output is going to be saved
    :return: Dataframe with the assembled column
    """
    input_cols = parse_columns(df, input_cols)

    if output_col is None:
        output_col = name_col(input_cols, "vector_assembler")

    assembler = [VectorAssembler(inputCols=input_cols, outputCol=output_col)]

    pipeline = Pipeline(stages=assembler)
    df = pipeline.fit(df).transform(df)

    return df
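# Usage sketch (illustrative): combines two numeric columns into a single vector
# column, naming it explicitly through output_col. Assumes an Optimus 2.x session;
# column names are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("x", "float", True), ("y", "float", True)],
    [(1.0, 2.0), (3.0, 4.0)])

df_va = vector_assembler(df, ["x", "y"], output_col="features")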
def levenshtein_filter(df, input_col):
    """
    Get the nearest string for each string
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    # TODO: must filter using an expression
    func = F.min

    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"
    temp_r = "TEMP_R"

    df = levenshtein_matrix(df, input_col)

    # Get the closest word
    df_r = (df.rows.drop(F.col(distance_col) == 0)
            .groupby(temp_col_1)
            .agg(func(distance_col).alias(distance_r_col))
            .cols.rename(temp_col_1, temp_r))

    if Optimus.cache:
        df_r = df_r.cache()

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))).select(
        temp_col_1, distance_col, temp_col_2)

    if Optimus.cache:
        df = df.cache()

    df = df \
        .cols.rename([(temp_col_1, input_col + "_FROM"), (temp_col_2, input_col + "_TO")])

    return df
def index_to_string(df, input_cols, output_col=None, **kargs):
    """
    Maps a column of indices back to a new column of corresponding string values. The index-string mapping is
    either from the ML attributes of the input column, or from user-supplied labels (which take precedence over
    ML attributes).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be indexed
    :param output_col: Column where the output is going to be saved
    :return: Dataframe with indexed columns
    """
    input_cols = parse_columns(df, input_cols)

    if output_col is None:
        output_col = name_col(input_cols, "index_to_string")

    indexers = [IndexToString(inputCol=column, outputCol=output_col, **kargs)
                for column in list(set(input_cols))]

    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)

    return df
def string_to_index(df, input_cols, **kargs):
    """
    Maps a string column of labels to an ML column of label indices. If the input column is numeric, we cast it
    to string and index the string values.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be indexed
    :return: Dataframe with indexed columns
    """
    input_cols = parse_columns(df, input_cols)

    indexers = [StringIndexer(inputCol=input_col, outputCol=name_col(input_col, "index"), **kargs).fit(df)
                for input_col in list(set(input_cols))]

    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)

    return df
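# Usage sketch (illustrative): maps a string label column to numeric indices; the
# output column is named by name_col(<col>, "index"). Assumes an Optimus 2.x
# session; column name and values are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("species", "string", True)],
    [("setosa",), ("versicolor",), ("setosa",)])

df_indexed = string_to_index(df, "species")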
def h2o_automl(df, label, columns, **kwargs):
    """
    Run H2O AutoML over the dataframe
    :param df: Dataframe to be used
    :param label: Column to predict
    :param columns: Feature columns
    :return: Dataframe with predictions and the fitted model
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    df_sti = string_to_index(df, input_cols=label)
    df_va = vector_assembler(df_sti, input_cols=columns)

    automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                       maxRuntimeSecs=60,  # 1 minute
                       seed=1,
                       maxModels=3,
                       labelCol=name_col(label, "index"),
                       **kwargs)

    model = automl.fit(df_va)
    df_raw = model.transform(df_va)

    df_pred = df_raw.withColumn("prediction",
                                when(df_raw.prediction_output["value"] > 0.5, 1.0).otherwise(0.0))

    return df_pred, model
def normalizer(df, input_cols, output_col=None, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized
    :param output_col: Column where the output is going to be saved
    :param p: p-norm used for normalization
    :return: Dataframe with normalized columns
    """
    # The columns argument must be a string or a list data type
    if not is_(input_cols, (str, list)):
        RaiseIt.type_error(input_cols, ["str", "list"])

    if is_str(input_cols):
        input_cols = [input_cols]

    if is_(input_cols, (float, int)):
        RaiseIt.type_error(input_cols, ["float", "int"])

    # Try to create a vector
    if len(input_cols) > 1:
        df = df.cols.cast(input_cols, "vector")

    if output_col is None:
        output_col = name_col(input_cols, "normalizer")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [Normalizer(inputCol=col_name, outputCol=output_col, p=p)
              for col_name in list(set(input_cols))]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
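# Usage sketch (illustrative): normalizer() expects a Vector column when a single
# input column is given, so the features are assembled first. Assumes an Optimus
# 2.x session; column names and values are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("x", "float", True), ("y", "float", True)],
    [(3.0, 4.0), (6.0, 8.0)])

df_va = vector_assembler(df, ["x", "y"], output_col="features")

# Each row's feature vector is scaled to unit L2 norm
df_norm = normalizer(df_va, "features", output_col="features_norm", p=2.0)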
def random_forest(df, columns, input_col, **kargs):
    """
    Runs a random forest classifier for the input DataFrame.
    :param df: Pyspark dataframe to analyze
    :param columns: List of columns to select for prediction
    :param input_col: Column to predict
    :return: DataFrame with random forest and prediction run.
    """
    columns = parse_columns(df, columns)

    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats)

    model = RandomForestClassifier(**kargs)
    df = df.cols.rename(name_col(input_col, "index"), "label")

    rf_model = model.fit(df)
    df_model = rf_model.transform(df)

    return df_model, rf_model
def levenshtein_cluster(df, input_col, threshold: int = None, output: str = "dict"):
    """
    Return clusters of strings within a Levenshtein distance threshold, as a dict or json
    :param df: Spark Dataframe
    :param input_col: Column to be processed
    :param threshold: Maximum Levenshtein distance to consider two strings related
    :param output: "dict" or "json"
    :return:
    """
    # Create fingerprint
    df_fingerprint = keycollision.fingerprint(df, input_col)

    # Names
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"
    count = "count"

    # Prepare the columns to calculate the cross join
    fingerprint_count = df_fingerprint.select(input_col, fingerprint_col).groupby(input_col) \
        .agg(F.first(input_col).alias(temp_col_1), F.first(fingerprint_col).alias(temp_col_2),
             F.count(input_col).alias(count)) \
        .select(temp_col_1, temp_col_2, count).collect()

    df = df_fingerprint.select(input_col,
                               F.col(fingerprint_col).alias(temp_col_1),
                               F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combinations between the strings to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # Select only the string with the shortest path
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_r = "TEMP_R"

    if threshold is None:
        where = ((F.col(distance_col) == 0) & (F.col(temp_col_1) != F.col(temp_col_2)))
    else:
        where = (F.col(distance_col) == 0) | (F.col(distance_col) > threshold)

    df_r = (df.rows.drop(where)
            .cols.replace(distance_col, 0, None, search_by="numeric")
            .groupby(temp_col_1)
            .agg(F.min(distance_col).alias(distance_r_col))
            # .cols.rename(distance_col, distance_r_col)
            .cols.rename(temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists
    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2), F.count(temp_col_2)))

    # Replace each fingerprint with its original string and count
    kv_dict = {}
    for row in fingerprint_count:
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = {_row[0]: _row[2]}

    result = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        d = {}
        for i in _row[1]:
            key = list(kv_dict[i].keys())[0]
            value = list(kv_dict[i].values())[0]
            d[key] = value

        key = list(kv_dict[_row[0]].keys())[0]
        value = list(kv_dict[_row[0]].values())[0]
        d.update({key: value})
        result[key] = d

    # Calculate count and sum
    f = {}
    for k, v in result.items():
        _sum = 0
        for x, y in v.items():
            _sum = _sum + y
        f[k] = {"similar": v, "count": len(v), "sum": _sum}

    result = f

    if output == "json":
        result = dump_json(result)

    return result
def drop(self):
    m_z_col_name = name_col(self.col_name, "modified_z_score")
    df = self._m_z_score()
    return df.rows.drop(F.col(m_z_col_name) > self.threshold).cols.drop(m_z_col_name)
def levenshtein_json(df, input_col):
    """
    Output the levenshtein distance in json format
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)
    # df.table()

    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    result = df.select(input_col, F.col(fingerprint_col).alias(temp_col_1)).distinct()

    df = df.select(input_col,
                   F.col(fingerprint_col).alias(temp_col_1),
                   F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combinations between the strings to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # if Optimus.cache:
    #     df = df.cache()

    # Select only the string with the shortest path
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_r = "TEMP_R"

    df_r = (df.rows.drop(F.col(distance_col) == 0)
            .groupby(temp_col_1)
            .agg(F.min(distance_col).alias(distance_r_col))
            .cols.rename(temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists
    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2)))

    kv_dict = {}
    for row in result.collect():
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = _row[0]

    kv_result_df = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        kv_result_df[_row[0]] = _row[1]

    result = {}
    for k, v in kv_result_df.items():
        a = result[kv_dict[k]] = []
        for iv in v:
            a.append(kv_dict[iv])

    return result