def n_gram_fingerprint(df, input_cols, n_size=2):
    """
    Calculate the n-gram fingerprint for a string column
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: Size of the n-gram
    :return: Dataframe with the n-gram fingerprint columns
    """

    def remote_white_spaces_remove_sort_join(value, args):
        # remove white spaces
        value = [x.replace(" ", "") for x in value]

        # sort and remove duplicates
        value = sorted(set(value))

        # join the tokens back together
        value = "".join(value)

        return value

    input_cols = parse_columns(df, input_cols)

    for input_col in input_cols:
        ngram_col = name_col(input_col, NGRAM_COL)
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        df = (df
              .cols.copy(input_col, ngram_col)
              .cols.lower(ngram_col)
              .cols.remove_white_spaces(ngram_col)
              .cols.remove_special_chars(ngram_col)
              .cols.remove_accents(ngram_col)
              # To create n-grams we need an Array type column
              .cols.nest(input_cols=ngram_col, output_col=ngram_col, shape='array'))

        if Optimus.cache:
            df = df.cache()

        n_gram = NGram(n=n_size, inputCol=ngram_col, outputCol=ngram_fingerprint_col)
        df = n_gram.transform(df)
        df = df.cols.apply(ngram_fingerprint_col, remote_white_spaces_remove_sort_join, "string")

    return df
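# Usage sketch (illustrative, not part of the library): assumes an Optimus 2.x
# session where op.create.df builds a Spark DataFrame from (name, type, nullable)
# tuples; the column "name" and the sample rows are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("name", "string", True)],
    [("John Doe",), ("john doe ",), ("Jhon Doe",)])

# Near-duplicate strings end up sharing the same n-gram fingerprint column
df_fp = n_gram_fingerprint(df, "name", n_size=2)
df_fp.table()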
def info(self):
    self.tmp_col = name_col(self.col_name, "z_score")
    df = self.z_score()
    max_z_score = df.rows.select(F.col(self.tmp_col) > self.threshold).cols.max(self.tmp_col)

    return {"count_outliers": self.count(),
            "count_non_outliers": self.non_outliers_count(),
            "max_z_score": max_z_score}
def info(self):
    m_z_col_name = name_col(self.col_name, "modified_z_score")
    df = self._m_z_score()
    max_m_z_score = df.rows.select(F.col(m_z_col_name) > self.threshold).cols.max(m_z_col_name)

    return {"count_outliers": self.count(),
            "count_non_outliers": self.non_outliers_count(),
            "max_m_z_score": max_m_z_score}
def drop(self):
    col_name = self.col_name
    z_col_name = name_col(col_name, "z_score")
    threshold = self.threshold

    return self.df.cols.z_score(col_name, z_col_name) \
        .rows.drop(F.col(z_col_name) > threshold) \
        .cols.drop(z_col_name)
def _m_z_score(self):
    df = self.df
    col_name = self.col_name

    mad = df.cols.mad(col_name, self.relative_error, True)
    m_z_col_name = name_col(col_name, "modified_z_score")

    return df.withColumn(m_z_col_name,
                         F.abs(0.6745 * (F.col(col_name) - mad["median"]) / mad["mad"]))
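# Worked example of the modified z-score computed above (Iglewicz-Hoaglin):
#     M_i = 0.6745 * (x_i - median) / MAD,  with MAD = median(|x_i - median|)
# The numbers below are illustrative; plain Python is used instead of Spark.
import statistics

values = [1.0, 2.0, 3.0, 4.0, 100.0]
median = statistics.median(values)                           # 3.0
mad = statistics.median([abs(v - median) for v in values])   # 1.0
m_z = [0.6745 * (v - median) / mad for v in values]
# The outlier 100.0 scores ~65.4, far above the commonly used 3.5 threshold.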
def correlation(self, input_cols, method="pearson", output="json"):
    """
    Calculate the correlation between columns. It will try to cast a column to float where necessary and impute
    missing values
    :param self:
    :param input_cols: Columns to be processed
    :param method: Method used to calculate the correlation
    :param output: array or json
    :return:
    """
    df = self

    # Values in columns can not be null. Warn user
    input_cols = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

    # Try to parse the selected columns to float and create a vector
    # print(self.cols.count_na(input_cols))

    # Input is not a vector, transform it to a vector
    output_col = name_col(input_cols, "correlation")
    if len(input_cols) > 1:
        for col_name in input_cols:
            df = df.cols.cast(col_name, "float")
            logger.print("Casting {col_name} to float...".format(col_name=col_name))

        df = df.cols.nest(input_cols, "vector", output_cols=output_col)

    corr = Correlation.corr(df, output_col, method).head()[0].toArray()

    if output == "array":
        result = corr

    elif output == "json":
        # Parse result to json
        col_pair = []
        for col_name in input_cols:
            for col_name_2 in input_cols:
                col_pair.append({"between": col_name, "an": col_name_2})

        # flat array
        values = corr.flatten('F').tolist()

        result = []
        for n, v in zip(col_pair, values):
            # Remove correlation between the same column
            if n["between"] != n["an"]:
                n["value"] = v
                result.append(n)

        result = sorted(result, key=lambda k: k['value'], reverse=True)

    return {"cols": input_cols, "data": result}
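# Usage sketch (illustrative): correlation() is written to be monkey-patched as a
# DataFrame method (note the `self` parameter), but it can also be called directly
# with a DataFrame, as below. Column names and values are made up; the helpers it
# uses (parse_columns, Correlation, logger) are assumed importable from Optimus.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("x", "float", True), ("y", "float", True)],
    [(1.0, 2.0), (2.0, 4.1), (3.0, 5.9), (4.0, 8.2)])

# Pearson correlation between every pair of distinct numeric columns, sorted by value
result = correlation(df, ["x", "y"], method="pearson", output="json")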
def info(self):
    col_name = self.col_name
    z_col_name = name_col(col_name, "z_score")

    max_z_score = self.df.cols.z_score(col_name, z_col_name) \
        .cols.max(z_col_name)

    return {
        "count_outliers": self.count(),
        "count_non_outliers": self.non_outliers_count(),
        "max_z_score": max_z_score
    }
def info(self, output="dict"):
    self.tmp_col = name_col(self.col_name, "z_score")
    # df = self.z_score()
    df = self.df
    max_z_score = df.cols.max()

    return {
        "count_outliers": self.count(),
        "count_non_outliers": self.non_outliers_count(),
        "max_z_score": max_z_score
    }
def __init__(self, df, col_name, prefix):
    """
    :param df: Spark Dataframe
    :param col_name: column name
    :param prefix: prefix used to name the temporary column
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    self.df = df
    self.col_name = one_list_to_val(parse_columns(df, col_name))
    self.tmp_col = name_col(self.col_name, prefix)
def info(self, output: str = "dict"):
    m_z_col_name = name_col(self.col_name, "modified_z_score")
    df = self.df

    z_score = self.z_score
    max_m_z_score = df.rows.select(z_score > self.threshold).cols.max()

    return {
        "count_outliers": self.count(),
        "count_non_outliers": self.non_outliers_count(),
        "max_m_z_score": format_dict(max_m_z_score)
    }
def levenshtein_cluster(df, input_col):
    """
    Return a dataframe with the cluster of strings related to each string
    :param df:
    :param input_col:
    :return:
    """
    # Prepare a group so we don't need to apply the fingerprint to the whole data set
    df = df.select(input_col).groupby(input_col).agg(F.count(input_col).alias("count"))
    df = keycollision.fingerprint(df, input_col)

    count_col = name_col(input_col, COUNT_COL)
    cluster_col = name_col(input_col, CLUSTER_COL)
    recommended_col = name_col(input_col, RECOMMENDED_COL)
    cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)

    df_t = df.groupby(fingerprint_col).agg(
        F.collect_list(input_col).alias(cluster_col),
        F.size(F.collect_list(input_col)).alias(cluster_size_col),
        F.first(input_col).alias(recommended_col),
        F.sum("count").alias(count_col)).repartition(1)

    if Optimus.cache:
        df_t = df_t.cache()

    # Filter nearest string
    df_l = levenshtein_filter(df, input_col).repartition(1)

    if Optimus.cache:
        df_l = df_l.cache()

    # Create Cluster
    df_l = df_l.join(df_t, (df_l[input_col + "_FROM"] == df_t[fingerprint_col]), how="left") \
        .cols.drop(fingerprint_col) \
        .cols.drop([input_col + "_FROM", input_col + "_TO", input_col + "_LEVENSHTEIN_DISTANCE"])

    return df_l
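# Usage sketch (illustrative): clusters near-duplicate values of a column using the
# fingerprint + Levenshtein pipeline above. Assumes an Optimus 2.x session and that
# keycollision and levenshtein_filter are importable; sample data is made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("city", "string", True)],
    [("New York",), ("new york",), ("NewYork",), ("Chicago",)])

# One row per recommended value, with the related strings, cluster size and counts
clusters = levenshtein_cluster(df, "city")
clusters.table()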
def levenshtein_matrix(df, input_col):
    """
    Create columns with all the string combinations and their Levenshtein distance
    :param df:
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)

    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    df = df.select(fingerprint_col).distinct().select(F.col(fingerprint_col).alias(temp_col_1),
                                                      F.col(fingerprint_col).alias(temp_col_2))

    # Create all the combinations between the strings to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    return df
def base_clustering_function(df, input_cols, output, func=None, args=None):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :return:
    """
    # df = self.df
    input_cols = parse_columns(df, input_cols)
    result = {}

    for input_col in input_cols:
        output_col = name_col(input_col, FINGERPRINT_COL)
        count_col = name_col(input_col, COUNT_COL)

        # Instead of applying the fingerprint to the whole data set we group by names
        df = (df.groupBy(input_col).agg(
            F.count(input_col).alias(count_col)).select(count_col, input_col)).h_repartition(1)

        # Calculate the fingerprint
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        cluster_sum_col = name_col(input_col, CLUSTER_SUM_COL)
        cluster_count_col = name_col(input_col, CLUSTER_COUNT_COL)

        # Apply the clustering function
        df = func(df, *args)

        df = df.withColumn(cluster_col, F.create_map([input_col, count_col]))

        df = df.groupBy(output_col).agg(
            F.max(F.struct(F.col(count_col), F.col(input_col))).alias(recommended_col),
            F.collect_list(cluster_col).alias(cluster_col),
            F.count(output_col).alias(cluster_count_col),
            F.sum(count_col).alias(cluster_sum_col))

        df = df.select(recommended_col + "." + input_col, cluster_col, cluster_count_col, cluster_sum_col)

        for row in df.collect():
            _row = list(row.asDict().values())
            # Flatten the list of dicts into a single dict
            flatted_dict = {k: v for element in _row[1] for k, v in element.items()}
            result[_row[0]] = {"similar": flatted_dict, "count": _row[2], "sum": _row[3]}

    if output == "json":
        result = dump_json(result)

    return result
def __init__(self, df, col_name, threshold):
    """
    :param df:
    :param col_name:
    :param threshold:
    """
    if not is_numeric(threshold):
        raise TypeError("Numeric expected")

    self.df = df
    self.threshold = threshold
    self.col_name = one_list_to_val(parse_columns(df, col_name))
    self.tmp_col = name_col(col_name, "z_score")
    self.z_score = self.df[self.col_name].cols.z_score()
    # print("self.df_score", self.df_score)
    # print("self.df", self.df)
    super().__init__()
def n_gram_fingerprint(df, input_cols, n_size=2):
    """
    Calculate the n-gram fingerprint for a string column
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: Size of the n-gram
    :return:
    """

    def calculate_ngrams(value, args):
        # split the string into n-grams
        ngram = list(ngrams(value, n_size))

        # sort and remove duplicates
        ngram = sorted(set(ngram))

        _result = ""
        for item in ngram:
            for i in item:
                _result = _result + i

        # join the tokens back together
        _result = "".join(_result)

        return _result

    input_cols = parse_columns(df, input_cols)

    for input_col in input_cols:
        ngram_fingerprint_col = name_col(input_col, FINGERPRINT_COL)
        # ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        df = (df
              .cols.copy(input_col, ngram_fingerprint_col)
              .cols.lower(ngram_fingerprint_col)
              .cols.remove_white_spaces(ngram_fingerprint_col)
              .cols.apply(ngram_fingerprint_col, calculate_ngrams, "string",
                          output_cols=ngram_fingerprint_col)
              .cols.remove_special_chars(ngram_fingerprint_col)
              .cols.normalize_chars(ngram_fingerprint_col)
              )

    return df
def __init__(self, df, col_name, threshold):
    """
    :param df:
    :param col_name:
    :param threshold:
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_numeric(threshold):
        raise TypeError("Numeric expected")

    self.df = df
    self.threshold = threshold
    self.col_name = one_list_to_val(parse_columns(df, col_name))
    self.tmp_col = name_col(col_name, "z_score")
    self.df_score = self.z_score()
    super().__init__(self.df_score, col_name, "z_score")
def one_hot_encoder(df, input_cols, **kargs):
    """
    Maps a column of label indices to a column of binary vectors, with at most a single one-value.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be encoded
    :return: Dataframe with encoded columns
    """
    input_cols = parse_columns(df, input_cols)

    encode = [OneHotEncoder(inputCol=column, outputCol=name_col(column, "_encoded"), **kargs)
              for column in list(set(input_cols))]

    pipeline = Pipeline(stages=encode)
    df = pipeline.fit(df).transform(df)

    return df
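# Usage sketch (illustrative): OneHotEncoder expects a column of label indices
# (non-negative doubles), so a pre-indexed column is used here. Assumes an Optimus
# 2.x session; column name and values are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("color_index", "double", True)],
    [(0.0,), (1.0,), (2.0,), (1.0,)])

# Adds a sparse-vector column with the one-hot encoding of color_index
df_encoded = one_hot_encoder(df, "color_index")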
def vector_assembler(df, input_cols, output_col=None):
    """
    Combines a given list of columns into a single vector column.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be assembled
    :param output_col: Column where the output is going to be saved
    :return: Dataframe with the assembled column
    """
    input_cols = parse_columns(df, input_cols)

    if output_col is None:
        output_col = name_col(input_cols, "vector_assembler")

    assembler = [VectorAssembler(inputCols=input_cols, outputCol=output_col)]

    pipeline = Pipeline(stages=assembler)
    df = pipeline.fit(df).transform(df)

    return df
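# Usage sketch (illustrative): combines two numeric columns into a single vector
# column, naming it explicitly through output_col. Assumes an Optimus 2.x session;
# column names are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("x", "float", True), ("y", "float", True)],
    [(1.0, 2.0), (3.0, 4.0)])

df_va = vector_assembler(df, ["x", "y"], output_col="features")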
def levenshtein_filter(df, input_col):
    """
    Get the nearest string for each string
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    # TODO: must filter using an expression
    func = F.min

    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"
    temp_r = "TEMP_R"

    df = levenshtein_matrix(df, input_col)

    # Get the closest word
    df_r = (df.rows.drop(F.col(distance_col) == 0)
            .groupby(temp_col_1)
            .agg(func(distance_col).alias(distance_r_col))
            .cols.rename(temp_col_1, temp_r))

    if Optimus.cache:
        df_r = df_r.cache()

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))).select(
        temp_col_1, distance_col, temp_col_2)

    if Optimus.cache:
        df = df.cache()

    df = df \
        .cols.rename([(temp_col_1, input_col + "_FROM"), (temp_col_2, input_col + "_TO")])

    return df
def index_to_string(df, input_cols, output_col=None, **kargs):
    """
    Maps a column of indices back to a new column of corresponding string values. The index-string mapping is
    either from the ML attributes of the input column, or from user-supplied labels (which take precedence over
    ML attributes).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be indexed
    :param output_col: Column where the output is going to be saved
    :return: Dataframe with indexed columns
    """
    input_cols = parse_columns(df, input_cols)

    if output_col is None:
        output_col = name_col(input_cols, "index_to_string")

    indexers = [IndexToString(inputCol=column, outputCol=output_col, **kargs)
                for column in list(set(input_cols))]

    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)

    return df
def string_to_index(df, input_cols, **kargs):
    """
    Maps a string column of labels to an ML column of label indices. If the input column is numeric, we cast it
    to string and index the string values.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be indexed
    :return: Dataframe with indexed columns
    """
    input_cols = parse_columns(df, input_cols)

    indexers = [StringIndexer(inputCol=input_col, outputCol=name_col(input_col, "index"), **kargs).fit(df)
                for input_col in list(set(input_cols))]

    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)

    return df
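# Usage sketch (illustrative): maps a string label column to numeric indices; the
# output column is named by name_col(<col>, "index"). Assumes an Optimus 2.x
# session; column name and values are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("species", "string", True)],
    [("setosa",), ("versicolor",), ("setosa",)])

df_indexed = string_to_index(df, "species")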
def h2o_automl(df, label, columns, **kwargs):
    """
    Run H2O AutoML over the dataframe
    :param df: Dataframe to be used
    :param label: Column to predict
    :param columns: Feature columns
    :return: Dataframe with predictions and the fitted model
    """
    H2OContext.getOrCreate(Spark.instance.spark)

    df_sti = string_to_index(df, input_cols=label)
    df_va = vector_assembler(df_sti, input_cols=columns)

    automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                       maxRuntimeSecs=60,  # 1 minute
                       seed=1,
                       maxModels=3,
                       labelCol=name_col(label, "index"),
                       **kwargs)

    model = automl.fit(df_va)
    df_raw = model.transform(df_va)

    df_pred = df_raw.withColumn("prediction",
                                when(df_raw.prediction_output["value"] > 0.5, 1.0).otherwise(0.0))

    return df_pred, model
def normalizer(df, input_cols, output_col=None, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized
    :param output_col: Column where the output is going to be saved
    :param p: p-norm used for normalization
    :return: Dataframe with normalized columns
    """
    # The columns argument must be a string or a list data type
    if not is_(input_cols, (str, list)):
        RaiseIt.type_error(input_cols, ["str", "list"])

    if is_str(input_cols):
        input_cols = [input_cols]

    if is_(input_cols, (float, int)):
        RaiseIt.type_error(input_cols, ["float", "int"])

    # Try to create a vector
    if len(input_cols) > 1:
        df = df.cols.cast(input_cols, "vector")

    if output_col is None:
        output_col = name_col(input_cols, "normalizer")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [Normalizer(inputCol=col_name, outputCol=output_col, p=p)
              for col_name in list(set(input_cols))]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
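# Usage sketch (illustrative): normalizer() expects a Vector column when a single
# input column is given, so the features are assembled first. Assumes an Optimus
# 2.x session; column names and values are made up.
from optimus import Optimus

op = Optimus()

df = op.create.df(
    [("x", "float", True), ("y", "float", True)],
    [(3.0, 4.0), (6.0, 8.0)])

df_va = vector_assembler(df, ["x", "y"], output_col="features")

# Each row's feature vector is scaled to unit L2 norm
df_norm = normalizer(df_va, "features", output_col="features_norm", p=2.0)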
def random_forest(df, columns, input_col, **kargs):
    """
    Runs a random forest classifier for the input DataFrame.
    :param df: Pyspark dataframe to analyze
    :param columns: List of columns to select for prediction
    :param input_col: Column to predict
    :return: DataFrame with random forest and prediction run.
    """
    columns = parse_columns(df, columns)

    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats)

    model = RandomForestClassifier(**kargs)
    df = df.cols.rename(name_col(input_col, "index"), "label")

    rf_model = model.fit(df)
    df_model = rf_model.transform(df)

    return df_model, rf_model
def levenshtein_cluster(df, input_col, threshold: int = None, output: str = "dict"):
    """
    Return clusters of strings within a Levenshtein distance threshold, as a dict or json
    :param df: Spark Dataframe
    :param input_col: Column to be processed
    :param threshold: Maximum Levenshtein distance to consider two strings related
    :param output: "dict" or "json"
    :return:
    """
    # Create fingerprint
    df_fingerprint = keycollision.fingerprint(df, input_col)

    # Names
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"
    count = "count"

    # Prepare the columns to calculate the cross join
    fingerprint_count = df_fingerprint.select(input_col, fingerprint_col).groupby(input_col) \
        .agg(F.first(input_col).alias(temp_col_1), F.first(fingerprint_col).alias(temp_col_2),
             F.count(input_col).alias(count)) \
        .select(temp_col_1, temp_col_2, count).collect()

    df = df_fingerprint.select(input_col,
                               F.col(fingerprint_col).alias(temp_col_1),
                               F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combinations between the strings to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # Select only the string with the shortest path
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_r = "TEMP_R"

    if threshold is None:
        where = ((F.col(distance_col) == 0) & (F.col(temp_col_1) != F.col(temp_col_2)))
    else:
        where = (F.col(distance_col) == 0) | (F.col(distance_col) > threshold)

    df_r = (df.rows.drop(where)
            .cols.replace(distance_col, 0, None, search_by="numeric")
            .groupby(temp_col_1)
            .agg(F.min(distance_col).alias(distance_r_col))
            # .cols.rename(distance_col, distance_r_col)
            .cols.rename(temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists
    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2), F.count(temp_col_2)))

    # Replace each fingerprint with its original string and count
    kv_dict = {}
    for row in fingerprint_count:
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = {_row[0]: _row[2]}

    result = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        d = {}
        for i in _row[1]:
            key = list(kv_dict[i].keys())[0]
            value = list(kv_dict[i].values())[0]
            d[key] = value

        key = list(kv_dict[_row[0]].keys())[0]
        value = list(kv_dict[_row[0]].values())[0]
        d.update({key: value})
        result[key] = d

    # Calculate count and sum
    f = {}
    for k, v in result.items():
        _sum = 0
        for x, y in v.items():
            _sum = _sum + y
        f[k] = {"similar": v, "count": len(v), "sum": _sum}

    result = f

    if output == "json":
        result = dump_json(result)

    return result
def drop(self):
    m_z_col_name = name_col(self.col_name, "modified_z_score")
    df = self._m_z_score()
    return df.rows.drop(F.col(m_z_col_name) > self.threshold).cols.drop(m_z_col_name)
def levenshtein_json(df, input_col):
    """
    Output the levenshtein distance in json format
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)
    # df.table()

    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    result = df.select(input_col, F.col(fingerprint_col).alias(temp_col_1)).distinct()

    df = df.select(input_col,
                   F.col(fingerprint_col).alias(temp_col_1),
                   F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combinations between the strings to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # if Optimus.cache:
    #     df = df.cache()

    # Select only the string with the shortest path
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_r = "TEMP_R"

    df_r = (df.rows.drop(F.col(distance_col) == 0)
            .groupby(temp_col_1)
            .agg(F.min(distance_col).alias(distance_r_col))
            .cols.rename(temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists
    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2)))

    kv_dict = {}
    for row in result.collect():
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = _row[0]

    kv_result_df = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        kv_result_df[_row[0]] = _row[1]

    result = {}
    for k, v in kv_result_df.items():
        a = result[kv_dict[k]] = []
        for iv in v:
            a.append(kv_dict[iv])

    return result