def levenshtein_matrix(df, input_col):
    """
    Build the table of Levenshtein distances between every pair of fingerprints of a column.

    :param df: Spark Dataframe
    :param input_col: Name of the column that is fingerprinted and compared
    :return: Dataframe with the two fingerprint columns (``<input_col>_LEVENSHTEIN_1`` and
        ``<input_col>_LEVENSHTEIN_2``) and the distance column named by
        ``name_col(input_col, LEVENSHTEIN_DISTANCE)``
    """
    fingerprinted = keycollision.fingerprint(df, input_col)

    source_name = name_col(input_col, FINGERPRINT_COL)
    distance_name = name_col(input_col, LEVENSHTEIN_DISTANCE)
    left_name = input_col + "_LEVENSHTEIN_1"
    right_name = input_col + "_LEVENSHTEIN_2"

    # Expose the fingerprint under two aliases so each side of the cross join
    # carries its own column name.
    aliased = fingerprinted.select(
        F.col(source_name).alias(left_name),
        F.col(source_name).alias(right_name),
    ).distinct()

    # Every fingerprint against every fingerprint, then the distance of each pair.
    left_side = aliased.select(left_name)
    right_side = aliased.select(right_name)
    matrix = left_side.crossJoin(right_side)
    matrix = matrix.withColumn(
        distance_name,
        F.levenshtein(F.col(left_name), F.col(right_name)),
    )

    # Caching is opt-in through the Optimus.cache flag.
    return matrix.cache() if Optimus.cache else matrix
def levenshtein_cluster(df, col_name):
    """
    Attach to every row produced by ``levenshtein_filter`` the cluster of strings that share
    its fingerprint.

    :param df: Spark Dataframe
    :param col_name: Name of the column to cluster
    :return: Dataframe resulting from the left join of the filter output with the per-fingerprint
        aggregates ("cluster", "cluster_size", "recommended", "count"); the helper columns
        ``_FINGERPRINT``, ``_FROM``, ``_TO`` and ``_LEVENSHTEIN_DISTANCE`` are dropped
    """
    fingerprint_name = col_name + "_FINGERPRINT"
    from_name = col_name + "_FROM"

    # Work on the distinct values (with their frequency) so the fingerprint
    # is not computed over the whole data set.
    value_counts = df.select(col_name).groupby(col_name).agg(
        F.count(col_name).alias("count"))
    fingerprinted = keycollision.fingerprint(value_counts, col_name)

    # One row per fingerprint: members, member count, first member and total frequency.
    aggregations = [
        F.collect_list(col_name).alias("cluster"),
        F.size(F.collect_list(col_name)).alias("cluster_size"),
        F.first(col_name).alias("recommended"),
        F.sum("count").alias("count"),
    ]
    clusters = fingerprinted.groupby(fingerprint_name).agg(*aggregations)
    clusters = clusters.repartition(1)

    # Nearest-string pairs; the output is expected to carry the _FROM / _TO /
    # _LEVENSHTEIN_DISTANCE columns referenced below.
    nearest = levenshtein_filter(fingerprinted, col_name).repartition(1)

    joined = nearest.join(
        clusters,
        (nearest[from_name] == clusters[fingerprint_name]),
        how="left",
    )
    joined = joined.cols.drop(fingerprint_name)
    joined = joined.cols.drop(
        [from_name, col_name + "_TO", col_name + "_LEVENSHTEIN_DISTANCE"])
    return joined
def levenshtein_matrix(df, col_name):
    """
    Build the table of Levenshtein distances between every pair of distinct fingerprints
    of a column.

    :param df: Spark Dataframe
    :param col_name: Name of the column that is fingerprinted and compared
    :return: Dataframe with ``<col_name>_LEVENSHTEIN_1``, ``<col_name>_LEVENSHTEIN_2`` and
        ``<col_name>_LEVENSHTEIN_DISTANCE``
    """
    fingerprinted = keycollision.fingerprint(df, col_name)

    fingerprint_name = col_name + "_FINGERPRINT"
    distance_name = col_name + "_LEVENSHTEIN_DISTANCE"
    left_name = col_name + "_LEVENSHTEIN_1"
    right_name = col_name + "_LEVENSHTEIN_2"

    # Keep each fingerprint once, then expose it under two aliases so each
    # side of the cross join has its own column name.
    unique_fingerprints = fingerprinted.select(fingerprint_name).distinct()
    aliased = unique_fingerprints.select(
        F.col(fingerprint_name).alias(left_name),
        F.col(fingerprint_name).alias(right_name),
    )

    # Every fingerprint against every fingerprint, then the distance of each pair.
    combinations = aliased.select(left_name).crossJoin(aliased.select(right_name))
    return combinations.withColumn(
        distance_name,
        F.levenshtein(F.col(left_name), F.col(right_name)),
    )
def levenshtein_cluster(df, input_col):
    """
    Attach to every row produced by ``levenshtein_filter`` the cluster of strings that share
    its fingerprint.

    :param df: Spark Dataframe
    :param input_col: Name of the column to cluster
    :return: Dataframe resulting from the left join of the filter output with the per-fingerprint
        aggregates (cluster, cluster size, recommended value and count, named through
        ``name_col``); the fingerprint, ``_FROM``, ``_TO`` and ``_LEVENSHTEIN_DISTANCE``
        columns are dropped
    """
    # Work on the distinct values (with their frequency) so the fingerprint
    # is not computed over the whole data set.
    value_counts = df.select(input_col).groupby(input_col).agg(
        F.count(input_col).alias("count"))
    fingerprinted = keycollision.fingerprint(value_counts, input_col)

    count_name = name_col(input_col, COUNT_COL)
    cluster_name = name_col(input_col, CLUSTER_COL)
    recommended_name = name_col(input_col, RECOMMENDED_COL)
    cluster_size_name = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_name = name_col(input_col, FINGERPRINT_COL)
    from_name = input_col + "_FROM"

    # One row per fingerprint: members, member count, first member and total frequency.
    aggregations = [
        F.collect_list(input_col).alias(cluster_name),
        F.size(F.collect_list(input_col)).alias(cluster_size_name),
        F.first(input_col).alias(recommended_name),
        F.sum("count").alias(count_name),
    ]
    clusters = fingerprinted.groupby(fingerprint_name).agg(*aggregations)
    clusters = clusters.repartition(1)

    # Nearest-string pairs; the output is expected to carry the _FROM / _TO /
    # _LEVENSHTEIN_DISTANCE columns referenced below.
    nearest = levenshtein_filter(fingerprinted, input_col).repartition(1)

    joined = nearest.join(
        clusters,
        (nearest[from_name] == clusters[fingerprint_name]),
        how="left",
    )
    joined = joined.cols.drop(fingerprint_name)
    # NOTE(review): the distance column is addressed with a hard-coded suffix here while the
    # other names go through name_col(); confirm it matches what levenshtein_filter emits.
    joined = joined.cols.drop(
        [from_name, input_col + "_TO", input_col + "_LEVENSHTEIN_DISTANCE"])
    return joined
def test_fingerprint():
    """
    Check ``keyCol.fingerprint`` on the 'STATE' column against a fixed expected dataframe.

    The expected dataframe has the source columns (all nullable strings) plus the
    'STATE***FINGERPRINT' column. Its ten rows are identical except for the
    ITEMNMBR, ITMSHNAM, NOUNI and PROTO fields, so they are generated from one set
    of shared values; every column not listed is None.
    """
    actual_df = keyCol.fingerprint(source_df, 'STATE')

    column_names = [
        'LOCNCODE', 'LOCNDSCR', 'ADDRESS1', 'ADDRESS2', 'ADDRESS3', 'CITY', 'STATE', 'ZIPCODE',
        'COUNTRY', 'Location_Segment', 'PAQ', 'TIPUNI', 'Tipo_unidad', 'ITEMNMBR', 'ITMSHNAM',
        'MZ', 'LT', 'EDIF', 'NIVEL', 'NOUNI', 'CONDO', 'REGIMEN', 'ETAPA', 'PROTO', 'ITEMDESC',
        'NIVELES', 'COCHERA', 'RECAM', 'ALCOB', 'BANOS', 'Num_Balcon', 'SALA', 'COMEDOR',
        'COCINA', 'Cuarto_Lavado', 'Cuarto_Servicio', 'OTROX', 'OTROX1', 'SupCons', 'PATIOSERV',
        'TERRAZA', 'BALCON', 'AZOTEA', 'Otros', 'AREATOT', 'FRENTE', 'Sup_Terreno', 'EXCEDENTE',
        'OTRO1', 'OTRO2', 'TAMANO', 'UBICAVER', 'UBICAHORI', 'QTYONHND_', 'QTYSOLD', 'INACTIVE',
        'UOMPRICE', 'MONTOAPA', 'PAGINI', 'ENGANCHE', 'FECHESCRIPRO', 'FECHAENTREGA',
        'FECHASALIDAVENTAS', 'LIBERADO_NOLIBERADO', 'ACTIVO_INACTIVO', 'Estatus1Vivienda',
        'Estatus2Vivienda', 'CUSTNMBR', 'Nombre_Completo', 'cNombre', 'cApellidoPaterno',
        'cApellidoMaterno', 'cRfc', 'cCurp', 'fkIdGradoInteres', 'cSexo', 'cEmail',
        'cTelefonoCasa', 'cTelefonoCelular', 'cTelefonoTrabajo', 'cNumeroSeguroSocial',
        'dFechaNacimiento', 'cEstadoCivil', 'cRegimenConyugal', 'cNacionalidad',
        'cLugarNacimiento', 'cRecomendadoPor', 'fkIdMedio', 'cMedioContacto', 'cCalle',
        'cNumeroExterior', 'cNumeroInterior', 'cColonia', 'cMunicipio', 'cEstado', 'cPais',
        'cCodigoPostal', 'nTiempoResidencia', 'cComentario', 'cNumeroIdentificacion',
        'cTipoIdentificación', 'REFERENCIA', 'FACTURA', 'NOTACR', 'Precio_cierre',
        'Precio_cierre_Tot', 'Aumento_al_Contrato', 'Condonacón', 'Precio_Escritura_Total',
        'Precio_Dev', 'Precio_Dev_Total', 'Notarios_Proyectados', 'Gatos_A_terceros',
        'Depositos', 'Saldo', 'dFechaCreacion', 'dFechaModificacion', 'FECHA_Cotizado',
        'FECHA_SolApartado', 'FECHA_AutApartado', 'Vigencia_Apartado',
        'FechaVencimientoApartado', 'FECHA_SolDictamen', 'FECHA_ProcDictamen',
        'FECHA_DictaminadoLlamada', 'FECHA_DictaminadoFirma', 'FECHA_Dictaminado',
        'FECHA_Rechazado', 'FECHA_EscrituraAvaluo', 'FECHA_EscrituraFolio', 'FolioEscsritura',
        'FECHA_EscrituraReal', 'FECHA_Cancelado', 'FECHA_Liberado', 'FECHA_Entregado',
        'MotivoCancelacion', 'STATE***FINGERPRINT',
    ]
    schema = [(column, StringType(), True) for column in column_names]

    # Non-null values that are the same in every expected row.
    shared_values = {
        'LOCNCODE': 'ALV',
        'LOCNDSCR': 'Altos Lindavista',
        'ADDRESS1': 'Guanajuato # 85',
        'ADDRESS3': 'San Bartolo Atepehuacan',
        'CITY': 'Gustavo A. Madero',
        'STATE': 'Distrito Federal',
        'ZIPCODE': '07730',
        'COUNTRY': 'Mexico',
        'Location_Segment': '0531',
        'TIPUNI': '2',
        'Tipo_unidad': 'ESTACIONAMIENTO',
        'REGIMEN': '0',
        'ETAPA': '0',
        'ITEMDESC': 'Cajon virtual',
        'SupCons': '2.2',
        'FRENTE': '2.4',
        'EXCEDENTE': '0',
        'TAMANO': 'Chico',
        'UBICAVER': 'Cajon virtual',
        'UBICAHORI': 'Cajon virtual',
        'QTYONHND_': '0',
        'QTYSOLD': '0',
        'INACTIVE': '1',
        'UOMPRICE': '0',
        'LIBERADO_NOLIBERADO': 'NO LIBERADO',
        'ACTIVO_INACTIVO': 'INACTIVO',
        'Estatus1Vivienda': 'DISPONIBLE',
        'Estatus2Vivienda': '000-DISPONIBLE',
        'Precio_cierre': '.00000',
        'Precio_cierre_Tot': '.00000',
        'Aumento_al_Contrato': '.00000',
        'Condonacón': '.00000',
        'Precio_Escritura_Total': '.00000',
        'Precio_Dev': '.00000',
        'Precio_Dev_Total': '.00000',
        'Notarios_Proyectados': '.00000',
        'Gatos_A_terceros': '.00000',
        'Depositos': '.00000',
        'Saldo': '.00000',
        'STATE***FINGERPRINT': 'distritofederal',
    }

    # (ITEMNMBR, ITMSHNAM, NOUNI, PROTO) for each expected row, in row order.
    per_row_values = [
        ('ALVV008', 'ALVCDEY0080', '008', 'EST CDEY'),
        ('ALVV021', 'ALVCDEY0690', '069', 'EST CDEY'),
        ('ALVV022', 'ALVCDEY0710', '071', 'EST CDEY'),
        ('ALVV027', 'ALVCDEY0810', '081', 'EST CDEY'),
        ('ALVV032', 'ALVCEEY0090', '009', 'EST CEEY'),
        ('ALVV035', 'ALVCEEY0150', '015', 'EST CEEY'),
        ('ALVV009', 'ALVCDEY0100', '010', 'EST CDEY'),
        ('ALVV012', 'ALVCDEY0160', '016', 'EST CDEY'),
        ('ALVV019', 'ALVCDEY0650', '065', 'EST CDEY'),
        ('ALVV044', 'ALVCUEY0340', '034', 'EST CUEY'),
    ]

    expected_rows = []
    for item_number, short_name, unit_number, prototype in per_row_values:
        # Start from an all-None row, then fill the shared and the row-specific fields.
        values = dict.fromkeys(column_names)
        values.update(shared_values)
        values['ITEMNMBR'] = item_number
        values['ITMSHNAM'] = short_name
        values['NOUNI'] = unit_number
        values['PROTO'] = prototype
        expected_rows.append(tuple(values[column] for column in column_names))

    expected_df = op.create.df(schema, expected_rows)
    assert (expected_df.collect() == actual_df.collect())
def levenshtein_json(df, input_col):
    """
    Return, for the values of a column, the values that are nearest by Levenshtein distance.

    Values are compared through their fingerprint (``keycollision.fingerprint``). For every
    fingerprint, the fingerprints at the smallest non-zero distance are collected and then
    translated back to strings of ``input_col``.

    :param df: Spark Dataframe
    :param input_col: Column to be processed
    :return: dict mapping a string of ``input_col`` to the list of its nearest strings
    """
    df = keycollision.fingerprint(df, input_col)

    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"
    temp_r = "TEMP_R"

    # (original string, fingerprint) pairs, used at the end to translate
    # fingerprints back to strings.
    result = df.select(input_col, F.col(fingerprint_col).alias(temp_col_1)).distinct()

    # Deduplicate on the fingerprint alone. Keeping input_col in the distinct() would
    # leave one row per original string, so strings sharing a fingerprint would repeat
    # that fingerprint on both sides of the cross join and show up several times in
    # the collected lists.
    df = df.select(
        F.col(fingerprint_col).alias(temp_col_1),
        F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combinations between the strings to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # Smallest non-zero distance per fingerprint (distance 0 is the string itself)
    df_r = (df.rows.drop(F.col(distance_col) == 0)
            .groupby(temp_col_1)
            .agg(F.min(distance_col).alias(distance_r_col))
            .cols.rename(temp_col_1, temp_r)).repartition(1)

    # Keep only the pairs that sit at that smallest distance
    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists
    df = df.groupby(temp_col_1).agg(F.collect_list(temp_col_2))

    # fingerprint -> original string (when several strings share a fingerprint the
    # last collected one wins)
    kv_dict = {row[1]: row[0] for row in result.collect()}

    # Translate both the key and the members of every cluster back to strings
    return {
        kv_dict[row[0]]: [kv_dict[fingerprint] for fingerprint in row[1]]
        for row in df.collect()
    }
def levenshtein_cluster(df, input_col, threshold: int = None, output: str = "dict"):
    """
    Group the strings of a column with the strings nearest to them by Levenshtein distance
    (computed over their fingerprints) and report how often each one occurs.

    :param df: Spark Dataframe
    :param input_col: Column to be processed
    :param threshold: When given, pairs whose distance is 0 or greater than this number are
        discarded before the nearest distance is selected
    :param output: "dict" or "json"; with "json" the result is passed through ``dump_json``
    :return: Mapping of a string to ``{"similar": {string: occurrences}, "count": number of
        similar strings, "sum": total occurrences}``
    """
    fingerprinted = keycollision.fingerprint(df, input_col)

    # Column names
    fingerprint_name = name_col(input_col, FINGERPRINT_COL)
    distance_name = name_col(input_col, LEVENSHTEIN_DISTANCE)
    left_name = input_col + "_LEVENSHTEIN_1"
    right_name = input_col + "_LEVENSHTEIN_2"
    nearest_name = input_col + "_LEVENSHTEIN_DISTANCE_R"
    renamed_left = "TEMP_R"
    count_name = "count"

    # Per original string: the string, its fingerprint and its number of occurrences.
    per_string = fingerprinted.select(input_col, fingerprint_name).groupby(input_col).agg(
        F.first(input_col).alias(left_name),
        F.first(fingerprint_name).alias(right_name),
        F.count(input_col).alias(count_name))
    fingerprint_count = per_string.select(left_name, right_name, count_name).collect()

    # Fingerprint exposed under two aliases, ready for the cross join.
    aliased = fingerprinted.select(
        input_col,
        F.col(fingerprint_name).alias(left_name),
        F.col(fingerprint_name).alias(right_name)).distinct()

    # All the combinations between fingerprints, with their distance.
    pairs = aliased.select(left_name).crossJoin(aliased.select(right_name))
    pairs = pairs.withColumn(
        distance_name, F.levenshtein(F.col(left_name), F.col(right_name)))

    # Rows to discard before looking for the nearest distance.
    is_zero = F.col(distance_name) == 0
    if threshold is not None:
        discard = is_zero | (F.col(distance_name) > threshold)
    else:
        discard = (is_zero & (F.col(left_name) != F.col(right_name)))

    # Nearest distance per fingerprint. The replace call is handed the value 0 and None
    # for the distance column before the minimum is taken.
    candidates = pairs.rows.drop(discard)
    candidates = candidates.cols.replace(distance_name, 0, None, search_by="numeric")
    nearest = candidates.groupby(left_name).agg(F.min(distance_name).alias(nearest_name))
    nearest = nearest.cols.rename(left_name, renamed_left).repartition(1)

    # Keep only the pairs that sit at the nearest distance.
    on_nearest = ((nearest[renamed_left] == pairs[left_name])
                  & (nearest[nearest_name] == pairs[distance_name]))
    pairs = pairs.join(nearest, on_nearest)
    pairs = pairs.select(left_name, distance_name, right_name).repartition(1)

    # Create the clusters/lists
    pairs = pairs.groupby(left_name).agg(
        F.collect_list(right_name), F.count(right_name))

    # fingerprint -> (original string, occurrences); later strings overwrite earlier
    # ones that share the same fingerprint.
    by_fingerprint = {}
    for record in fingerprint_count:
        fields = tuple(record.asDict().values())
        by_fingerprint[fields[1]] = (fields[0], fields[2])

    # Translate every cluster back to original strings: the similar ones first,
    # then the string the cluster belongs to.
    clusters = {}
    for record in pairs.collect():
        fields = tuple(record.asDict().values())
        members = {}
        for fingerprint in fields[1]:
            text, occurrences = by_fingerprint[fingerprint]
            members[text] = occurrences
        text, occurrences = by_fingerprint[fields[0]]
        members[text] = occurrences
        clusters[text] = members

    # Calculate count and sum
    result = {
        text: {"similar": members, "count": len(members), "sum": sum(members.values())}
        for text, members in clusters.items()
    }

    if output == "json":
        result = dump_json(result)
    return result