from pyspark.ml.feature import CountVectorizer, MinHashLSH, RegexTokenizer
from pyspark.sql.functions import udf


def dedup_min_hash(df, column, id_col, min_distance=0.1):
    """
    Deduplicates a dataset using MinHash on a token-count basis.
    Removes items whose Jaccard distance to another item is smaller than
    min_distance, keeping the item with the lower id of each pair.
    """
    @udf("long")
    def num_nonzeros(v):
        return v.numNonzeros()

    df.cache()
    tokenizer = RegexTokenizer(inputCol=column, outputCol="tokens")
    tokens = tokenizer.transform(df)
    cv = CountVectorizer(inputCol="tokens", outputCol="token_ids")
    vectorizer_model = cv.fit(tokens)
    with_token_ids = vectorizer_model.transform(tokens).drop("tokens", column)
    # MinHashLSH cannot hash empty vectors, so drop rows with no tokens.
    with_token_ids = with_token_ids.where(num_nonzeros(with_token_ids.token_ids) > 0).cache()

    mh = MinHashLSH(inputCol="token_ids", outputCol="hashes", seed=1, numHashTables=10)
    dedup_model = mh.fit(with_token_ids)
    # approxSimilarityJoin keeps pairs whose Jaccard *distance* is below the
    # threshold, so the threshold is min_distance itself (not 1 - min_distance).
    # The id comparison keeps each pair once and drops self-matches.
    joined = dedup_model.approxSimilarityJoin(with_token_ids, with_token_ids,
                                              min_distance, distCol="dist")\
        .filter(f"datasetA.{id_col} < datasetB.{id_col}")

    # Mark the higher id of each near-duplicate pair for removal.
    duplicate_ids = joined.rdd.map(lambda row: row.datasetB[id_col])\
        .distinct()\
        .map(lambda el: [el])\
        .toDF()
    # Keep only rows whose id was not marked as a duplicate.
    return df.join(duplicate_ids, duplicate_ids._1 == df[id_col], "left")\
        .where(duplicate_ids._1.isNull())\
        .drop(duplicate_ids._1)
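# A minimal, hypothetical usage sketch for dedup_min_hash above. The DataFrame,
# column names, and threshold are illustrative assumptions, not from the original.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [(1, "the quick brown fox"),
     (2, "the quick brown fox jumps"),
     (3, "an entirely different sentence")],
    ["doc_id", "text"],
)
# Rows 1 and 2 are near-duplicates (Jaccard distance 0.2 on their token sets),
# so with min_distance=0.5 row 2 should be dropped and rows 1 and 3 kept.
deduped = dedup_min_hash(docs, column="text", id_col="doc_id", min_distance=0.5)
deduped.show()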
def run_minhash_lsh():
    df = util.read_all_json_from_bucket(sql_context, config.S3_BUCKET_BATCH_PREPROCESSED)
    mh = MinHashLSH(inputCol="text_body_vectorized", outputCol="min_hash",
                    numHashTables=config.LSH_NUM_BANDS)

    # Vectorize so we can fit the MinHashLSH model
    htf = HashingTF(inputCol="text_body_stemmed", outputCol="raw_features", numFeatures=1000)
    htf_df = htf.transform(df)
    vectorizer = VectorAssembler(inputCols=["raw_features"], outputCol="text_body_vectorized")
    vdf = vectorizer.transform(htf_df)

    if config.LOG_DEBUG:
        print(colored("[MLLIB BATCH]: Fitting MinHashLSH model...", "green"))
    model = mh.fit(vdf)

    # Compute pairwise LSH similarities for questions within tags
    if config.LOG_DEBUG:
        print(colored(
            "[BATCH]: Fetching questions in same tag, comparing LSH and MinHash, "
            "uploading duplicate candidates back to Redis...", "cyan"))
    find_dup_cands_within_tags(model)
def process_df(df):
    time_seq.append(['start process-df', time.time()])
    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ", inputCol="instruments",
                       outputCol="instruments_tokenized", minTokenLength=1),
        NGram(n=1, inputCol="instruments_tokenized", outputCol="instruments_ngrams"),
        HashingTF(inputCol="instruments_ngrams", outputCol="instruments_vectors"),
        MinHashLSH(inputCol="instruments_vectors", outputCol="instruments_lsh",
                   numHashTables=10)
    ]).fit(df)
    df_hashed = model.transform(df)
    # Keep each pair once; the "!=" clause is implied by "<" but kept from the original.
    df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.5, distCol="distance") \
        .filter("datasetA.filename != datasetB.filename AND datasetA.filename < datasetB.filename") \
        .select(f.col('datasetA.filename').alias('filename_A'),
                f.col('datasetB.filename').alias('filename_B'),
                f.col('distance'))
    time_seq.append(['process-df df_matches', time.time()])
    write_df_to_pgsql(df_matches, 'filepair_similarity_run3')
    time_seq.append(['write pgsql', time.time()])
    print('time_seq', time_seq)
def similarity_matrix(self, df1, df2):
    """Return pairs of similar products. The inputs are DataFrames (despite the
    original rdd1/rdd2 names) with a sparse 'vectors' feature column and a
    'pid' id column."""
    minhash = MinHashLSH(inputCol='vectors', outputCol='LSH')
    model = minhash.fit(df1)
    df1 = model.transform(df1)
    df2 = model.transform(df2)
    # Join pairs with Jaccard distance below 0.8; distCol > 0 drops exact matches.
    output = model.approxSimilarityJoin(df1, df2, threshold=0.8).filter(col('distCol') > 0) \
        .select(col('datasetB.pid').alias('sku_id_seller'),
                col('datasetA.pid').alias('pid'),
                col('distCol').alias('similarity_score'))
    output.show()
    # Three columns: sku_id_seller (seller product id), pid (our database
    # product id), similarity_score (Jaccard distance; lower is more similar).
    return output
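# A minimal, hypothetical usage sketch for similarity_matrix: both inputs need
# a sparse 'vectors' column and a 'pid' id column. The data, the Tokenizer/
# CountVectorizer prep, the `spark` session, and the `matcher` instance of the
# enclosing class are all illustrative assumptions.
from pyspark.ml.feature import CountVectorizer, Tokenizer

catalog = spark.createDataFrame([(1, "red cotton shirt"), (2, "blue denim jeans")],
                                ["pid", "name"])
sellers = spark.createDataFrame([(101, "red shirt"), (102, "blue denim jacket")],
                                ["pid", "name"])
tok = Tokenizer(inputCol="name", outputCol="words")
cv_model = CountVectorizer(inputCol="words", outputCol="vectors").fit(tok.transform(catalog))
catalog_vec = cv_model.transform(tok.transform(catalog))
sellers_vec = cv_model.transform(tok.transform(sellers))
matches = matcher.similarity_matrix(catalog_vec, sellers_vec)  # matcher: hypothetical instance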
def main():
    input_dataset = sys.argv[1]
    output_dir = sys.argv[2]
    start_time = time.time()

    # stackoverflow_df = sqlContext.read.csv("../Datasource/stackOverFlow_ID_Title_SMALL.csv",
    #                                        header=True).toDF('id', 'text')
    stackoverflow_df = sqlContext.read.csv(input_dataset, header=True).toDF('id', 'text')

    model = Pipeline(stages=[
        # pattern="" tokenizes into individual characters for character trigrams
        RegexTokenizer(pattern="", inputCol="text", outputCol="tokens", minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors"),
        MinHashLSH(inputCol="vectors", outputCol="lsh")
        # MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=5)
    ]).fit(stackoverflow_df)

    db_hashed = model.transform(stackoverflow_df)
    # res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed, 0.90).filter("datasetA.id < datasetB.id")
    res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed, 0.70).filter("distCol > 0")
    res.show()

    elapsed_time = time.time() - start_time
    print('Elapsed Time ==> ', elapsed_time)
def training(self, spark, df, df_events):
    logging.warning(" MinHash (Model) called - ")
    ###############################################
    ## Locality Sensitive Hashing Model (Training)
    ###############################################
    try:
        mh = MinHashLSH(inputCol="scaled_features",
                        outputCol="hashes",
                        numHashTables=3)
        model = mh.fit(df)

        # Cache the transformed columns
        # df3_t = model.transform(df3).cache()
        df.registerTempTable("df_tbl")
        df_events.registerTempTable("df_events_tbl")
        df_events_new = spark.sql('''
            select d.*, e.score
            from df_tbl d, df_events_tbl e
            where 1=1
              and e.item_id = d._id
        ''')
        '''
        from pyspark.sql.functions import broadcast
        df_events_new = broadcast(spark.table("df_tbl")).join(spark.table("df_events_tbl"), "_id")
        '''
        df_events_t = model.transform(df_events_new)
        df_final = model.approxSimilarityJoin(df,
                                              df_events_t,
                                              P_MODEL_THRESHOLD,
                                              distCol="JaccardDistance")\
            .selectExpr("datasetA._id as id1",
                        "datasetB._id as id2",
                        "JaccardDistance as similarity_score",
                        "datasetB.score as popularity_score")
        # .filter("datasetA._id != datasetB._id")\
    except Exception as e:
        print("Error in model training logic - " + str(e))
        raise e
    logging.warning(" MinHash (Model) finished... returning - ")
    return df_final
def run_minhash_lsh():
    df = util.read_all_json_from_bucket(sql_context, config.S3_BUCKET_BATCH_PREPROCESSED)
    mh = MinHashLSH(inputCol="text_body_vectorized", outputCol="min_hash",
                    numHashTables=config.LSH_NUM_BANDS)

    # Vectorize so we can fit the MinHashLSH model
    htf = HashingTF(inputCol="text_body_stemmed", outputCol="raw_features", numFeatures=1000)
    htf_df = htf.transform(df)
    vectorizer = VectorAssembler(inputCols=["raw_features"], outputCol="text_body_vectorized")
    vdf = vectorizer.transform(htf_df)

    if config.LOG_DEBUG:
        print(colored("[MLLIB BATCH]: Fitting MinHashLSH model...", "green"))
    model = mh.fit(vdf)
    model.transform(vdf).show()

    # Approximate similarity join between pairwise elements; note that
    # the "jaccard_sim" column holds the Jaccard *distance* despite its name.
    find_tag = udf(lambda x, y: util.common_tag(x, y), StringType())
    if config.LOG_DEBUG:
        print(colored("[MLLIB BATCH]: Computing approximate similarity join...", "green"))
    sim_join = model.approxSimilarityJoin(
        vdf, vdf, config.DUP_QUESTION_MIN_HASH_THRESHOLD, distCol="jaccard_sim"
    ).select(
        col("datasetA.id").alias("q1_id"),
        col("datasetB.id").alias("q2_id"),
        col("datasetA.title").alias("q1_title"),
        col("datasetB.title").alias("q2_title"),
        col("datasetA.text_body_vectorized").alias("q1_text_body"),
        col("datasetB.text_body_vectorized").alias("q2_text_body"),
        find_tag("datasetA.tags", "datasetB.tags").alias("tag"),
        col("jaccard_sim"))

    # Upload LSH similarities to Redis
    sim_join.foreachPartition(store_spark_mllib_sim_redis)
def main():
    potential_clones = sys.argv[1]
    outDir = sys.argv[2]
    start_time = time.time()
    # NOTE: these hard-coded paths override the command-line arguments above;
    # kept from the original, presumably leftover debugging.
    potential_clones = '../Datasource/pc.xml'
    output_csv = 'csvCodes.csv'
    df = convertAndSaveAsCSV(potential_clones, output_csv, True)

    # spark context
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df)
    transformed_spark_df = spark_df.rdd.map(distributedSourceTransform)
    pysparkdf_transformedClones = transformed_spark_df.toDF(
        ['filepath', 'startline', 'endline', 'source'])

    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ", inputCol="source", outputCol="tokens", minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors", numFeatures=262144),
        MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=105)
        # MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=5)
    ]).fit(pysparkdf_transformedClones)

    hashed_clones = model.transform(pysparkdf_transformedClones)
    clone_pairs = model.stages[-1].approxSimilarityJoin(
        hashed_clones, hashed_clones, 0.70).filter("distCol > 0")
    clone_pairs.show()

    elapsed_time = time.time() - start_time
    print('Elapsed Time ==> ', elapsed_time)
def match_names(df_1, df_2):
    pipeline = Pipeline(stages=[
        # pattern="" tokenizes into individual characters for character trigrams
        RegexTokenizer(pattern="", inputCol="name", outputCol="tokens", minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors"),
        MinHashLSH(inputCol="vectors", outputCol="lsh")
    ])
    model = pipeline.fit(df_1)
    stored_hashed = model.transform(df_1)
    landed_hashed = model.transform(df_2)
    # "confidence" is the Jaccard distance column: lower means a closer match.
    matched_df = model.stages[-1].approxSimilarityJoin(
        stored_hashed, landed_hashed, 1.0, "confidence"
    ).select(col("datasetA.name"), col("datasetB.name"), col("confidence"))
    matched_df.show(20, False)
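# A minimal, hypothetical usage sketch for match_names: two single-column
# DataFrames of names (the data below and the `spark` session are illustrative
# assumptions, not from the original code).
df_stored = spark.createDataFrame([("Jon Smith",), ("Ann Lee",)], ["name"])
df_landed = spark.createDataFrame([("John Smith",), ("Anne Leigh",)], ["name"])
match_names(df_stored, df_landed)  # prints name pairs with their Jaccard distance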
def test_min_hash_lsh(self):
    data = self.spark.createDataFrame(
        [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
         (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
         (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)],
        ["id", "features"])
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
    model = mh.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml MinHashLSH',
        [('features', FloatTensorType([None, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data.limit(2))
    data_np = data.limit(2).toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().hashes.apply(
            lambda x: pandas.Series(x).map(lambda y: y.values[0])
        ).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlMinHashLSH")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['hashes'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def vectorizeDF(raw):
    # Build a (data, target) DataFrame from the input and assign row ids.
    # (The original referenced a global `raw_groups` here, shadowing the parameter.)
    raw = spark.createDataFrame(raw, schema=['data', 'target'])
    raw = raw.withColumn('id', monotonically_increasing_id())

    tokenizer = Tokenizer(inputCol='data', outputCol='tokens')
    swremover = StopWordsRemover(inputCol='tokens', outputCol='words')
    cv = CountVectorizer(inputCol='words', outputCol='features', vocabSize=100)
    mh = MinHashLSH(inputCol='features', outputCol='hashes',
                    numHashTables=NUM_HASH_TABLES, seed=5123)
    pipeline = Pipeline(stages=[tokenizer, swremover, cv, mh])
    feat_data = pipeline.fit(dataset=raw).transform(raw)

    # MinHashLSH cannot handle all-zero vectors, so filter them out.
    checkZero = udf(lambda V: V.numNonzeros() > 0, BooleanType())
    feat_data = feat_data.filter(checkZero(col('features')))
    return feat_data
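# A minimal, hypothetical usage sketch for vectorizeDF (assumes `spark` and the
# feature imports from this module are in scope; the data is illustrative).
NUM_HASH_TABLES = 5
raw_groups = [("spark is great for large data", 0),
              ("minhash lsh works well in spark", 1)]
feat_data = vectorizeDF(raw_groups)
feat_data.select("id", "features", "hashes").show(truncate=False)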
from pyspark.ml.feature import MinHashLSH
from pyspark.sql import DataFrame


def jaccard_cross_join(
    input_col: str,
    output_col: str,
    df: DataFrame,
    primary_df: DataFrame,
    secondary_df: DataFrame,
):
    """Fit a Jaccard index model based on all the docs in the corpus. Then take
    a subset of these (the primary docs) and cross join with a different subset
    (the secondary docs) to find any docs that are similar according to the
    minimum similarity specified."""
    hash_col = "hashes"
    min_hash_lsh = MinHashLSH(inputCol=input_col, outputCol=hash_col,
                              seed=12345, numHashTables=3)
    # Fit on the full corpus (df), as the docstring describes; the original
    # fit on primary_df only and left df unused.
    model = min_hash_lsh.fit(df)
    return model.approxSimilarityJoin(primary_df, secondary_df,
                                      distCol=output_col, threshold=1.0)
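# A minimal, hypothetical usage sketch for jaccard_cross_join: vectorize the
# corpus first, then join one subset against another. The data, prep stages,
# and `spark` session are illustrative assumptions.
from pyspark.ml.feature import CountVectorizer, Tokenizer

docs = spark.createDataFrame(
    [(0, "apache spark minhash"), (1, "spark minhash lsh"), (2, "something else entirely")],
    ["id", "text"],
)
words = Tokenizer(inputCol="text", outputCol="words").transform(docs)
vectorized = CountVectorizer(inputCol="words", outputCol="features").fit(words).transform(words)
primary = vectorized.filter("id = 0")
secondary = vectorized.filter("id != 0")
pairs = jaccard_cross_join("features", "jaccard_distance", vectorized, primary, secondary)
pairs.show()  # only (0, 1) should appear: distance 0.5; (0, 2) has distance 1.0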
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

spDF = sqlContext.createDataFrame(df4)
# Build single-column text DataFrames from the two pandas columns.
X = spark.createDataFrame(df4['description_x'], "string").toDF("text")
Y = spark.createDataFrame(df4['description_y'], "string").toDF("text")

model = Pipeline(stages=[
    # pattern="" tokenizes into individual characters for character trigrams
    RegexTokenizer(pattern="", inputCol="text", outputCol="tokens", minTokenLength=1),
    NGram(n=3, inputCol="tokens", outputCol="ngrams"),
    HashingTF(inputCol="ngrams", outputCol="vectors"),
    MinHashLSH(inputCol="vectors", outputCol="lsh")
]).fit(X)

db_hashed = model.transform(X)
query_hashed = model.transform(Y)
model.stages[-1].approxSimilarityJoin(db_hashed, query_hashed, 0.75).show()
# Map (url, dHash) pairs to (dHash, url) for dictionary lookup
dHash_dict = img_hash.map(lambda url_dHash: (url_dHash[1], url_dHash[0]))

# Pickle the python hash dictionary
hs.pickleHash(dHash_dict.collectAsMap())

# Convert each image dHash into a sparse vector (required input for LSH)
img_sparse = img_hash.map(lambda img: (img[0], str(img[1]), hs.sparse_vectorize(img[1])))

# Convert the RDD of sparse image vectors into a DataFrame
df = spark.createDataFrame(img_sparse, ["url", "dHash", "sparseHash"])

# MinHashLSH
mh = MinHashLSH(inputCol="sparseHash", outputCol="minHash", numHashTables=4, seed=69)
model = mh.fit(df)

# BucketedRandomProjectionLSH (alternative)
# brp = BucketedRandomProjectionLSH(inputCol="sparseHash", outputCol="minHash",
#                                   bucketLength=20.0, numHashTables=5)
# model = brp.fit(df)

# KMeans (alternative)
# kmeans = KMeans(featuresCol='denseHash', predictionCol='minHash', k=12, seed=69)
# model = kmeans.fit(df)

# Apply the fitted model to the DataFrame
transformed_df = model.transform(df).select("url", "dHash", "minHash")

# Combine the LSH minHash arrays into a flat list
dense_to_array_udf = F.udf(hs.dense_to_array, T.ArrayType(T.FloatType()))
def get_similar_word(self, column, text, n_words=10, n_hash=5, verbose=True):
    """
    Get similar strings in a column by MinHash.

    column: target column to search
    text: input string
    n_words: number of similar strings to return
    n_hash: number of hash functions for MinHash
    verbose: True if you want to see interactive output

    Output: DataFrame of nearest neighbours
    """
    # Drop null, empty, and single-character values.
    rdd = self.data.rdd
    rdd = rdd.filter(lambda row: row[column] is not None)
    rdd = rdd.filter(lambda row: row[column] != "")
    rdd = rdd.filter(lambda row: len(row[column]) > 1)
    cdf = self.ss.createDataFrame(
        rdd.map(lambda row: (row[column] if row[column] is not None else " ",
                             list(row[column].lower()) if row[column] is not None else [" "])))

    ngram = NGram(n=2, inputCol="_2", outputCol="ngrams")
    if verbose:
        print("Counting Ngram...")
    ngramDataFrame = ngram.transform(cdf)

    if verbose:
        print("Vectorizing...")
    # Fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=3000, minDF=0)
    cv_model = cv.fit(ngramDataFrame)
    result = cv_model.transform(ngramDataFrame)

    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=n_hash)
    if verbose:
        print("Min Hashing...")
    model = mh.fit(result)

    # Vectorize the query string the same way as the corpus.
    input_df = [{'text': text, 'characters': list(text)}]
    input_df = self.ss.createDataFrame(input_df)
    ngram = NGram(n=2, inputCol="characters", outputCol="ngrams")
    input_df = ngram.transform(input_df)
    key = cv_model.transform(input_df).first()['features']
    if key.toArray().sum() < 1:
        print("No Match! Please try another input..")
        return

    if verbose:
        print("Finding nearest neighbors...")
    NNs = model.approxNearestNeighbors(result, key, n_words)
    NNs.show()
    self.out = NNs
    # self.out = NNs.select('_1').distinct()
    return
def main():
    sc = SparkSession.builder.appName("SentencingAnalyzer")\
        .config("spark.driver.memory", "10G")\
        .getOrCreate()

    # main df
    cases = sc.read.json("../data/sentencingCases2.jsonl")
    df = cleanDf(cases)

    # read categorized csv
    categorizedCsv = sc.read.csv("../data/categorized.csv", header=True)
    categorizedCsv = categorizedCsv.select(
        'caseName',
        f.split(f.col("type"), " - ").alias('offenseType'),
        'duration1', 'sentenceType1')

    # create the search df
    df = extractOffenseKeywords(df)
    df.cache()
    dfSearch = sc.createDataFrame(searchData, ["term", "offenseKeywords"])

    # CLASSIFICATION OF OFFENSE
    hashingTF = HashingTF(inputCol="offenseKeywords", outputCol="rawFeatures", numFeatures=1000)
    result = hashingTF.transform(df)
    resultSearch = hashingTF.transform(dfSearch)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(result)
    rescaledData = idfModel.transform(result).filter(f.size('offenseKeywords') > 0)
    idfModelSearch = idf.fit(resultSearch)
    rescaledDataSearch = idfModelSearch.transform(resultSearch)

    mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345, numHashTables=20)
    modelMH = mh.fit(rescaledData)
    transformedData = modelMH.transform(rescaledData)
    modelMHSearch = mh.fit(rescaledDataSearch)
    # Transform the search set with the same model that performs the join below
    # (the original mixed hashes from modelMH into modelMHSearch's join).
    transformedDataSearch = modelMHSearch.transform(rescaledDataSearch)
    categorizedDf = modelMHSearch.approxSimilarityJoin(
        transformedDataSearch, transformedData, 0.89, distCol="JaccardDistance")
    distanceDf = categorizedDf.select([f.col('datasetA.term')] +
                                      [f.col('datasetB.caseID')] +
                                      [f.col("JaccardDistance")]) \
        .orderBy('caseID', 'JaccardDistance')
    distanceDf = distanceDf.groupBy('caseID').agg(
        f.collect_list('term').alias('predictedOffences'),
        f.collect_list('JaccardDistance').alias('JaccardDistances'))
    distanceDf.cache()
    distanceDf.show()

    # EVALUATE CATEGORIZATION AGAINST MANUAL CATEGORIZATION
    distanceDfEval = distanceDf.join(categorizedCsv, distanceDf.caseID == categorizedCsv.caseName)
    distanceDfEval = distanceDfEval.filter(distanceDfEval.offenseType[0] != "N/A")\
        .filter(distanceDfEval.offenseType[0] != "multiple party sentence")
    calcuateDifferenceInPredictedVsActualOffences_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffences, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "error",
        calcuateDifferenceInPredictedVsActualOffences_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    calcuateDifferenceInPredictedVsActualOffencesPercentage_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffencesPercentage, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "pctCorrect",
        calcuateDifferenceInPredictedVsActualOffencesPercentage_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    distanceDfEval.select('caseID', 'predictedOffences', 'offenseType',
                          'JaccardDistances', 'error', 'pctCorrect').show(200, truncate=False)
    rmse = (distanceDfEval.groupBy().agg(f.sum('error')).collect()[0][0] /
            distanceDfEval.count()) ** (1.0 / 2)
    print("Offense category RMSE:", rmse)
    pctCorrectOffense = (distanceDfEval.groupBy().agg(f.sum('pctCorrect')).collect()[0][0] /
                         distanceDfEval.count()) * 100
    print("Percentage of offenses correctly categorized: ", pctCorrectOffense)
dist_slangs = slangs_.selectExpr("slangs").dropna().dropDuplicates()\
    .withColumn("id", monotonically_increasing_id())\
    .withColumn('slangs_lower', lower_tokens_udf(col('slangs')))

# Define the model pipeline:
#   - regex tokenizer to split into characters
#   - featurize (index) the characters
#   - MinHashLSH for Jaccard similarity
model = Pipeline(stages=[
    RegexTokenizer(pattern="", inputCol="slangs_lower", outputCol="tokens", minTokenLength=1),
    CountVectorizer(inputCol="tokens", outputCol="features"),
    MinHashLSH(inputCol="features", outputCol="hashValues", numHashTables=20)
]).fit(dist_slangs)

# Actually perform the transformation
dist_slangs_hashed = model.transform(dist_slangs)

# Perform the similarity join; the threshold is set at 85% similarity
# (0.15 is the Jaccard distance away from a perfect match)
self_join = model.stages[-1].approxSimilarityJoin(
    dist_slangs_hashed, dist_slangs_hashed, 0.15, distCol="JaccardDistance")\
    .select(col("datasetA.slangs").alias("slangsA"),
            col("datasetB.slangs").alias("slangsB"),
            col("JaccardDistance"))

# Add Levenshtein and fuzzy-match distances; keep pairs with fuzzy similarity above 85
self_join = self_join.withColumn(
    'LeviDistance', levenshtein(col('slangsA'), col('slangsB'))).withColumn(
    'FuzzyDistance', fuzzy_wuzzy_udf(col('slangsA'), col('slangsB')))\
    .where(col('FuzzyDistance') > 85)
df = adding_titles(df)
df = drop_values(df)
df.show()
df.cache()

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH
import pyspark.sql.functions as f

model = Pipeline(stages=[
    RegexTokenizer(pattern="", inputCol="title", outputCol="tokens", minTokenLength=1),
    NGram(n=3, inputCol="tokens", outputCol="ngrams"),
    HashingTF(inputCol="ngrams", outputCol="vectors"),
    MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=10)
]).fit(df)

df_hashed = model.transform(df)
df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.9)

# show all matches (including duplicates)
df_matches.select(f.col('datasetA.id').alias('id_A'),
                  f.col('datasetB.id').alias('id_B'),
                  f.col('distCol')).show()

# show non-duplicate matches (the original was cut off here; the id_A < id_B
# filter keeps each pair once and drops self-matches)
df_matches.select(f.col('datasetA.id').alias('id_A'),
                  f.col('datasetB.id').alias('id_B'),
                  f.col('distCol')).filter('id_A < id_B').show()
def jaccard_with_min_hashing(df_t_user, to_compare, regarding,
                             mode="dist", minval=0.0, maxval=1.0):
    df_t_user = df_t_user.distinct()

    # get the distinct values of the "regarding" column
    df_regarding = df_t_user.select(col(regarding)).distinct()
    if df_regarding is None or df_regarding.rdd.isEmpty():
        return None
    print("regarding", df_regarding.count())

    # create ids for each regarding element
    print("Creating ids")
    windowSpec = W.orderBy(regarding)
    df_regarding = df_regarding.withColumn("id", f.row_number().over(windowSpec))
    df_regarding.groupBy("id").count().orderBy(desc("count")).show()
    # the window function moved df_regarding to a single partition --> repartition
    # (the original called repartition without assigning the result)
    df_regarding = df_regarding.repartition(200)
    df_regarding.show()

    # join dataframes to get (to_compare, id) pairs
    print("Joining...")
    df1 = df_t_user.alias("df1")
    df2 = df_regarding.alias("df2")
    df_joined = df1.join(df2, col('df1.' + regarding) == col('df2.' + regarding))\
        .select(col('df1.' + to_compare).alias(to_compare), col('df2.id').alias("id"))
    df_joined.show()
    print("Join Complete")

    # create binary vectors
    print("Creating vectors")
    count = df_regarding.count() + 10
    tmp = df_regarding.select(col("id")).orderBy(desc("id")).first()
    if tmp is not None:
        print("max_id", tmp["id"])
        max_index = int(tmp["id"]) + 10
    else:
        max_index = 0
    size = max(count, max_index)
    df_joined = df_joined.groupBy(to_compare).agg(collect_set("id"))\
        .rdd.map(lambda r: sparse_vec(r, size)).toDF()
    print("df_joined", df_joined.count())
    df_res = df_joined.select(col('_1').alias(to_compare), col('_2').alias('features'))
    df_res.show()
    df_res = df_res.repartition(200)
    print("df_res", df_res.count())

    print("Creating model")
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=100)
    model = mh.fit(df_res)
    model.transform(df_res).show()

    print("Calculating Jaccard")
    df_jacc_dist = model.approxSimilarityJoin(df_res, df_res, 1.0, distCol="jaccard")
    df_jacc_dist.cache()
    df_jacc_dist.show()

    print("Selecting needed columns")
    df_filtered = df_jacc_dist.select(
        col("datasetA." + to_compare).alias(to_compare + "1"),
        col("datasetB." + to_compare).alias(to_compare + "2"),
        col("jaccard"))
    df_filtered.show()
    # keep each pair once
    df_filtered = df_filtered.where(col(to_compare + "1") < col(to_compare + "2"))
    df_filtered.show()
    # keep pairs within the requested distance range
    df_needed = df_filtered.where((col("jaccard") >= minval) & (col("jaccard") <= maxval))
    df_needed.show()
    if mode == "sim":
        # convert distance to similarity
        df_needed = df_needed.withColumn("jaccard", 1.0 - col("jaccard"))
    return df_needed
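# A minimal, hypothetical usage sketch for jaccard_with_min_hashing: rows are
# (user, title) pairs, and users are compared by the Jaccard similarity of
# their title sets. Assumes `spark` and the module's `sparse_vec` helper
# (mapping a (key, id-set) row to a two-field sparse-vector row) are in scope;
# the data is illustrative.
rows = [("alice", "t1"), ("alice", "t2"),
        ("bob", "t1"), ("bob", "t3"),
        ("carol", "t4")]
df_t_user = spark.createDataFrame(rows, ["user", "title"])
df_sim = jaccard_with_min_hashing(df_t_user, to_compare="user",
                                  regarding="title", mode="sim")
if df_sim is not None:
    df_sim.show()  # e.g. (alice, bob) with similarity 1/3; carol matches no one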
# (dfA is defined as in the standard Spark MinHashLSH example this snippet follows.)
dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
         (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
         (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
         (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
         (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
dfB = spark.createDataFrame(dataB, ["id", "features"])

key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(dfA)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
print("Approximately joining dfA and dfB on distance smaller than 0.6:")
model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance")\
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("JaccardDistance")).show()

# Approximate nearest neighbor search for the (otherwise unused) key above,
# completing the standard example the snippet was cut off from.
print("Approximately searching dfA for 2 nearest neighbors of the key:")
model.approxNearestNeighbors(dfA, key, 2).show()
        line = fp.readline().split(" ")
        cnt += 1

size = len(list(shingles))
cnt = 0
for key, value in tqdm(matrix.items()):
    aux = []
    for index, sh in value.items():
        aux.append(sh)
    data.append((key, Vectors.sparse(size, sorted(list(aux)), np.ones(len(list(aux))))))

next_prime = sieve_of_eratosthenes(size * 2, size)

sc = spark.sparkContext
distData = sc.parallelize(data)
# df = spark.createDataFrame(data, ["id", "features"])
df = spark.createDataFrame(distData, ["id", "features"])

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5, seed=next_prime)
model = mh.fit(df)
dft = model.transform(df)
model.approxSimilarityJoin(dft, dft, 0.6, distCol="JaccardDistance").select(
    col("datasetA.id").alias("idA"),
    col("datasetB.id").alias("idB"),
    col("JaccardDistance")).filter("idA != idB").show()
ng = NGram(n=2, inputCol="words", outputCol="ngrams")
dataset = ng.transform(dataset)
dataset.show()

#[8]
# fit the CountVectorizer to our dataset, as in unsupervised learning
cvect = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=100000, minDF=2)
model = cvect.fit(dataset)
dataset = model.transform(dataset)

#[9]
# LSH class for Jaccard distance
minhash = MinHashLSH(inputCol="features", outputCol="hashValues", seed=12345).setNumHashTables(3)
model = minhash.fit(dataset)
# (the original discarded this transform's result)
dataset = model.transform(dataset)

#[10]
# Printing values
print("Total no. of Files: ", dataset.count())
print("Column Data: ", dataset.dtypes)
dataset.show()

#[11]
# threshold 3.0 exceeds the maximum Jaccard distance of 1.0, so all pairs are kept
matrix = model.approxSimilarityJoin(dataset, dataset, 3.0).select(
    col("datasetA.title").alias("A"),
    col("datasetB.title").alias("B"),
    col("distCol")).sort(desc("distCol")).dropDuplicates(['distCol'])
col("sponsoring_country").alias("country"), concat_string_arrays("hashtags", "urls", "related_tweetids").alias("combined")) dfTrainRelatedUsers = dfTrain.select(col("userid"), col("sponsoring_country").alias("country"), col("related_userids")) dfTestTweets = dfTest.select(col("userid"), concat_string_arrays("hashtags", "urls", "related_tweetids").alias("combined")) dfTestRelatedUsers = dfTest.select(col("userid"), col("related_userids")) model = Pipeline(stages=[ HashingTF(inputCol="combined", outputCol="vectors"), MinHashLSH(inputCol="vectors", outputCol="lsh")]).fit(dfTrainTweets) trainTweetsHashed = model.transform(dfTrainTweets) testTweetsHashed = model.transform(dfTestTweets) combined = model.stages[-1].approxSimilarityJoin(trainTweetsHashed, testTweetsHashed, 0.9) combined.write.parquet('combined_hashed.parquet') model2 = Pipeline(stages=[ HashingTF(inputCol="related_userids", outputCol="vectors"), MinHashLSH(inputCol="vectors", outputCol="lsh")]).fit(dfTrainRelatedUsers) trainUsersHashed = model2.transform(dfTrainRelatedUsers) testUsersHashed = model2.transform(dfTestRelatedUsers)