# $example on$
dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
         (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
         (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
         (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
         (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
dfB = spark.createDataFrame(dataB, ["id", "features"])

key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(dfA)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
print("Approximately joining dfA and dfB on distance smaller than 0.6:")
model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance")\
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("JaccardDistance")).show()
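# The `key` vector above is otherwise unused in this excerpt; a minimal sketch of how it
# would typically be queried, assuming the same `model` and `dfA` as above:
print("Approximately searching dfA for 2 nearest neighbors of the key:")
model.approxNearestNeighbors(dfA, key, 2).show()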
def jaccard_with_min_hashing(df_t_user, to_compare, regarding, mode="dist", minval=0.0, maxval=1.0):
    df_t_user = df_t_user.distinct()

    # get the distinct values of the `regarding` column
    df_regarding = df_t_user.select(col(regarding)).distinct()
    print("regarding", df_regarding.count())
    if df_regarding is None or df_regarding.rdd.isEmpty():
        return None

    # create ids for each regarding element
    print("Creating ids")
    windowSpec = W.orderBy(regarding)
    df_regarding = df_regarding.withColumn("id", f.row_number().over(windowSpec))
    df_regarding.groupBy("id").count().orderBy(desc("count")).show()
    # the window function moved df_regarding to a single partition --> repartition
    df_regarding = df_regarding.repartition(200)
    df_regarding.show()

    # join dataframes to get author/id pairs
    print("Joining...")
    df1 = df_t_user.alias("df1")
    df2 = df_regarding.alias("df2")
    df_joined = df1.join(df2, col('df1.' + regarding) == col('df2.' + regarding)).select(
        col('df1.' + to_compare).alias(to_compare), col('df2.id').alias("id"))
    df_joined.show()
    print("Join Complete")

    # create binary vectors
    print("Creating vectors")
    count = df_regarding.count() + 10
    tmp = df_regarding.select(col("id")).orderBy(desc("id")).first()
    if tmp is not None:
        print("max_id", tmp["id"])
        max_index = int(tmp["id"]) + 10
    else:
        max_index = 0
    size = max(count, max_index)
    #df_joined = df_joined.rdd.map(lambda r: (r[to_compare], float(r['id']))).groupByKey().map(lambda r: sparse_vec(r, size)).toDF()
    df_joined = df_joined.groupBy(to_compare).agg(
        collect_set("id")).rdd.map(lambda r: sparse_vec(r, size)).toDF()
    print("df_joined", df_joined.count())
    df_res = df_joined.select(col('_1').alias(to_compare), col('_2').alias('features'))
    df_res.show()
    df_res = df_res.repartition(200)
    #df_res.cache()
    print("df_res", df_res.count())

    print("Creating model")
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=100)
    model = mh.fit(df_res)
    model.transform(df_res).show()

    print("Calculating Jaccard")
    df_jacc_dist = model.approxSimilarityJoin(df_res, df_res, 1.0, distCol="jaccard")
    df_jacc_dist.cache()
    df_jacc_dist.show()

    print("Selecting needed columns")
    df_filtered = df_jacc_dist.select(
        col("datasetA." + to_compare).alias(to_compare + "1"),
        col("datasetB." + to_compare).alias(to_compare + "2"),
        col("jaccard"))
    df_filtered.show()
    df_filtered = df_filtered.where(col(to_compare + "1") < col(to_compare + "2"))
    df_filtered.show()
    # problem is somewhere around here
    df_needed = df_filtered.where((col("jaccard") >= minval) & (col("jaccard") <= maxval))
    df_needed.show()
    if mode == "sim":
        df_needed = df_needed.withColumn("jaccard", 1.0 - col("jaccard"))
    return df_needed
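# `sparse_vec` is defined elsewhere in the original project; a minimal sketch of what it
# plausibly does, assuming each input row is (key, collected list of ids) and every id is
# smaller than `size`:
from pyspark.ml.linalg import Vectors

def sparse_vec(r, size):
    # one-hot encode the collected ids into a binary sparse vector of length `size`
    indices = sorted(set(int(i) for i in r[1]))
    values = [1.0] * len(indices)
    return (r[0], Vectors.sparse(size, indices, values))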
def get_similar_word(self, column, text, n_words=10, n_hash=5, verbose=True):
    """
    Get similar strings in a column by MinHash
    column: target column to search
    text: input string
    n_words: number of similar strings to return
    n_hash: number of MinHash hash tables
    verbose: True if you want to see interactive output
    """
    rdd = self.data.rdd
    rdd = rdd.filter(lambda row: row[column] is not None)
    rdd = rdd.filter(lambda row: row[column] != "")
    rdd = rdd.filter(lambda row: len(row[column]) > 1)
    cdf = self.ss.createDataFrame(
        rdd.map(lambda row: (row[column] if row[column] is not None else " ",
                             list(row[column].lower()) if row[column] is not None else [" "])))

    ngram = NGram(n=2, inputCol="_2", outputCol="ngrams")
    if verbose:
        print("Counting Ngram...")
    ngramDataFrame = ngram.transform(cdf)

    if verbose:
        print("Vectorizing...")
    # fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=3000, minDF=0)
    cv_model = cv.fit(ngramDataFrame)
    result = cv_model.transform(ngramDataFrame)

    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=n_hash)
    if verbose:
        print("Min Hashing...")
    model = mh.fit(result)

    input_text = text
    input_df = [{'text': input_text, 'characters': list(input_text)}]
    input_df = self.ss.createDataFrame(input_df)
    ngram = NGram(n=2, inputCol="characters", outputCol="ngrams")
    input_df = ngram.transform(input_df)
    key = cv_model.transform(input_df).first()['features']
    if key.toArray().sum() < 1:
        print("No Match! Try another input...")
        return

    if verbose:
        print("Finding nearest neighbors...")
    NNs = model.approxNearestNeighbors(result, key, n_words)
    NNs.show()
    #self.out = NNs.select('_1').distinct()
    return
dataA.show()

Item = Row('id', 'features')
Item_seq = []
for index, row in df.iterrows():
    print(index)
    feature = sparseify(users_num, row["user_index"], row["ratings"])
    Item_seq.append(Item(row['item_id'], feature))
dataB = spark.createDataFrame(Item_seq)
dataB.show()

start = time.time()
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(dataB)
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dataB).show()

# start experiment
ratingdata = pd.read_csv('../users_items_100.csv')
ratingdata['playtime_forever'] = round(np.log(ratingdata['playtime_forever'] + 1), 2)
y = ratingdata['playtime_forever']
X = ratingdata[['user_id', 'item_index']]
print(X.shape)
print(y.shape)
traindata, testdata = train_test_split(ratingdata, train_size=0.9999)
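# `sparseify` is defined elsewhere in the original script; a minimal sketch under the
# assumption that `user_index` and `ratings` are parallel lists of indices and values
# for one item, and `length` is the total number of users:
from pyspark.ml.linalg import Vectors

def sparseify(length, user_index, ratings):
    # build a sparse vector of the given length from the (index, rating) pairs
    pairs = sorted(zip((int(i) for i in user_index), (float(v) for v in ratings)))
    return Vectors.sparse(length, [i for i, _ in pairs], [v for _, v in pairs])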
df2 = model.transform(df1)
df2.show()

def getsparsesize(v):
    # number of stored (non-zero) entries in a sparse vector
    return int(v.values.size)

getsize_udf = udf(getsparsesize, IntegerType())
df2_with_lengths = df2.select("value", "features",
                              getsize_udf("features").alias("vec_size"))
df2_with_lengths.show()

# keep only rows whose feature vector has at least one non-zero entry
# (MinHash cannot be computed for empty vectors)
df2NotNull = df2_with_lengths.filter(col("vec_size") != 0)

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=128)
model2 = mh.fit(df2)
transformed_df2 = model2.transform(df2NotNull)
transformed_df2.show()

edges = list(range(transformed_df2.count()))
print(edges)

def getHashColumns(df0, x):
    # sum the MinHash values of four consecutive hash tables, starting at table x
    sum_of_hashes = 0
    for y in range(x, x + 4):
        sum_of_hashes += int(df0[y][0])
    return sum_of_hashes
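# A hedged usage sketch (not from the original source): `getHashColumns` appears to expect
# the list of per-table hash vectors stored in one row's "hashes" column, e.g.
row_hashes = transformed_df2.first()["hashes"]
print(getHashColumns(row_hashes, 0))  # sum of the MinHash values of tables 0-3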
def main():
    spark = SparkSession.builder.appName("SentencingAnalyzer")\
        .config("spark.driver.memory", "10G")\
        .getOrCreate()

    # main df
    cases = spark.read.json("../data/sentencingCases2.jsonl")
    df = cleanDf(cases)

    # read categorized csv
    categorizedCsv = spark.read.csv("../data/categorized.csv", header=True)
    categorizedCsv = categorizedCsv.select(
        'caseName',
        f.split(f.col("type"), " - ").alias('offenseType'),
        'duration1',
        'sentenceType1')

    # create the search df
    df = extractOffenseKeywords(df)
    df.cache()
    dfSearch = spark.createDataFrame(searchData, ["term", "offenseKeywords"])

    # CLASSIFICATION OF OFFENSE
    hashingTF = HashingTF(inputCol="offenseKeywords", outputCol="rawFeatures",
                          numFeatures=1000)
    result = hashingTF.transform(df)
    resultSearch = hashingTF.transform(dfSearch)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(result)
    rescaledData = idfModel.transform(result).filter(f.size('offenseKeywords') > 0)
    idfModelSearch = idf.fit(resultSearch)
    rescaledDataSearch = idfModelSearch.transform(resultSearch)

    mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345,
                    numHashTables=20)
    modelMH = mh.fit(rescaledData)
    transformedData = modelMH.transform(rescaledData)
    modelMHSearch = mh.fit(rescaledDataSearch)
    transformedDataSearch = modelMHSearch.transform(rescaledDataSearch)

    categorizedDf = modelMHSearch.approxSimilarityJoin(
        transformedDataSearch, transformedData, 0.89, distCol="JaccardDistance")
    distanceDf = categorizedDf.select(f.col('datasetA.term'),
                                      f.col('datasetB.caseID'),
                                      f.col("JaccardDistance")) \
        .orderBy('caseID', 'JaccardDistance')
    distanceDf = distanceDf.groupBy('caseID').agg(
        f.collect_list('term').alias('predictedOffences'),
        f.collect_list('JaccardDistance').alias('JaccardDistances'))
    distanceDf.cache()
    distanceDf.show()

    # EVALUATE CATEGORIZATION AGAINST MANUAL CATEGORIZATION
    distanceDfEval = distanceDf.join(categorizedCsv,
                                     distanceDf.caseID == categorizedCsv.caseName)
    distanceDfEval = distanceDfEval.filter(
        distanceDfEval.offenseType[0] != "N/A").filter(
        distanceDfEval.offenseType[0] != "multiple party sentence")

    calcuateDifferenceInPredictedVsActualOffences_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffences, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "error",
        calcuateDifferenceInPredictedVsActualOffences_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))

    calcuateDifferenceInPredictedVsActualOffencesPercentage_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffencesPercentage, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "pctCorrect",
        calcuateDifferenceInPredictedVsActualOffencesPercentage_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))

    distanceDfEval.select('caseID', 'predictedOffences', 'offenseType',
                          'JaccardDistances', 'error',
                          'pctCorrect').show(200, truncate=False)

    rmse = (distanceDfEval.groupBy().agg(f.sum('error')).collect()[0][0] /
            distanceDfEval.count()) ** (1.0 / 2)
    print("Offense category RMSE:", rmse)
    pctCorrectOffense = (distanceDfEval.groupBy().agg(
        f.sum('pctCorrect')).collect()[0][0] / distanceDfEval.count()) * 100
    print("Percentage of offenses correctly categorized:", pctCorrectOffense)
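# Hypothetical shape of `searchData` (the real list is defined elsewhere in that project);
# each entry pairs an offence term with a keyword list, matching the
# ["term", "offenseKeywords"] schema used above. Terms and keywords here are illustrative only.
searchData_example = [
    ("assault", ["assault", "assaulted", "bodily", "harm"]),
    ("theft", ["theft", "stole", "stolen"]),
]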
df_joined = df1_a.join(df2_a, col('df1_a.author') == col('df2_a.author'))\
    .select('df1_a.title', 'df2_a.id')
df_joined.show(20)

# create a binary vector per title from the ids of its authors
dfWithFeat = df_joined.rdd.map(lambda r: (r['title'], float(r['id'])))\
    .groupByKey()\
    .map(lambda r: sparse_vec(r)).toDF()
df_res = dfWithFeat.select(col("_1").alias("title"), col("_2").alias("features"))
df_res.show()

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(df_res)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(df_res).show()

print("Approximately self-joining titles on Jaccard distance smaller than 0.6:")
df_jacc_dist = model.approxSimilarityJoin(df_res, df_res, 0.6, distCol="JaccardDistance")\
    .select(col("datasetA.title").alias("title"), col("JaccardDistance"))\
    .filter("JaccardDistance != 0")\
    .orderBy(desc("JaccardDistance"))
df_jacc_dist.show()

df_hist = df_jacc_dist.select(col("JaccardDistance"))
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=100000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# pipeline of the declared stages for the training and test data
pipeline = Pipeline(stages=[tk, swr, cv, idf])
model_pipe = pipeline.fit(data_treino)
data_treino = model_pipe.transform(data_treino)
# reuse the pipeline fitted on the training data so both sets share the same vocabulary
data_test = model_pipe.transform(data_test)

# model generation and testing
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(data_treino)
data_treino = model.transform(data_treino)
data_treino.show()
# trained data model

'''
te = data_test.select("features").collect()
tr = data_treino.select("features").collect()
'''
data_test.select("features").show()
dadosTef = data_test.select("features").rdd.flatMap(lambda x: x).collect()
print("Test data features")
dadosTr = data_treino.select("NewsGroup", "features").rdd.flatMap(lambda x: x).collect()
#model.approxNearestNeighbors(SparseVector(str(tr)),SparseVector(str(te[4])),2),show()
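# A working version of the commented-out query above, as a hedged sketch: look up the two
# approximate nearest neighbours of one test document among the training documents
# (assumes the pipeline fitted on the training data was applied to both sets, as above).
key = dadosTef[4]  # features of one test document
model.approxNearestNeighbors(data_treino, key, 2).show()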