def calc_jaccard_sim(df_to_process, df_match, thresh=.3, padded=True):
    # Join the two DataFrames on a Jaccard-similarity predicate over their n-gram arrays.
    if padded:
        df_processed = df_to_process.join(
            df_match,
            (F.size(F.array_intersect(df_to_process.ngrams_pad, df_match.ngrams_pad)) /
             F.size(F.array_union(df_to_process.ngrams_pad, df_match.ngrams_pad))) > thresh)
    else:
        df_processed = df_to_process.join(
            df_match,
            (F.size(F.array_intersect(df_to_process.ngrams, df_match.ngrams)) /
             F.size(F.array_union(df_to_process.ngrams, df_match.ngrams))) > thresh)
    return df_processed
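A minimal usage sketch for calc_jaccard_sim, assuming an active SparkSession named spark and toy DataFrames whose ngrams_pad columns already hold n-gram arrays (all names and values below are illustrative):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Toy inputs: each row carries a pre-computed array of padded n-grams.
left = spark.createDataFrame(
    [(1, ["ab", "bc", "cd"])], ["left_id", "ngrams_pad"])
right = spark.createDataFrame(
    [(10, ["bc", "cd", "de"]), (11, ["xx", "yy"])], ["right_id", "ngrams_pad"])

# Keeps only pairs whose Jaccard similarity exceeds the threshold (only right_id 10 here).
matches = calc_jaccard_sim(left, right, thresh=0.3)
matches.show()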
def silver_airport_data():
    df = dlt.read("bronze_airport_data")
    # Normalise the "\N" placeholder to real nulls, then record the index of every null column.
    df = df.replace("\\N", None)
    df = df.withColumn("nulls", array())
    for i, c in enumerate(df.columns):
        df = df.withColumn(
            "nulls",
            when(col(c).isNull(),
                 array_union(col("nulls"), array(lit(i)))).otherwise(col("nulls")))
    return df.drop("ingest_timestamp", "ingest_source")
def jaccard_index(primary_col: str, secondary_col: str, output_col: str, df: DataFrame):
    """Calculate the Jaccard index (intersection size over union size) of two array columns."""
    return df.withColumn(
        output_col,
        F.when(
            F.col(primary_col).isNull() | F.col(secondary_col).isNull(), None
        ).otherwise(
            F.size(F.array_intersect(F.col(primary_col), F.col(secondary_col))) /
            F.size(F.array_union(F.col(primary_col), F.col(secondary_col)))),
    )
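A small check of the null-handling path, assuming a SparkSession named spark (column and variable names here are illustrative):

df = spark.createDataFrame(
    [(["a", "b"], ["b", "c"]), (["a"], None)],
    ["tokens_a", "tokens_b"])

# First row yields 1/3; second row yields null because tokens_b is null.
jaccard_index("tokens_a", "tokens_b", "jaccard", df).show()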
def reduce_join(left, right):
    return_vcf = left.join(right, ["#CHROM", "POS"], "full")
    ###
    remove_colname = right.columns[2:]
    l_name = left.columns
    r_name = right.columns
    v_name = return_vcf.columns
    name_list = ["REF", "ID", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "INFO":
            return_vcf = return_vcf.withColumn(
                column_name(l_name, name)[0],
                when(F.isnull(column_name(l_name, name)[0]),
                     F.col(column_name(r_name, name)[0]))
                .when(F.isnull(column_name(r_name, name)[0]),
                      F.col(column_name(l_name, name)[0]))
                .otherwise(F.array_union(*column_name(v_name, name))))
        else:
            return_vcf = return_vcf.withColumn(
                column_name(l_name, name)[0],
                when(F.isnull(column_name(l_name, name)[0]),
                     F.col(column_name(r_name, name)[0]))
                .when(F.isnull(column_name(r_name, name)[0]),
                      F.col(column_name(l_name, name)[0]))
                .otherwise(F.array_union(*column_name(v_name, name))))
    return_vcf = return_vcf.drop(*remove_colname)
    return return_vcf
def similaryBasedOnFollowers(data, minFollowers=20, debug=False):
    # We start by renaming the user column in line with the notation above.
    data = data.withColumnRenamed('follows', 'u1')

    # ==== Step 1 ====
    u1_fu1 = data.groupBy('u1').agg(
        F.collect_set(data.user).alias('fu1')).filter(F.size('fu1') >= minFollowers)
    if (debug):
        print('>> Step 1 :: u1 f(u1) <<')
        u1_fu1.show()

    # ==== Step 2 ====
    # First create a "dual" of data by renaming columns.
    # This will help the subsequent join.
    u2_fu2 = u1_fu1.withColumnRenamed('u1', 'u2').withColumnRenamed('fu1', 'fu2')
    prod = u1_fu1.crossJoin(u2_fu2).filter(u1_fu1.u1 < u2_fu2.u2)
    if (debug):
        print('>> Step 2 :: u1 f(u1) u2 f(u2) <<')
        prod.show()

    # ==== Step 3 ====
    prod2 = prod.withColumn('I', F.array_intersect(prod.fu1, prod.fu2)).withColumn(
        'U', F.array_union(prod.fu1, prod.fu2)).drop('fu1', 'fu2')
    if (debug):
        print('>> Step 3 :: u1 u2 I(u1,u2) U(u1,u2) <<')
        # prod2.orderBy('I', ascending=False).show()
        prod2.show()

    # ==== Step 4 ====
    result = prod2.withColumn('JI', F.size('I') / F.size('U')).drop('I', 'U')
    if (debug):
        print('>> Step 4 :: u1 u2 J(u1,u2) <<')
        result.show()

    return result
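A toy run of similaryBasedOnFollowers, assuming a SparkSession named spark; minFollowers is lowered so the tiny example produces output (all values below are made up):

# Edges of a small "who follows whom" graph: `user` follows `follows`.
edges = spark.createDataFrame(
    [("a", "x"), ("b", "x"), ("a", "y"), ("c", "y")],
    ["user", "follows"])

# With minFollowers=1, users x and y are compared; J(x, y) = |{a}| / |{a, b, c}| = 1/3.
similaryBasedOnFollowers(edges, minFollowers=1).show()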
def verification(self, candDF, threshold, key1, key2, keep_cols1, keep_cols2):
    """
    Input: $candDF is the output DataFrame from the 'filtering' function.
           $threshold is a float value between (0, 1]

    Output: Return a new DataFrame $resultDF that represents the ER result.
            It has five columns: id1, joinKey1, id2, joinKey2, jaccard

    Comments: There are two differences between $candDF and $resultDF
              (1) $resultDF adds a new column, called jaccard, which stores the jaccard
                  similarity between $joinKey1 and $joinKey2
              (2) $resultDF removes the rows whose jaccard similarity is smaller than $threshold
    """
    return candDF.select(
        'id1',
        'id2',
        (size(array_intersect(key1, key2)) /
         size(array_union(key1, key2))).alias('jaccard'),
        # keep certain columns
        *keep_cols1,
        *keep_cols2
    ).where(col('jaccard') >= threshold)
def generate_metadata_group(
    experiment_specimen_df: DataFrame,
    impress_df: DataFrame,
    exp_type="experiment",
) -> DataFrame:
    """
    Takes in an Experiment-Specimen DataFrame and the IMPReSS DataFrame,
    and generates a hash value from the parameters marked as 'isImportant' on IMPReSS.
    This hash is used to identify experiments that are comparable
    (i.e. share the same experimental conditions).
    """
    # Explode the experiments by procedureMetadata so each row contains one procedureMetadata value
    experiment_metadata = experiment_specimen_df.withColumn(
        "procedureMetadata", explode("procedureMetadata"))

    # Filter IMPReSS to keep only the parameters that generate a metadata split: isImportant = True
    impress_df_required = impress_df.where(
        (col("parameter.isImportant") == True)
        & (col("parameter.type") == "procedureMetadata"))

    # Join the experiment DF with the IMPReSS DF
    experiment_metadata = experiment_metadata.join(
        impress_df_required,
        ((experiment_metadata["_pipeline"] == impress_df_required["pipelineKey"])
         & (experiment_metadata["_procedureID"] == impress_df_required["procedure.procedureKey"])
         & (experiment_metadata["procedureMetadata._parameterID"] == impress_df_required["parameter.parameterKey"])),
    )

    # Create a new column by concatenating the parameter name and the parameter value
    experiment_metadata = experiment_metadata.withColumn(
        "metadataItem",
        when(
            col("procedureMetadata.value").isNotNull(),
            concat(col("parameter.name"), lit(" = "), col("procedureMetadata.value")),
        ).otherwise(concat(col("parameter.name"), lit(" = "), lit("null"))),
    )

    # Select the right column names for production and phenotyping centre depending on the experiment type
    if exp_type == "experiment":
        production_centre_col = "_productionCentre"
        phenotyping_centre_col = "_phenotypingCentre"
    else:
        production_centre_col = "production_centre"
        phenotyping_centre_col = "phenotyping_centre"

    # Create a window over experiment id, production centre and phenotyping centre
    window = Window.partitionBy(
        "unique_id", production_centre_col,
        phenotyping_centre_col).orderBy("parameter.name")

    # Use the window to build, for every experiment, the set of "parameter = value" pairs
    experiment_metadata_input = experiment_metadata.withColumn(
        "metadataItems", collect_set(col("metadataItem")).over(window))

    # Add the production centre to the metadata group when it differs from the phenotyping centre.
    # This is because in that case we want to generate a metadata split among specimens
    # that have been produced and phenotyped at the same centre.
    experiment_metadata_input = experiment_metadata_input.withColumn(
        "metadataItems",
        when(
            (col(production_centre_col).isNotNull())
            & (col(production_centre_col) != col(phenotyping_centre_col)),
            array_union(
                col("metadataItems"),
                array(concat(lit("ProductionCenter = "), col(production_centre_col))),
            ),
        ).otherwise(col("metadataItems")),
    )

    # Create a string with the metadata items "parameter = value" joined by '::'
    experiment_metadata = experiment_metadata_input.groupBy(
        "unique_id", production_centre_col, phenotyping_centre_col).agg(
            concat_ws("::", sort_array(max(
                col("metadataItems")))).alias("metadataGroupList"))

    # Hash the list to generate a metadata group identifier
    experiment_metadata = experiment_metadata.withColumn(
        "metadataGroup", md5(col("metadataGroupList")))

    # Select the experiment IDs and the metadata group IDs
    experiment_metadata = experiment_metadata.select("unique_id", "metadataGroup")

    # Join the original experiment DataFrame with the result of the metadata group generation
    experiment_specimen_df = experiment_specimen_df.join(
        experiment_metadata, "unique_id", "left_outer")

    # Add the hash of an empty string to those rows without a metadata group
    experiment_specimen_df = experiment_specimen_df.withColumn(
        "metadataGroup",
        when(experiment_specimen_df["metadataGroup"].isNull(),
             md5(lit(""))).otherwise(experiment_specimen_df["metadataGroup"]),
    )
    return experiment_specimen_df
    functions.array(functions.collect_list('text').over(window)).alias('text'),
)
chains.persist()

#%%
if depth == 0:
    chains = chains.select(
        functions.concat('context', 'response').alias('context'),
        functions.concat('sender', 'author_id').alias('sender'),
        'rpos', 'next',
        functions.col('text').alias('tweets'))
else:
    chains = chains.select(
        functions.concat_ws(',', 'context', 'response').alias('context'),
        functions.concat_ws(',', 'sender', 'author_id').alias('sender'),
        # keep the merged column named 'tweets' so the select below can reference it
        functions.array_union('tweets', 'text').alias('tweets'),
        'rpos', 'next')

#%%
chains = chains.join(data, chains.next == data.tweet_id, 'inner')\
    .select(
        'sender', 'context', 'tweets', 'rpos',
        data.tweet_id.alias('response'),
        data.author_id,
        data.response_tweet_id.alias('next'),
        data.text.alias('text')
    )

#%%
def array_union(a: Column, b: Column) -> Column:
    """Calculate the union of two array columns, dropping empty-string elements."""
    return F.array_remove(F.array_union(a, b), "")
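A quick illustration of the wrapper's empty-string handling, assuming a SparkSession named spark and the usual `from pyspark.sql import functions as F` import (the data is made up):

df = spark.createDataFrame([(["a", ""], ["b", "a"])], ["left", "right"])

# Plain F.array_union would keep the "" element; the wrapper strips it, leaving ["a", "b"].
df.select(array_union(F.col("left"), F.col("right")).alias("merged")).show()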
readPlaylistsDF.show()
deletePlaylistsDF.unpersist()

print("Update playlists")
updatePlaylistsDF = df_edit.withColumn(
    'Exp_Results', F.explode('update.playlists')).select('Exp_Results.*')
updatePlaylistsDF.show(truncate=False)

# Only update song ids in the playlists when the user id and playlist id match the source playlists
print("Update playlists Result")
updatePlaylistsDF = updatePlaylistsDF.join(
    readPlaylistsDF,
    (updatePlaylistsDF.id == readPlaylistsDF.id) &
    (updatePlaylistsDF.user_id == readPlaylistsDF.user_id),
    'inner').select(
        updatePlaylistsDF.id,
        updatePlaylistsDF.user_id,
        F.array_union(
            F.array_intersect(updatePlaylistsDF.song_ids,
                              F.array([F.lit(x) for x in songs])),
            readPlaylistsDF.song_ids).alias("song_ids"))

readPlaylistsDF = readPlaylistsDF.join(
    updatePlaylistsDF, readPlaylistsDF.id == updatePlaylistsDF.id, "left").select(
        readPlaylistsDF.id,
        F.coalesce(updatePlaylistsDF.song_ids,
                   readPlaylistsDF.song_ids).alias("song_ids"),
        readPlaylistsDF.user_id)
readPlaylistsDF.show()
updatePlaylistsDF.unpersist()

playlistsDF = readPlaylistsDF.agg(
    F.collect_list(struct("*")).alias('playlists'))
def add_null_index_array(df):
    """Replace the "\\N" placeholder with nulls and record the index of every null column per row."""
    df = df.replace("\\N", None)
    df = df.withColumn("nulls", array())
    for i, c in enumerate(df.columns):
        df = df.withColumn(
            "nulls",
            when(col(c).isNull(),
                 array_union(col("nulls"), array(lit(i)))).otherwise(col("nulls")))
    return df.drop("ingest_timestamp", "ingest_source")
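A rough sketch of what add_null_index_array produces, assuming a SparkSession named spark and input that carries the ingest columns the function drops (all names and values are illustrative):

raw = spark.createDataFrame(
    [("SFO", "\\N", "2024-01-01", "fileA"),
     ("\\N", "US", "2024-01-01", "fileA")],
    ["iata", "country", "ingest_timestamp", "ingest_source"])

# Row 1 ends up with nulls = [1] (country was "\N"); row 2 with nulls = [0].
add_null_index_array(raw).show()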
MODEL = None

def get_model_magic():
    global MODEL
    if MODEL is None:
        MODEL = hub.load(
            "https://tfhub.dev/google/universal-sentence-encoder/4")
    return MODEL

@udf(returnType=VectorUDT())
def encode_sentence(x):
    model = get_model_magic()
    emb = model([x]).numpy()[0]
    return Vectors.dense(emb)

blocking_df = tokenize(processed_df, ['name', 'description', 'manufacturer'])
blocking_df = tfidf_top_tokens(
    blocking_df,
    [c + '_swRemoved' for c in ['name', 'description', 'manufacturer']])
blocking_df = blocking_df.withColumn('name_encoding', encode_sentence(f.coalesce(f.col('name'), f.lit(''))))\
    .withColumn('description_encoding', encode_sentence(f.coalesce(f.col('description'), f.lit(''))))\
    .withColumn('blocking_keys',
                f.array_union(
                    f.array(f.col('name'), f.col('description'), f.col('manufacturer')),
                    f.array_union(f.col('name_swRemoved_top_tokens'),
                                  f.array_union(f.col('description_swRemoved_top_tokens'),
                                                f.col('manufacturer_swRemoved_top_tokens')))))\
    .withColumn('uid', f.concat_ws('|', 'source', 'source_id'))
def process(self):
    """
    Read data.
    """
    self.code = self.input

    # if self.code in self.cleanUp:
    #     self.cleanUp_for_code(self.code)
    # self.prepare_dirs_for_code(self.code)

    # Load parquet
    df = spark.read.parquet(
        f"{config.OUTPUT_DATASET}/{self.code}_cards.parquet")

    # Replace text with keywords based on a dictionary
    df = df.withColumn(
        "text_features1",
        preprocess_fn.udf_text_to_keywords("name", "originalText"))

    from_patterns = [
        fn.when(
            fn.regexp_extract("originalText", r"{0}".format(pattern), 0) != "",
            replace,
        ).otherwise("")
        for pattern, replace in preprocess_fn_text_rules.text_patterns.items()
    ]
    df = df.withColumn("text_features2", fn.array(*from_patterns))
    df = df.withColumn("text_features",
                       fn.array_union("text_features1", "text_features2"))
    # df.select("text_features").distinct().show(100, truncate=False)

    # Fetch all the text features from all the cards into one list
    all_text_feats = df.select("text_features").rdd.flatMap(lambda x: x).collect()
    filtered_text_feats = [items for items in all_text_feats if len(items) > 0]
    filtered_text_feats = list(itertools.chain.from_iterable(filtered_text_feats))

    # Encode the text features into ints
    label_encoder = preprocessing.LabelEncoder().fit(filtered_text_feats)
    with open(f"{config.TEMP}/labelencoder_text_feats.pkl", "wb") as fp:
        pickle.dump(label_encoder, fp)

    @fn.udf(returnType=t.ArrayType(t.IntegerType()))
    def text_to_vector(text_features):
        if len(text_features) > 0:
            enc_list = list()
            for item in text_features:
                item = str(item)
                encoded = label_encoder.transform([item])
                encoded = int(encoded[0])
                enc_list.append(encoded)
                # print(f"{item} \t {encoded}")
            return enc_list
        return list()

    # if "text_features_vect" in df.columns:
    #     df = df.drop("text_features_vect")
    df = df.withColumn("text_features_vect", text_to_vector("text_features"))

    all_text_feats = df.select("text_features").rdd.flatMap(lambda x: x).collect()
    filtered_text_feats = [items for items in all_text_feats if len(items) > 0]
    filtered_text_feats = list(itertools.chain.from_iterable(filtered_text_feats))

    df.createOrReplaceTempView("cards_features")
    tbl = spark.sql("""
        SELECT *
        FROM cards_features
    """)

    # Save to Parquet
    tbl.write.mode("overwrite").parquet(
        f"{config.TEMP}/{self.code}_cards_text.parquet")

    self.next(self.join)