def fingerprint_cluster(df, input_cols):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :return:
    """
    # df = self.df
    input_cols = parse_columns(df, input_cols)
    for input_col in input_cols:
        output_col = name_col(input_col, FINGERPRINT_COL)
        # Instead of applying the fingerprint to the whole data set we group by names
        df = (
            df.groupBy(input_col).count().select(
                'count', input_col).repartition(
                1)  # Needed for optimization in a single machine
            .cache())
        # Calculate the fingerprint
        df = fingerprint(df, input_col)

        count_col = name_col(input_col, COUNT_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)

        df = df.groupby(output_col).agg(
            F.collect_set(input_col).alias(cluster_col),
            F.sum("count").alias(count_col),
            F.first(input_col).alias(recommended_col),
            F.size(F.collect_set(input_col)).alias(cluster_size_col)).select(
            cluster_size_col, cluster_col, count_col, recommended_col)

    return df
def n_gram_fingerprint_cluster(df, input_cols, n_size=2):
    """
    Cluster a DataFrame column based on the N-Gram Fingerprint algorithm
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: Size of the n-grams used to build the fingerprint
    :return:
    """
    input_cols = parse_columns(df, input_cols)
    for input_col in input_cols:
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)
        # Prepare a group so we do not need to apply the fingerprint to the whole data set
        df = (
            df.select(input_col).groupBy(input_col).count().select(
                'count', input_col).repartition(
                1)  # Needed for optimization in a single machine
            .cache())

        df = n_gram_fingerprint(df, input_col, n_size)

        count_col = name_col(input_col, COUNT_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)

        df = df.groupby(ngram_fingerprint_col).agg(
            F.collect_set(input_col).alias(cluster_col),
            F.sum("count").alias(count_col),
            F.first(input_col).alias(recommended_col),
            F.size(F.collect_set(input_col)).alias(cluster_size_col)).select(
            cluster_size_col, cluster_col, count_col, recommended_col)

    return df
def nunique(self, df):
    """ Calculates number of unique values in a column over a window"""
    w = self.get_window(self.partition_by, self.order_by, self.window_length)
    return df.withColumn(
        self.column_alias,
        psf.size(psf.collect_set(self.aggregation_column).over(w)))
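# A standalone sketch of the same technique as nunique above: a distinct count
# over a window via size(collect_set(...).over(w)), since countDistinct is not
# supported as a window function. The SparkSession, column names and sample rows
# below are made up for illustration.
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as psf

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("u1", "a"), ("u1", "b"), ("u1", "a"), ("u2", "c")], ["user", "item"])
w = Window.partitionBy("user")
# n_unique_items is 2 for user u1 and 1 for user u2
events.withColumn("n_unique_items",
                  psf.size(psf.collect_set("item").over(w))).show()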
def tf_idf(df, n):
    # Extracting terms per each row/document as a list
    temp_df = df.withColumn(
        'terms',
        f.split(f.lower(f.regexp_replace(df.text_entry, '[^\\w\\s-]', '')), ' '))
    # Calculating total number of words per row/document
    temp_df1 = temp_df.withColumn('total_num_words', f.size('terms'))
    # Extracting words in each document
    temp_df2 = temp_df1.withColumn('token', f.explode('terms'))
    # Calculating tf
    temp_df3 = temp_df2.groupBy('_id', 'token', 'total_num_words').agg({
        'token': 'count'
    }).withColumnRenamed('count(token)', 'occurrence').sort('_id')
    temp_df4 = temp_df3.withColumn('tf', temp_df3.occurrence)
    # Calculating df
    temp_df5 = temp_df4.groupBy('token').agg(
        f.countDistinct('_id')).withColumnRenamed('count(DISTINCT _id)', 'df')
    # Calculating idf
    temp_df6 = temp_df5.withColumn('idf', f.log10(n / temp_df5.df))
    # Calculating tf-idf
    joined_df = temp_df4.join(temp_df6,
                              temp_df4.token == temp_df6.token).select(
                                  temp_df4.token, temp_df4._id, temp_df4.tf,
                                  temp_df6.df, temp_df6.idf)
    result = joined_df.withColumn('tf_idf', joined_df.tf * joined_df.idf)
    return result
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library (see
    http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html),
    write a function that returns the first <n> frequent itemsets
    obtained using min support <s> and min confidence <c> (parameters
    of the FP-Growth model), sorted by (1) descending itemset size, and
    (2) descending frequency. The FP-Growth model should be applied to
    the DataFrame computed in the previous task.
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename).map(
        lambda l: l.split(",")).zipWithIndex().map(
            lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant', 'items'])
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.freqItemsets
    result = result.select("items", "freq", size("items").alias("tam"))
    result = result.sort(desc('tam'), desc('freq')).limit(n)
    result = result.select('items', 'freq')
    return toCSVLine(result)
def n_gram_fingerprint_cluster(df, columns, n_size=2):
    """
    Cluster a DataFrame column based on the N-Gram Fingerprint algorithm
    :param df: Dataframe to be processed
    :param columns: Columns to be processed
    :param n_size: Size of the n-grams used to build the fingerprint
    :return:
    """
    columns = parse_columns(df, columns)
    for col_name in columns:
        n_gram_col = col_name + "_ngram_fingerprint"

        # Prepare a group so we don't need to apply the fingerprint to the whole data set
        df = (df.select(col_name)
              .groupBy(col_name)
              .count()
              .select('count', col_name)
              .repartition(1)  # Needed for optimization in a single machine
              .cache())

        df = n_gram_fingerprint(df, col_name, n_size)
        # df.table()
        df = df.groupby(n_gram_col).agg(
            F.collect_set(col_name).alias("cluster"),
            F.sum("count").alias("count"),
            F.first(col_name).alias("recommended"),
            F.size(F.collect_set(col_name)).alias("cluster_size")
        ).select("cluster_size", "cluster", "count", "recommended")

    return df
def fingerprint_cluster(df, columns):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :param df: Dataframe to be processed
    :param columns: Columns to be processed
    :return:
    """
    # df = self.df
    columns = parse_columns(df, columns)
    for col_name in columns:
        output_col = col_name + "_FINGERPRINT"
        # Instead of applying the fingerprint to the whole data set we group by names
        df = (df
              .groupBy(col_name)
              .count()
              .select('count', col_name)
              .repartition(1)  # Needed for optimization in a single machine
              .cache()
              )
        # Calculate the fingerprint
        df = fingerprint(df, col_name)

        # Create cluster
        df = df.groupby(output_col).agg(
            F.collect_set(col_name).alias("cluster"),
            F.sum("count").alias("count"),
            F.first(col_name).alias("recommended"),
            F.size(F.collect_set(col_name)).alias("cluster_size")
        ).select("cluster_size", "cluster", "count", "recommended")

    return df
def preprocess(self, data):
    data = data.withColumn(
        "_c0", functions.expr("substring(_c0, 2, length(_c0)-1)"))
    data = data.withColumn(
        "_c3", functions.expr("substring(_c3, 1, length(_c3)-1)"))
    data = data.withColumnRenamed("_c0", "form_id") \
        .withColumnRenamed("_c1", "views") \
        .withColumnRenamed("_c2", "submissions") \
        .withColumnRenamed("_c3", "features")
    data = data.select('form_id', 'views', 'submissions',
                       functions.split('features', '-').alias('features'))
    df_sizes = data.select(functions.size('features').alias('features'))
    df_max = df_sizes.agg(functions.max('features'))
    nb_columns = df_max.collect()[0][0]
    data = data.select('form_id', 'views', 'submissions',
                       *[data['features'][i] for i in range(nb_columns)])
    data = data.select(*(functions.col(column).cast("float").alias(column)
                         for column in data.columns))
    data = data.withColumn('form_id', functions.col('form_id').cast('int'))
    data = data.withColumn('views', functions.col('views').cast('int'))
    data = data.withColumn('submissions',
                           functions.col('submissions').cast('int'))
    data = data.withColumn(
        "submission_ratio",
        functions.col("submissions") / functions.col("views"))
    return data
def amenities_rating(spark, amenities_pref, newh_df):
    pa_df = pd.DataFrame(amenities_pref, columns=["amenities_pref"])
    a_df = spark.createDataFrame(pa_df)
    a_df.createOrReplaceTempView('a_df')
    newh_df.createOrReplaceTempView('del_dup')
    newa_df = spark.sql(
        "SELECT * FROM del_dup INNER JOIN a_df WHERE del_dup.amenities=a_df.amenities_pref"
    )
    ameni_comb = newa_df.groupBy(functions.col("id")).agg(
        functions.collect_list(functions.col("amenities")).alias("amenities"))
    amenities_len = ameni_comb.withColumn(
        "ameni_len", functions.size(ameni_comb["amenities"])).orderBy(
            functions.col("ameni_len"), ascending=False)
    amenities_len.createOrReplaceTempView("amenities_len")
    ameni_df = spark.sql(
        "SELECT a.id,h.amenities,a.ameni_len FROM del_dup h INNER JOIN amenities_len a WHERE h.id=a.id ORDER BY a.ameni_len DESC"
    )
    find_rating = functions.udf(lambda a: get_rating(a), types.IntegerType())
    usr_rating = ameni_df.withColumn("rating", find_rating("ameni_len"))
    return usr_rating
def sort_by_comment_length(epoch_df, batch_size=16):
    """
    TEST FUNCTION: Takes in a Spark dataframe
    Returns: A list of (token, label) tuples ordered by token sequence length
    """
    # add sequence lengths
    epoch_df = epoch_df.withColumn("sequence_length", F.size(epoch_df.tokens))
    # order by sequence length
    epoch_df = epoch_df.orderBy("sequence_length", ascending=False)
    # drop sequence length column
    epoch_df = epoch_df.drop("sequence_length")
    # convert to pandas
    epoch_df = epoch_df.toPandas()
    # convert to sorted list of tuples
    sorted_tokens = [(epoch_df['tokens'].iloc[i], epoch_df['label'].iloc[i])
                     for i in range(len(epoch_df))]
    return sorted_tokens
def preparar_df(df):
    df = df.repartition(df.user.id)
    df = df.where(F.length(df.text) > 0)
    df = df.select(
        "*",
        u_parse_time(
            df['created_at']).cast('timestamp').alias('created_at_ts'))
    df_intertweet = df.select(
        df.user.id.alias("user_id"),
        (df.created_at_ts.cast('bigint') -
         F.lag(df.created_at_ts.cast('bigint')).over(
             Window.partitionBy("user.id").orderBy("created_at_ts"))
         ).cast("bigint").alias("time_intertweet"))
    df_list_intertweet = df_intertweet.groupby(df_intertweet.user_id).agg(
        F.collect_list("time_intertweet").alias("lista_intertweet"))
    df_list_intertweet = df_list_intertweet.filter(
        F.size(df_list_intertweet.lista_intertweet) > 3)
    df = df.join(df_list_intertweet,
                 df["user.id"] == df_list_intertweet["user_id"])
    return df
def process():
    data_content = [x.strip().split(',') for x in open(FILE_PATH).readlines()]
    data_content_tuple = []
    for i in range(0, len(data_content)):
        data_content_tuple.append((i, data_content[i]))

    df = spark.createDataFrame(data_content_tuple, ["id", "items"])
    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.5)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    # model.freqItemsets
    model.freqItemsets.filter(size('items') > 0).orderBy(
        'freq', ascending=0).show(50, False)
    print(type(model.freqItemsets))

    # Display generated association rules.
    model.associationRules.orderBy('confidence', ascending=0).show(200, False)

    # transform examines the input items against all the association rules and summarizes the
    # consequents as prediction
    model.transform(df).show(50, False)
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes
    the interest of association rules (interest = |confidence -
    frequency(consequent)|; note the absolute value) obtained using
    min support <s> and min confidence <c> (parameters of the FP-Growth
    model), and prints the first <n> rules sorted by (1) descending
    antecedent size in association rule, and (2) descending interest.
    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename).map(
        lambda l: l.split(",")).zipWithIndex().map(
            lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant', 'items'])
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.associationRules
    modelResult = model.freqItemsets
    result = modelResult.join(result, modelResult['items'] == result["consequent"])
    total = df.count()
    result = result.withColumn(
        "interest", abs(result["confidence"] - result["freq"] / total))
    result = result.select(
        size("antecedent").alias('tam'), 'antecedent', 'consequent',
        'confidence', "items", "freq", "interest")
    result = result.sort(desc('tam'), desc('interest')).limit(n)
    result = result.select('antecedent', 'consequent', 'confidence', "items",
                           "freq", "interest")
    return toCSVLine(result)
def generate_TFIDF(sc, df, sqlcontext):
    # 1. calculate the number of rows (documents) in the DataFrame
    t_num = df.count()

    # 2. select _id, lower the text_entry and remove punctuation symbols,
    # then split it into a list of words ('tokens')
    word_spilits = df.select(
        "_id",
        F.split(F.lower(F.regexp_replace(df.text_entry, '[^\w\s]', '')),
                ' ').alias('tokens'))

    # 3. explode the list of words to generate a list of _id and token,
    # then group by _id and token to calculate the frequency of tokens (tf) in each row,
    # creating a DataFrame words_tf (_id, token, tf)
    words_tf = word_spilits.select("_id", F.explode(word_spilits.tokens).alias('token'))\
        .groupBy("_id", "token").agg({'token': 'count'}).withColumnRenamed("count(token)", "tf")

    # 4. to calculate the document frequency of a token (df), aggregate by token
    # and create a set of _ids with duplicate _ids eliminated ('collect_set'),
    # then count the _ids to get the document frequency of each token,
    # creating a DataFrame words_df (_id, token, df)
    words_df = words_tf.groupby("token").agg(F.collect_set("_id").alias("_ids"))\
        .select("token", F.explode("_ids").alias('_id'), F.size("_ids").alias('df'))

    # 5. to build the final TFIDF DataFrame, join words_tf and words_df on the same _id and token,
    # compute the idf as log10 of the ratio of the number of documents (t_num) to the document frequency (df),
    # then compute tf_idf by multiplying idf and tf
    tokensWithTfIdf = words_tf.join(words_df, (words_tf._id == words_df._id) & (words_tf.token == words_df.token))\
        .select(words_tf._id, words_tf.token, words_tf.tf, words_df.df,
                (F.log10(t_num / words_df.df)).alias("idf"),
                (F.log10(t_num / words_df.df) * words_tf.tf).alias("tf_idf"))

    # 6. cache the TFIDF DataFrame for further usage
    tokensWithTfIdf.cache()

    return tokensWithTfIdf
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes
    the interest of association rules (interest = |confidence -
    frequency(consequent)|; note the absolute value) obtained using
    min support <s> and min confidence <c> (parameters of the FP-Growth
    model), and prints the first <n> rules sorted by (1) descending
    antecedent size in association rule, and (2) descending interest.
    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    total_count = df.count()
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_updated = model.associationRules.join(
        model.freqItemsets,
        model.associationRules['consequent'] == model.freqItemsets['items'])
    model_with_interest = model_updated.withColumn(
        "interest",
        lit(
            calculate_interest(model_updated.confidence, model_updated.freq,
                               total_count)))
    model_1 = model_with_interest.drop("lift")
    model_2 = model_1.orderBy([size("antecedent"), "interest"],
                              ascending=[0, 0])
    final_op = toCSVLine(model_2.limit(n))
    return final_op
def save_table(df, table_name, partition_keys=None):
    print(f"Saving table: {table_name}")
    output_path = f"s3://{bucket_name}/{output_dir}/{table_name}"
    spark.sql(f"drop table if exists {database_name}.{table_name}")
    # Derive dataset_name from the last path segment of the input file name, without its extension
    df = df\
        .withColumn('dataset_name',
                    f.split(f.split(f.input_file_name(), '/').getItem(
                        f.size(f.split(f.input_file_name(), '/')) - 1), '\.').getItem(0))
    if partition_keys is not None:
        df\
            .repartition(*partition_keys)\
            .write\
            .mode("overwrite")\
            .format("parquet")\
            .partitionBy(*partition_keys)\
            .option("path", output_path)\
            .saveAsTable(f"{database_name}.{table_name}")
    else:
        df\
            .coalesce(1)\
            .write\
            .mode("overwrite")\
            .format("parquet")\
            .option("path", output_path)\
            .saveAsTable(f"{database_name}.{table_name}")
    print(f"Table: {table_name} saved")
def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))),
                       IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))),
                       IntegerType())

    p_df = p_df.withColumn("len_q1", length("question1")) \
        .withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")) \
        .withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words",
                           common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))
    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2",
                   "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)
    return p_df
def add_user_roles(wmhist, remember_dict):
    def role_filter(rg, role_set):
        if rg is None:
            return False
        else:
            return any(role in role_set for role in rg)

    py_is_admin = lambda rg: role_filter(rg, {"bureaucrat", "sysop", "steward", "arbcom"})
    py_is_bot = lambda rg: role_filter(rg, {"copyviobot", "bot"})
    py_is_patroller = lambda rg: role_filter(rg, {"patroller"})

    udf_is_admin = f.udf(py_is_admin, BooleanType())
    udf_is_bot = f.udf(py_is_bot, BooleanType())
    udf_is_patroller = f.udf(py_is_patroller, BooleanType())

    wmhist = wmhist.withColumn("event_user_isadmin",
                               udf_is_admin(wmhist.event_user_groups))
    wmhist = wmhist.withColumn("event_user_isbot1",
                               udf_is_bot(wmhist.event_user_groups))
    wmhist = wmhist.withColumn("event_user_ispatroller",
                               udf_is_patroller(wmhist.event_user_groups))
    wmhist = wmhist.withColumn("event_user_isbot2",
                               f.size(wmhist.event_user_is_bot_by) > 0)

    wmhist = wmhist.withColumn(
        "role_type",
        f.when(wmhist.event_user_isadmin, "admin").otherwise(
            f.when((wmhist.event_user_isbot1) | (wmhist.event_user_isbot2), "bot").otherwise(
                f.when(wmhist.event_user_ispatroller, "patroller").otherwise("other"))))

    return (wmhist, remember_dict)
def levenshtein_cluster(df, input_col):
    """
    Return a dataframe with the cluster of strings related to each string in a column
    :param df: Dataframe to be processed
    :param input_col: Column to be processed
    :return:
    """
    # Prepare a group so we don't need to apply the fingerprint to the whole data set
    df = df.select(input_col).groupby(input_col).agg(F.count(input_col).alias("count"))
    df = keycollision.fingerprint(df, input_col)

    count_col = name_col(input_col, COUNT_COL)
    cluster_col = name_col(input_col, CLUSTER_COL)
    recommended_col = name_col(input_col, RECOMMENDED_COL)
    cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)

    df_t = df.groupby(fingerprint_col).agg(
        F.collect_list(input_col).alias(cluster_col),
        F.size(F.collect_list(input_col)).alias(cluster_size_col),
        F.first(input_col).alias(recommended_col),
        F.sum("count").alias(count_col)).repartition(1)

    # Filter nearest string
    df_l = levenshtein_filter(df, input_col).repartition(1)

    # Create Cluster
    df_l = df_l.join(df_t, (df_l[input_col + "_FROM"] == df_t[fingerprint_col]), how="left") \
        .cols.drop(fingerprint_col) \
        .cols.drop([input_col + "_FROM", input_col + "_TO", input_col + "_LEVENSHTEIN_DISTANCE"])

    return df_l
def get_basket_items(sdf, item_col, *key_cols, include_duplicate_items=True,
                     exclude_single_item_baskets=True):
    """
    generate sets of items from a table listing items individually (along with the group they belong to)

    Args:
        sdf: input Spark dataframe
        item_col: the name of the column indicating the item
        *key_cols: the names of the columns that collectively indicate the group the item should be placed in
        include_duplicate_items: if True, repeated items within a group are kept (collect_list);
            otherwise duplicates are dropped (collect_set)
        exclude_single_item_baskets: if True, baskets with only a single item will be filtered out

    Notes:
        This function is more or less equivalent to this SQL query:

            select patient, encounters.id encounter, encounter_date,
                collect_list(distinct condition) items,
                count(distinct condition) num_items
            from encounters
            group by patient, encounter, encounter_date
    """
    collect_fun = fn.collect_list if include_duplicate_items else fn.collect_set

    basket_items = sdf \
        .groupby(*key_cols) \
        .agg(
            collect_fun(item_col).alias('items')
        )

    if exclude_single_item_baskets:
        basket_items = basket_items.filter(fn.size('items') > 1)

    return basket_items
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library (see
    http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html),
    write a function that returns the first <n> frequent itemsets
    obtained using min support <s> and min confidence <c> (parameters
    of the FP-Growth model), sorted by (1) descending itemset size, and
    (2) descending frequency. The FP-Growth model should be applied to
    the DataFrame computed in the previous task.
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_1 = model.freqItemsets.orderBy([size("items"), "freq"],
                                         ascending=[0, 0])
    final_op = toCSVLine(model_1.limit(n))
    return final_op
def main(inputs):
    poi = spark.read.json(inputs, schema=amenity_schema)
    poi = poi.filter((poi['lon'] > -123.5) & (poi['lon'] < -122))
    poi = poi.filter((poi['lat'] > 49) & (poi['lat'] < 49.5))
    # poi = poi.coalesce(1)  # ~1MB after the filtering

    transportations_data = poi.filter(poi.amenity.isin(transportations))
    schools_data = poi.filter(poi.amenity.isin(schools))
    bike_parking_data = transportations_data.filter(
        (transportations_data['amenity'] == 'bicycle_parking') &
        (functions.size('tags') > 0))
    fuel_data = transportations_data.filter(
        transportations_data['amenity'] == 'fuel')

    transportations_data.write.json('../transportations-Vancouver',
                                    mode='overwrite', compression='gzip')
    schools_data.write.json('../schools-Vancouver', mode='overwrite',
                            compression='gzip')
    bike_parking_data.write.json('../bikes-Vancouver', mode='overwrite',
                                 compression='gzip')
    fuel_data.write.json('../fuel-Vancouver', mode='overwrite',
                         compression='gzip')
def pyldavis_data_format(tokenized_df, count_vectorizer, transformed, lda_model):
    word_counts = tokenized_df.select((explode(
        tokenized_df.documents)).alias("words")).groupby("words").count()
    word_counts_list = {r['words']: r['count'] for r in word_counts.collect()}
    word_counts_list = [
        word_counts_list[w] for w in count_vectorizer.vocabulary
    ]

    # Create data with key-value pairs as expected by the pyLDAvis tool
    data = {
        'topic_term_dists': np.array(lda_model.topicsMatrix().toArray()).T,
        'doc_topic_dists': np.array([
            x.toArray() for x in transformed.select(
                ["topicDistribution"]).toPandas()['topicDistribution']
        ]),
        'doc_lengths': [
            r[0] for r in tokenized_df.select(size(
                tokenized_df.documents)).collect()
        ],
        'vocab': count_vectorizer.vocabulary,
        'term_frequency': word_counts_list
    }
    return data
def count_categories(df):
    """
    [[Category:Category name]]
    [[:Category:Category name]]
    [[:File:File name]]
    """
    pattern = '\[\[:?Category:[a-zA-Z0-9.,\-!?\(\) ]+\]\]'
    return df.withColumn('n_categories', size(split(col('text'), pattern)) - 1)
def count_unreferenced(df):
    """
    <ref>Lots of words</ref>  -- reference without a link
    {{cn}}  -- citation needed
    """
    pattern = '\{\{cn\}\}|<ref>[a-zA-Z0-9.,!? ]+</ref>'
    return df.withColumn('n_unreferenced', size(split(col('text'), pattern)) - 1)
def count_of_images(df):
    """
    [[File: | thumb | upright | right | alt= | caption ]]
    """
    any_text = "[a-zA-Z0-9.,!? ]+ \] "
    pattern = "\[[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\|[a-zA-Z0-9.,!? ]+\]"
    return df.withColumn("n_images", size(split(col('text'), pattern=pattern)) - 1)
def count_items(df, parent_feature, column):
    name = parent_feature.get_output_columns()[0]
    # df = df.withColumn(name, F.lit(10))
    df = df.withColumn(name, F.size(F.split(F.col(column), r"name")) - 1)
    return df
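# A standalone sketch of the counting trick shared by the functions above:
# splitting a string on a regex yields one more piece than there are matches,
# so size(split(...)) - 1 counts pattern occurrences. The SparkSession, sample
# text and pattern below are made up for illustration.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, split

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame(
    [("a [[Category:Birds]] b [[Category:Owls]]",)], ["text"])
# n_matches is 2 for the sample row
demo.withColumn(
    "n_matches",
    size(split(col("text"), r"\[\[Category:[a-zA-Z ]+\]\]")) - 1).show()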
def similaryBasedOnFollowers(data, minFollowers=20, debug=False):
    # We start by renaming the user column in line with the notation above.
    data = data.withColumnRenamed('follows', 'u1')

    # ==== Step 1 ====
    u1_fu1 = data.groupBy('u1').agg(F.collect_set(
        data.user).alias('fu1')).filter(F.size('fu1') >= minFollowers)
    if (debug):
        print('>> Step 1 :: u1 f(u1) <<')
        u1_fu1.show()

    # ==== Step 2 ====
    # First create a "dual" of data by renaming columns.
    # This will help the subsequent join.
    u2_fu2 = u1_fu1.withColumnRenamed('u1', 'u2').withColumnRenamed('fu1', 'fu2')
    prod = u1_fu1.crossJoin(u2_fu2).filter(u1_fu1.u1 < u2_fu2.u2)
    if (debug):
        print('>> Step 2 :: u1 f(u1) u2 f(u2) <<')
        prod.show()

    # ==== Step 3 ====
    prod2 = prod.withColumn('I', F.array_intersect(prod.fu1, prod.fu2)).withColumn(
        'U', F.array_union(prod.fu1, prod.fu2)).drop('fu1', 'fu2')
    if (debug):
        print('>> Step 3 :: u1 u2 I(u1,u2) U(u1,u2) <<')
        # prod2.orderBy('I',ascending=False).show()
        prod2.show()

    # ==== Step 4 ====
    result = prod2.withColumn('JI', F.size('I') / F.size('U')).drop('I', 'U')
    if (debug):
        print('>> Step 4 :: u1 u2 J(u1,u2) <<')
        result.show()

    return result
def sdf_pooling_sequence(sdf, col=None, length=None, mode='mean'):
    if col is None:
        col = sdf.columns[0]
    if length is None:
        length = sdf.select(F.size(col).alias('length')).take(1)[0]['length']
    sdf = sdf.select([F.col(col)[i].alias(f'temp_{i}') for i in range(length)])
    sdf = eval(f"sdf.groupby().{mode}()")
    sdf = sdf.select(F.array(sdf.columns).alias(col))
    return sdf
def test_slice(self):
    from pyspark.sql.functions import lit, size, slice

    df = self.spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])

    self.assertEqual(
        df.select(slice(df.x, 2, 2).alias("sliced")).collect(),
        df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect(),
    )

    self.assertEqual(
        df.select(slice(df.x, size(df.x) - 1, lit(1)).alias("sliced")).collect(),
        [Row(sliced=[2]), Row(sliced=[4])])
    self.assertEqual(
        df.select(slice(df.x, lit(1), size(df.x) - 1).alias("sliced")).collect(),
        [Row(sliced=[1, 2]), Row(sliced=[4])])
def wordCount(df, colName):
    """
    Args:
        df: a DataFrame
        colName: a column for counting the number of words in it

    Returns:
        df: a DataFrame with one more column word_count of colName
    """
    return df.withColumn('word_count', f.size(f.split(f.col(colName), ' ')))
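# A hypothetical usage sketch for wordCount above; the SparkSession and sample
# rows are made up, and pyspark.sql.functions is assumed to be imported as f
# (as in the function body).
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
demo_df = spark.createDataFrame(
    [("the quick brown fox",), ("hello world",)], ["text"])
# word_count is 4 for the first row and 2 for the second
wordCount(demo_df, "text").show()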
# COMMAND ----------

from pyspark.sql.functions import split
df.select(split(col("Description"), " ")).show(2)


# COMMAND ----------

df.select(split(col("Description"), " ").alias("array_col"))\
  .selectExpr("array_col[0]").show(2)


# COMMAND ----------

from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2)  # shows 5 and 3


# COMMAND ----------

from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)


# COMMAND ----------

from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded").show(2)
# MAGIC %md
# MAGIC Use our `removeWords` function that we registered in wiki-eda to clean up stop words.

# COMMAND ----------

sqlContext.sql("drop table if exists words")
words.registerTempTable("words")

# COMMAND ----------

noStopWords = sqlContext.sql("select removeWords(words) as words from words")  # .cache()
display(noStopWords)

# COMMAND ----------

wordVecInput = noStopWords.filter(func.size("words") != 0)
wordVecInput.count()

# COMMAND ----------

# MAGIC %md
# MAGIC Build the `Word2Vec` model. This takes about a minute with two workers.

# COMMAND ----------

from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=150, minCount=50, inputCol="words", outputCol="result", seed=0)
model = word2Vec.fit(wordVecInput)

# COMMAND ----------
# MAGIC %md
# MAGIC Use our `removeWords` function that we registered in wiki-eda to clean up stop words.

# COMMAND ----------

sqlContext.sql('drop table if exists words')
words.registerTempTable('words')

# COMMAND ----------

noStopWords = sqlContext.sql('select removeWords(words) as words from words')  # .cache()
display(noStopWords)

# COMMAND ----------

wordVecInput = noStopWords.filter(func.size('words') != 0)
wordVecInput.count()

# COMMAND ----------

# MAGIC %md
# MAGIC Build the `Word2Vec` model. This takes about a minute with two workers.

# COMMAND ----------

from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=150, minCount=50, inputCol='words', outputCol='result', seed=0)
model = word2Vec.fit(wordVecInput)

# COMMAND ----------
# MAGIC %md
# MAGIC Calculate the number of words in `noStopWords`. Recall that each row contains an array of words.
# MAGIC
# MAGIC One strategy would be to take the length of each row and sum the lengths. To do this use `functions.size`, `functions.sum`, and call `.agg` on the `DataFrame`.
# MAGIC
# MAGIC Don't forget to refer to the [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.package) APIs. For example you'll find detail for the function `size` in the [functions module](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.size) in Python and the [functions package](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$) in Scala.

# COMMAND ----------

# MAGIC %md
# MAGIC First, create a `DataFrame` named `sized` that has a `size` column with the size of each array of words. Here you can use `func.size`.

# COMMAND ----------

# ANSWER
sized = noStopWords.withColumn('size', func.size('words'))

sizedFirst = sized.select('size', 'words').first()
print sizedFirst[0]

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(sizedFirst[0], len(sizedFirst[1]), 'incorrect implementation for sized')

# COMMAND ----------

# MAGIC %md
# MAGIC Next, you'll need to aggregate the counts. You can do this using `func.sum` in either a `.select` or `.agg` method call on the `DataFrame`. Make sure to give your `Column` the alias `numberOfWords`. There are some examples in [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.GroupedData.agg) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrame) in the APIs.
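# COMMAND ----------

# MAGIC %md
# MAGIC One way to finish the aggregation step is sketched below; it assumes the `sized` DataFrame created above and uses the `numberOfWords` alias requested in the instructions, so treat it as a possible answer rather than the canonical one.

# COMMAND ----------

# ANSWER (sketch): sum the per-row sizes into a single count aliased numberOfWords
numberOfWordsDF = sized.agg(func.sum('size').alias('numberOfWords'))
print numberOfWordsDF.first()[0]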