# Imports required by the snippets in this section.
import gc
import re

from pyspark import RDD, SparkContext
from pyspark.sql import Row


def __call__(self, rdd: RDD):
    column_name = self.column
    if self._value_to_index is None:
        self.calculate_value_to_index(rdd)
    column2id = rdd.context.broadcast(self._value_to_index)

    def index_column(row):
        """
        Map the `column_name` column to its index value stored in column2id.

        WARNING: per the PySpark documentation
        (http://spark.apache.org/docs/latest/rdd-programming-guide.html#passing-functions-to-spark)
        do not use self inside this function. It would be suboptimal and would
        probably fail to run. Please contact me if you have troubles: [email protected]
        """
        if isinstance(column_name, str):
            try:
                assert isinstance(row, Row)
                row_dict = row.asDict()
                row_dict[column_name] = column2id.value[row_dict[column_name]]
                return [Row(**row_dict)]
            except KeyError:
                return []
        # Tuple-like row addressed by a positional column index.
        return [row[:column_name]
                + (column2id.value[row[column_name]],)
                + row[column_name + 1:]]

    indexed_rdd = rdd.flatMap(index_column)
    column2id.unpersist(blocking=True)
    return indexed_rdd
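# Usage sketch (an assumption, not from the source): the same broadcast-and-flatMap
# indexing pattern can be exercised directly on a small RDD of Rows. Only public
# PySpark APIs are used; the column name "lang" and the sample data are made up.
def _demo_broadcast_indexing():
    from pyspark.sql import Row, SparkSession

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    sc = spark.sparkContext

    rows = sc.parallelize([Row(lang="python"), Row(lang="scala"), Row(lang="python")])
    value_to_index = {"python": 0, "scala": 1}
    column2id = sc.broadcast(value_to_index)

    def index_column(row):
        d = row.asDict()
        d["lang"] = column2id.value[d["lang"]]
        return [Row(**d)]

    print(rows.flatMap(index_column).collect())  # [Row(lang=0), Row(lang=1), Row(lang=0)]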
def list_unique_products(rdd: RDD):
    """[task 2a - saves all unique products in transactions into txt]

    Args:
        rdd (RDD): [spark RDD]
    """
    unique = rdd.flatMap(explode).distinct()
    unique.coalesce(1).saveAsTextFile("out/out_1_2a.txt")
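# The `explode` helper used by the transaction tasks is not shown in the source;
# a minimal sketch, assuming each input line is a comma-separated transaction such
# as "bread,milk,eggs" (the separator is an assumption):
def explode(transaction: str) -> list:
    """Split one transaction line into its individual products."""
    return [product.strip() for product in transaction.split(",") if product.strip()]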
def make_index(words: list, rdd: RDD):
    """
    Return an RDD containing, for each word, the list of articles in which it is mentioned.

    :param words: list of words to index
    :param rdd: dataset of articles
    :return: pairs of (word, articles containing this word)
    """
    return rdd.flatMap(
        lambda article: filter_words(words, article)).groupByKey()
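# `filter_words` is not defined in this section; a minimal sketch under the
# assumption that each article is a (title, text) pair and that the helper emits
# one (word, title) pair per tracked word appearing in the text. The real helper
# in the original project may differ.
def filter_words(words: list, article: tuple) -> list:
    title, text = article
    tokens = set(text.lower().split())
    return [(word, title) for word in words if word.lower() in tokens]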
def list_product_count(rdd: RDD):
    """[task 2b - saves unique count of products into txt]

    Args:
        rdd (RDD): [spark RDD]
    """
    sc = SparkContext.getOrCreate()
    rdd = sc.parallelize([str(rdd.flatMap(explode).count())])
    header = sc.parallelize(["Count:"])
    header.union(rdd).coalesce(1).saveAsTextFile("out/out_1_2b.txt")
def list_top_purchased_products(rdd: RDD):
    """[task 3 - saves top 5 purchased products into txt file]

    Args:
        rdd (RDD): [spark RDD]
    """
    sc = SparkContext.getOrCreate()
    rdd = (sc.parallelize(
        rdd.flatMap(explode).map(lambda w: (w, 1)).reduceByKey(
            lambda a, b: a + b).takeOrdered(5, key=lambda x: -x[1])).coalesce(1))
    rdd.saveAsTextFile("out/out_1_3.txt")
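# Combined usage sketch for tasks 2a, 2b and 3, relying on the `explode` sketch
# above and a tiny made-up transactions RDD. Note that saveAsTextFile fails if the
# "out/" target paths already exist.
def _demo_transaction_tasks():
    sc = SparkContext.getOrCreate()
    transactions = sc.parallelize(["bread,milk", "bread,eggs", "milk,bread"])
    list_unique_products(transactions)         # writes out/out_1_2a.txt
    list_product_count(transactions)           # writes out/out_1_2b.txt
    list_top_purchased_products(transactions)  # writes out/out_1_3.txt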
def rank_reduce_by_key(words: list, rdd: RDD):
    """
    This implementation combines indexing and counting using `reduceByKey`.

    :param words: list of words we would like to rank
    :param rdd: dataset of articles
    :return: list of pairs (word, number of occurrences)
    """
    return rdd.flatMap(lambda article: filter_words(words, article)) \
        .map(lambda a: (a[0], 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda k: k[1], ascending=False) \
        .collect()
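# Usage sketch for the two word-ranking helpers above, reusing the hypothetical
# `filter_words` sketch; the article tuples below are made-up illustration data.
def _demo_word_ranking():
    sc = SparkContext.getOrCreate()
    articles = sc.parallelize([
        ("a1", "spark makes distributed computing simple"),
        ("a2", "the spark rdd api and spark sql"),
    ])
    # Inverted index: word -> article ids.
    print({w: sorted(ids) for w, ids in make_index(["spark", "rdd"], articles).collect()})
    # Ranking: one count per article in which the word appears.
    print(rank_reduce_by_key(["spark", "rdd"], articles))  # [('spark', 2), ('rdd', 1)]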
def pcy_for_rdd(baskets: RDD, support_threshold_total=support_threshold_total) -> list:

    def check_all_subsets_frequent(itemset: list, frequent_itemsets_dict: dict) -> bool:
        '''
        For example, given a triple ['2', '1', '8'], check whether all of its subsets
        ['2', '1'], ['2', '8'], ['1', '8'] are frequent itemsets.
        :param itemset:
        :return:
        '''
        itemset_size = len(itemset)
        for i in range(itemset_size):
            subset = itemset.copy()
            subset.pop(i)
            try:
                # No need to sort this subset any more: the basket is already sorted.
                _ = frequent_itemsets_dict[tuple(subset)]
            except KeyError:
                return False
        return True

    num_baskets = baskets.count()
    singleton_counts = baskets. \
        flatMap(lambda basket: [(item, 1) for item in basket]). \
        reduceByKey(lambda x, y: x + y). \
        filter(lambda pair: pair[1] >= support_threshold_total)
    # frequent_singletons_dict = dict(singleton_counts.collect()).keys()
    frequent_itemsets_dict = dict(singleton_counts.collect())
    # print("frequent_itemsets_dict", frequent_itemsets_dict)
    frequent_itemsets_list = [sorted(list(frequent_itemsets_dict.keys()))]
    del singleton_counts
    gc.collect()

    # all_pairs = baskets.flatMap(lambda basket: generate_combination(basket, 2)).persist()
    # Since both the first and the second pass use it, why not persist it.
    #
    # bucket_counts = all_pairs.map(lambda pair: (hash_pair(pair), 1)).reduceByKey(lambda x, y: x + y).collect()  # first pass
    # bitmap = dict(bucket_counts)
    # for key, value in bitmap.items():
    #     if value >= support_threshold_total:
    #         bitmap[key] = 1
    #     else:
    #         bitmap[key] = 0

    current_itemset_size = 2
    while True:
        # print("current_itemset_size", current_itemset_size)
        # if current_itemset_size == 2:  # pairs are special
        #     frequent_itemsets = all_pairs. \
        #         filter(lambda _: qualified_as_candidate_pair(_, frequent_itemsets_dict, bitmap)). \
        #         map(lambda pair: (tuple(pair), 1)). \
        #         reduceByKey(lambda x, y: x + y). \
        #         filter(lambda pair: pair[1] >= support_threshold_total).persist()
        #     del all_pairs
        #     gc.collect()
        # else:
        # Double filter: prune candidates while generating them, then filter by support count.
        frequent_itemsets = baskets.flatMap(
            lambda basket: generate_combination_with_filter(
                basket, frequent_itemsets_dict, current_itemset_size)). \
            map(lambda itemset: (tuple(itemset), 1)). \
            reduceByKey(lambda x, y: x + y). \
            filter(lambda pair: pair[1] >= support_threshold_total).persist()

        # if frequent_itemsets.count() == 0:
        #     break
        current_size_frequent_itemsets = sorted(frequent_itemsets.keys().collect())
        if current_size_frequent_itemsets == []:
            break
        frequent_itemsets_list.append(current_size_frequent_itemsets)
        frequent_itemsets_dict.update(dict.fromkeys(current_size_frequent_itemsets))
        # frequent_itemsets_dict.update(dict(frequent_itemsets.collect()))
        current_itemset_size += 1
        # Maybe the right approach is to release the memory first and then del? I'm not sure.
        del frequent_itemsets
        del current_size_frequent_itemsets
        gc.collect()

    gc.collect()
    return frequent_itemsets_list
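# `generate_combination_with_filter` and the module-level `support_threshold_total`
# are defined elsewhere in the original project and are not shown here. Below is a
# minimal, assumption-laden sketch of the candidate generator (a-priori style
# pruning: every (k-1)-subset of a candidate must already be frequent); the real
# implementation may differ.
from itertools import combinations


def generate_combination_with_filter(basket: list, frequent_itemsets_dict: dict, size: int) -> list:
    """Generate size-k candidates from a sorted basket, pruned against the known frequent itemsets."""
    def is_frequent(subset):
        # Singletons are stored under the bare item, larger itemsets under a tuple key.
        key = subset[0] if len(subset) == 1 else tuple(subset)
        return key in frequent_itemsets_dict

    frequent_items = [item for item in basket if item in frequent_itemsets_dict]
    return [list(combo)
            for combo in combinations(frequent_items, size)
            if all(is_frequent(sub) for sub in combinations(combo, size - 1))]


# Example call (assuming an absolute support threshold of 2):
# support_threshold_total = 2
# sc = SparkContext.getOrCreate()
# baskets = sc.parallelize([["1", "2", "8"], ["1", "2"], ["2", "8"], ["1", "8", "9"]])
# print(pcy_for_rdd(baskets, support_threshold_total=2))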
def into_words(rdd_in: RDD) -> RDD:
    words: RDD = rdd_in.flatMap(lambda x: x.split(' ')).filter(
        lambda x: len(x) > 1)
    # Use a character class so commas and parentheses are all stripped;
    # the original pattern ',()' only matched a literal comma.
    return words.map(lambda x: re.sub('[,()]', "", x))
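# Usage sketch with made-up text: after the whitespace split, the regex strips
# commas and parentheses left attached to tokens.
def _demo_into_words():
    sc = SparkContext.getOrCreate()
    lines = sc.parallelize(["spark (fast), scalable,"])
    print(into_words(lines).collect())  # ['spark', 'fast', 'scalable']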