Example #1
    def __call__(self, rdd: RDD):
        column_name = self.column
        if self._value_to_index is None:
            self.calculate_value_to_index(rdd)
        column2id = rdd.context.broadcast(self._value_to_index)

        def index_column(row):
            """
            Map the value of the target column to its index stored in column2id.
            WARNING: per the PySpark documentation
            (http://spark.apache.org/docs/latest/rdd-programming-guide.html#passing-functions-to-spark),
            do not reference self inside this function: the whole object would be
            serialized with the closure, which is suboptimal and will probably fail to run.
            Please contact me if you run into trouble: [email protected]
            """
            if isinstance(column_name, str):
                try:
                    assert isinstance(row, Row)
                    row_dict = row.asDict()
                    row_dict[column_name] = column2id.value[row_dict[column_name]]
                    return [Row(**row_dict)]
                except KeyError:
                    return []
            return [row[:column_name] + (column2id.value[row[column_name]],) +
                    row[column_name + 1:]]

        indexed_rdd = rdd.flatMap(index_column)
        column2id.unpersist(blocking=True)
        return indexed_rdd
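# A minimal standalone sketch of the broadcast-lookup pattern used by
# index_column above; the local SparkContext and the "lang" column are
# assumptions for illustration. Rows whose value is missing from the
# mapping are silently dropped by flatMap.
from pyspark import SparkContext
from pyspark.sql import Row

sc = SparkContext.getOrCreate()
rows = sc.parallelize([Row(lang="py"), Row(lang="go"), Row(lang="rs")])
column2id = sc.broadcast({"py": 0, "go": 1})  # "rs" is deliberately unknown

def index_lang(row):
    row_dict = row.asDict()
    try:
        row_dict["lang"] = column2id.value[row_dict["lang"]]
        return [Row(**row_dict)]
    except KeyError:
        return []

print(rows.flatMap(index_lang).collect())  # [Row(lang=0), Row(lang=1)]
column2id.unpersist(blocking=True)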
def list_unique_products(rdd: RDD):
    """[task 2a - saves all unique products in transactions into txt]

    Args:
        rdd (RDD): [spark RDD]
    """
    unique = rdd.flatMap(explode).distinct()
    unique.coalesce(1).saveAsTextFile("out/out_1_2a.txt")
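# The explode helper used by the tasks in this example is not shown; it is
# assumed to yield the individual products of one transaction record. A
# hypothetical stand-in and a small local run of the same pipeline:
from pyspark import SparkContext

def explode(transaction):
    return transaction.split(",")

sc = SparkContext.getOrCreate()
transactions = sc.parallelize(["milk,bread", "bread,eggs"])
print(transactions.flatMap(explode).distinct().collect())
# e.g. ['milk', 'bread', 'eggs'] (order is not guaranteed)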
Example #3
def make_index(words: list, rdd: RDD):
    """
    This function return a RDD containing for each word, the list of article where it's
    mentioned.
    :param words:
    :param rdd:
    :return: the word with the list of article containing this word
    """
    return rdd.flatMap(
        lambda article: filter_words(words, article)).groupByKey()
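# filter_words is not shown here; it is assumed to emit a (word, article_id)
# pair for every tracked word appearing in an article. A hypothetical
# stand-in and a small local run:
from pyspark import SparkContext

def filter_words(words, article):
    article_id, text = article
    return [(word, article_id) for word in words if word in text.split()]

sc = SparkContext.getOrCreate()
articles = sc.parallelize([(1, "spark builds rdds"), (2, "rdds are resilient")])
print(dict(make_index(["rdds", "spark"], articles).mapValues(list).collect()))
# e.g. {'rdds': [1, 2], 'spark': [1]}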
def list_product_count(rdd: RDD):
    """[task 2b - saves unique count of products into txt]

    Args:
        rdd (RDD): [description]
    """
    sc = SparkContext.getOrCreate()
    rdd = sc.parallelize([str(rdd.flatMap(explode).count())])
    header = sc.parallelize(["Count:"])
    header.union(rdd).coalesce(1).saveAsTextFile("out/out_1_2b.txt")
def list_top_purchased_products(rdd: RDD):
    """[task 3 - saves top 5 purchased products into txt file]

    Args:
        rdd (RDD): [spark RDD]
    """
    sc = SparkContext.getOrCreate()
    rdd = (sc.parallelize(
        rdd.flatMap(explode).map(lambda w: (w, 1)).reduceByKey(
            lambda a, b: a + b).takeOrdered(5,
                                            key=lambda x: -x[1])).coalesce(1))
    rdd.saveAsTextFile("out/out_1_3.txt")
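# The takeOrdered(5, key=lambda x: -x[1]) call above keeps the five pairs
# with the largest counts. A small local run of the same counting pattern:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
counts = sc.parallelize(["a", "b", "a", "c", "a", "b"]) \
    .map(lambda w: (w, 1)) \
    .reduceByKey(lambda a, b: a + b)
print(counts.takeOrdered(2, key=lambda x: -x[1]))  # [('a', 3), ('b', 2)]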
Example #6
def rank_reduce_by_key(words: list, rdd: RDD):
    """
    This implementation combine index and computation using `reduceByKey`.
    :param words: list of word we would like to rank
    :param rdd: dataset of articles
    :return: list of pair (word, nb occ)
    """
    return rdd.flatMap(lambda article: filter_words(words, article)) \
        .map(lambda a: (a[0], 1))\
        .reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda k: k[1], ascending=False) \
        .collect()
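# Unlike make_index + groupByKey, reduceByKey combines the per-word counts on
# the map side before the shuffle, so only partial sums cross the network.
# Small local run, with the same hypothetical filter_words stand-in as above:
from pyspark import SparkContext

def filter_words(words, article):
    article_id, text = article
    return [(word, article_id) for word in words if word in text.split()]

sc = SparkContext.getOrCreate()
articles = sc.parallelize([(1, "spark builds rdds"), (2, "rdds are resilient")])
print(rank_reduce_by_key(["rdds", "spark"], articles))
# e.g. [('rdds', 2), ('spark', 1)]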
    def pcy_for_rdd(baskets: RDD, support_threshold_total=support_threshold_total) -> list:

        def check_all_subsets_frequent(itemset: list, frequent_itemsets_dict: dict) -> bool:
            '''
            Check that every (k-1)-subset of the itemset is frequent. For example,
            given the triple ['2', '1', '8'], check that the subsets ['2', '1'],
            ['2', '8'] and ['1', '8'] are all frequent itemsets.
            :param itemset: candidate itemset
            :param frequent_itemsets_dict: dict keyed by the frequent itemsets found so far
            :return: True if every subset is frequent, False otherwise
            '''
            itemset_size = len(itemset)
            for i in range(itemset_size):
                subset = itemset.copy()
                subset.pop(i)
                try:
                    _ = frequent_itemsets_dict[tuple(subset)]  # no need to sort the subset here: the baskets are already sorted
                except KeyError:
                    return False
            return True

        num_baskets = baskets.count()
        singleton_counts = baskets.\
            flatMap(lambda basket: [(item, 1) for item in basket]).\
            reduceByKey(lambda x, y: x + y).\
            filter(lambda pair: pair[1] >= support_threshold_total)
        # frequent_singletons_dict = dict(singleton_counts.collect()).keys()
        frequent_itemsets_dict = dict(singleton_counts.collect())
        # print("frequent_itemsets_dict", frequent_itemsets_dict)
        frequent_itemsets_list = [sorted(list(frequent_itemsets_dict.keys()))]
        del singleton_counts
        gc.collect()

        # all_pairs = baskets.flatMap(lambda basket: generate_combination(basket, 2)).persist()  # since both the first and second passes use it, why not persist it
        #
        # bucket_counts = all_pairs.map(lambda pair:(hash_pair(pair), 1)).reduceByKey(lambda x,y: x+y).collect()  # first pass
        # bitmap = dict(bucket_counts)
        # for key, value in bitmap.items():
        #     if value >= support_threshold_total:
        #         bitmap[key] = 1
        #     else:
        #         bitmap[key] = 0

        current_itemset_size = 2
        while True:
            # print("current_itemset_size", current_itemset_size)
            # if current_itemset_size == 2: # pairs are special
            #     frequent_itemsets = all_pairs.\
            #         filter(lambda _: qualified_as_candidate_pair(_, frequent_itemsets_dict, bitmap)).\
            #         map(lambda pair: (tuple(pair), 1)).\
            #         reduceByKey(lambda x, y: x + y).\
            #         filter(lambda pair: pair[1] >= support_threshold_total).persist()
            #     del all_pairs
            #     gc.collect()
            # else:  # double filter
            frequent_itemsets = baskets.flatMap(lambda basket: generate_combination_with_filter(basket, frequent_itemsets_dict, current_itemset_size)). \
                map(lambda itemset: (tuple(itemset), 1)).\
                reduceByKey(lambda x,y: x+y).\
                filter(lambda pair: pair[1] >= support_threshold_total).persist()
            # if frequent_itemsets.count() == 0:
            #     break
            current_size_frequent_itemsets = sorted(frequent_itemsets.keys().collect())
            if current_size_frequent_itemsets == []:
                break

            frequent_itemsets_list.append(current_size_frequent_itemsets)
            frequent_itemsets_dict.update(dict.fromkeys(current_size_frequent_itemsets))
            # frequent_itemsets_dict.update(dict(frequent_itemsets.collect()))
            current_itemset_size += 1
            del frequent_itemsets  # perhaps the right move is to release the memory (unpersist) before the del? I'm not sure
            del current_size_frequent_itemsets
            gc.collect()

        gc.collect()
        return frequent_itemsets_list
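# generate_combination_with_filter is used above but not shown. Below is a
# hypothetical sketch of what it is assumed to do: from one sorted basket,
# emit every `size`-item combination all of whose (size-1)-subsets are already
# frequent (the a-priori pruning step). Singletons are assumed to be stored in
# frequent_itemsets_dict as plain items and larger itemsets as tuples,
# matching the dict built in pcy_for_rdd.
from itertools import combinations

def generate_combination_with_filter(basket, frequent_itemsets_dict, size):
    candidates = []
    for itemset in combinations(basket, size):
        if size == 2:
            frequent = all(item in frequent_itemsets_dict for item in itemset)
        else:
            frequent = all(tuple(subset) in frequent_itemsets_dict
                           for subset in combinations(itemset, size - 1))
        if frequent:
            candidates.append(list(itemset))
    return candidates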
Example #8
import re
from pyspark import RDD

def into_words(rdd_in: RDD) -> RDD:
    # Split on spaces, drop 1-char tokens, strip commas/parentheses from each token.
    words: RDD = rdd_in.flatMap(lambda x: x.split(' ')).filter(
        lambda x: len(x) > 1)
    return words.map(lambda x: re.sub(r"[,()]", "", x))
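# Small local check of into_words (assumes a local SparkContext):
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
lines = sc.parallelize(["to be, (or) not to be"])
print(into_words(lines).collect())  # ['to', 'be', 'or', 'not', 'to', 'be']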