Exemplo n.º 1
0
    def hist(columns, min_value, max_value, buckets=10):
        """
        Get the histogram of a column as a list of bucket dictionaries.

        The column is bucketized with the splits returned by
        ``create_buckets``, the rows are counted per bucket, and every split
        is reported together with the count found for its bucket.

        NOTE(review): ``self`` is not a parameter of this function; it is
        resolved from the enclosing scope, which is not visible here.
        NOTE(review): the result is rebuilt for every column, so when several
        columns are selected only the histogram of the last one is returned.

        :param columns: Columns to be processed
        :param min_value: Min value used to calculate the buckets
        :param max_value: Max value used to calculate the buckets
        :param buckets: Number of buckets
        :return: List of ``{"lower", "upper", "count"}`` dicts, one per split
            and in split order. Buckets without rows get a count of 0. The
            list is empty when no column is selected.
        """

        columns = parse_columns(self, columns)

        # The splits only depend on the arguments, so they are built once
        # instead of once per column (assumes create_buckets has no side
        # effects -- confirm).
        splits = create_buckets(min_value, max_value, buckets)

        # Bound before the loop so an empty column selection returns []
        # instead of failing with UnboundLocalError at the return statement.
        hist_data = []
        for col_name in columns:
            bucket_col = col_name + "_buckets"

            # Create buckets in the dataFrame
            df = bucketizer(self, col_name, splits=splits)

            counts = (df.groupBy(bucket_col).agg(
                F.count(bucket_col).alias("count")).cols.rename(
                    bucket_col, "value").sort(F.asc("value")).to_json())

            # groupBy only yields the buckets that actually contain rows (and
            # possibly a null group), so pairing counts and splits by position
            # would attach counts to the wrong range. Look the count up by
            # bucket index instead.
            # Assumes "value" is the zero-based position of the split, as
            # produced by bucketizer -- confirm.
            count_by_bucket = {row["value"]: row["count"] for row in counts}

            hist_data = [{
                "lower": split["lower"],
                "upper": split["upper"],
                "count": count_by_bucket.get(index, 0)
            } for index, split in enumerate(splits)]

        return hist_data
Exemplo n.º 2
0
    def hist(columns, min_value, max_value, buckets=10):
        """
        Get the histogram of a column as a list of bucket dictionaries.

        The column is bucketized with the splits returned by
        ``create_buckets``, the data is repartitioned on the bucket column,
        the rows are counted per bucket, and every split is reported together
        with the count found for its bucket index.

        NOTE(review): ``self`` is not a parameter of this function; it is
        resolved from the enclosing scope, which is not visible here.
        NOTE(review): the result is rebuilt for every column, so when several
        columns are selected only the histogram of the last one is returned.

        :param columns: Columns to be processed
        :param min_value: Min value used to calculate the buckets
        :param max_value: Max value used to calculate the buckets
        :param buckets: Number of buckets
        :return: List of ``{"count", "lower", "upper"}`` dicts, one per split
            and in split order. Buckets without rows get a count of 0. The
            list is empty when no column is selected.
        """

        columns = parse_columns(self, columns)

        # The splits only depend on the arguments, so they are built once
        # instead of once per column (assumes create_buckets has no side
        # effects -- confirm).
        splits = create_buckets(min_value, max_value, buckets)

        # Bound before the loop so an empty column selection returns []
        # instead of failing with UnboundLocalError at the return statement.
        hist_data = []
        for col_name in columns:
            # Create buckets in the dataFrame
            df = bucketizer(self, col_name, splits=splits)

            col_bucket = col_name + "_buckets"

            counts = (df.h_repartition(
                col_name=col_bucket).groupBy(col_bucket).agg(
                    F.count(col_bucket).alias("count")).cols.rename(
                        col_bucket, "value").sort(F.asc("value")).to_json())

            # Index the counts by bucket number. Buckets missing from the
            # aggregation (no rows) fall back to 0 below, which fills the gaps
            # without rescanning `counts` once per bucket. Rows whose value is
            # not a bucket index (e.g. None) are simply never looked up.
            count_by_bucket = {row["value"]: row["count"] for row in counts}

            hist_data = [{
                "count": count_by_bucket.get(index, 0),
                "lower": split["lower"],
                "upper": split["upper"]
            } for index, split in enumerate(splits)]

        return hist_data