Example #1
    def test_repartition_df(self, input_df):
        result_df = repartition_df(dataframe=input_df, partition_by=["timestamp"])

        # Only one partition id, meaning data is not partitioned
        assert input_df.select(spark_partition_id()).distinct().count() == 1
        # Desired number of partitions
        assert result_df.select(spark_partition_id()).distinct().count() == 200
Example #2
    def test__repartition_df(self, spark_session, spark_context):
        # arrange
        start = datetime.datetime(year=1970, month=1, day=1)
        end = datetime.datetime(year=2020, month=12, day=31)
        random_dates = [
            (
                start
                + datetime.timedelta(
                    seconds=random.randint(0, int((end - start).total_seconds()))  # noqa: S311
                )
            ).date().isoformat()
            for _ in range(10000)
        ]
        data = [{"timestamp": date} for date in random_dates]
        input_df = spark_session.read.json(spark_context.parallelize(data, 1),
                                           schema="timestamp timestamp")

        writer = HistoricalFeatureStoreWriter()

        # act
        result_df = writer._create_partitions(input_df)

        # assert
        # Only one partition id, meaning data is not partitioned
        assert input_df.select(spark_partition_id()).distinct().count() == 1
        # Desired number of partitions
        assert result_df.select(spark_partition_id()).distinct().count() == 200
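The `repartition_df` helper and `HistoricalFeatureStoreWriter._create_partitions` exercised above are not shown in these snippets. A minimal sketch of what such a repartition helper might look like (the signature and default behaviour are assumptions, not the project's actual code):

from typing import List, Optional

from pyspark.sql import DataFrame


def repartition_df(
    dataframe: DataFrame,
    partition_by: List[str],
    num_partitions: Optional[int] = None,
) -> DataFrame:
    """Hash-repartition a DataFrame by the given columns (assumed sketch)."""
    if num_partitions is None:
        # Without an explicit count, repartition() falls back to
        # spark.sql.shuffle.partitions (200 by default), which is why the
        # tests above assert 200 distinct partition ids.
        return dataframe.repartition(*partition_by)
    return dataframe.repartition(num_partitions, *partition_by)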
Example #3
def main(spark: SparkSession, inputfile: str, output_dir: str):
    logger.info(spark.version)
    logger.info(inputfile)
    logger.info(output_dir)
    flights_df = (spark.read.format("parquet").load(inputfile))

    # Write DataFrame in parquet format
    logger.info("flights_df : Number of Partitions : " +
                str(flights_df.rdd.getNumPartitions()))
    flights_df.groupBy(spark_partition_id()).count().show()
    # flights_df.write.mode('overwrite').parquet(f"{output_dir}org/")

    flights_partition_df = flights_df.repartition(5)
    logger.info("flights_partition_df : Number of Partitions : " +
                str(flights_partition_df.rdd.getNumPartitions()))
    flights_partition_df.groupBy(spark_partition_id()).count().show()

    # Write DataFrame with Partitions
    flights_partition_df.write \
        .mode('overwrite') \
        .partitionBy("OP_CARRIER", "ORIGIN") \
        .parquet(f"{output_dir}part_data/")

    # Write DataFrame with Partitions and control file sizes
    flights_partition_df.write \
        .mode('overwrite') \
        .partitionBy("OP_CARRIER", "ORIGIN") \
        .option("maxRecordsPerFile", 10000) \
        .parquet(f"{output_dir}part_size_data/")

    logger.info("done")
Example #4
    def attach_default_index(sdf, default_index_type=None):
        """
        This method attaches a default index to a Spark DataFrame. Spark does not have the
        notion of an index, so a corresponding column has to be generated.
        There are several types of default index, configurable via `compute.default_index_type`.
        """
        if default_index_type is None:
            default_index_type = get_option("compute.default_index_type")
        if default_index_type == "sequence":
            sequential_index = F.row_number().over(
                Window.orderBy(NATURAL_ORDER_COLUMN_NAME)) - 1
            scols = [scol_for(sdf, column) for column in sdf.columns]
            return sdf.select(sequential_index.alias(SPARK_INDEX_NAME_FORMAT(0)), *scols)
        elif default_index_type == "distributed-sequence":
            # 1. Calculates counts per each partition ID. `counts` here is, for instance,
            #     {
            #         1: 83,
            #         6: 83,
            #         3: 83,
            #         ...
            #     }
            counts = map(lambda x: (x["key"], x["count"]),
                         sdf.groupby(F.spark_partition_id().alias("key")).count().collect())

            # 2. Calculates cumulative sum in an order of partition id.
            #     Note that it does not matter if partition id guarantees its order or not.
            #     We just need a one-by-one sequential id.

            # sort by partition key.
            sorted_counts = sorted(counts, key=lambda x: x[0])
            # get cumulative sum in an order of partition key.
            cumulative_counts = accumulate(map(lambda count: count[1], sorted_counts))
            # zip it with partition key.
            sums = dict(zip(map(lambda count: count[0], sorted_counts), cumulative_counts))

            return_schema = StructType(
                [StructField(SPARK_INDEX_NAME_FORMAT(0), LongType())] + list(sdf.schema))
            columns = [f.name for f in return_schema]

            # 3. Group by partition id and assign each range.
            def default_index(pdf):
                current_partition_max = sums[pdf["__spark_partition_id"].iloc[0]]
                offset = len(pdf)
                pdf[SPARK_INDEX_NAME_FORMAT(0)] = list(range(
                    current_partition_max - offset, current_partition_max))
                return pdf[columns]

            grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(default_index)

            sdf = sdf.withColumn("__spark_partition_id", F.spark_partition_id())
            return sdf.groupBy("__spark_partition_id").apply(grouped_map_func)
        elif default_index_type == "distributed":
            scols = [scol_for(sdf, column) for column in sdf.columns]
            return sdf.select(
                F.monotonically_increasing_id().alias(SPARK_INDEX_NAME_FORMAT(0)), *scols)
        else:
            raise ValueError("'compute.default_index_type' should be one of 'sequence',"
                             " 'distributed-sequence' and 'distributed'")
Example #5
    def test_repartition_sort_df_processors_partitions(self, input_df):
        result_df = repartition_sort_df(
            dataframe=input_df,
            partition_by=["timestamp"],
            order_by=["timestamp"],
            num_partitions=50,
        )

        # Only one partition id, meaning data is not partitioned
        assert input_df.select(spark_partition_id()).distinct().count() == 1
        # Desired number of partitions
        assert result_df.select(spark_partition_id()).distinct().count() == 50
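As above, the `repartition_sort_df` helper itself is not shown. A hedged sketch of a repartition-then-sort helper consistent with this test (assumed implementation, not necessarily the project's own):

from typing import List, Optional

from pyspark.sql import DataFrame


def repartition_sort_df(
    dataframe: DataFrame,
    partition_by: List[str],
    order_by: List[str],
    num_partitions: Optional[int] = None,
) -> DataFrame:
    """Repartition by key columns, then sort rows inside each partition (assumed sketch)."""
    repartitioned = (
        dataframe.repartition(num_partitions, *partition_by)
        if num_partitions is not None
        else dataframe.repartition(*partition_by)
    )
    # sortWithinPartitions avoids a global sort; ordering holds only within each partition.
    return repartitioned.sortWithinPartitions(*order_by)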
Example #6
def encode_shares(
    batch_id,
    n_data,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    output_a,
    output_b,
):
    click.echo("Running encode shares")
    spark = spark_session()
    shares = (spark.read.json(input).withColumn(
        "pid", F.spark_partition_id()).groupBy("pid").applyInPandas(
            lambda pdf: udf.encode(batch_id, n_data, public_key_hex_internal,
                                   public_key_hex_external, pdf),
            schema="a: binary, b: binary",
        ).withColumn("id",
                     F.udf(lambda: str(uuid4()), returnType="string")()))
    shares.cache()
    row = shares.first()
    dataset_estimate_mb = ((len(b64encode(row.a)) + len(str(uuid4()))) *
                           n_rows * scale * 1.0 / 10**6)
    num_partitions = math.ceil(dataset_estimate_mb / partition_size_mb)
    click.echo(f"writing {num_partitions} partitions")
    repartitioned = shares.repartitionByRange(num_partitions, "id").cache()
    repartitioned.select("id",
                         F.base64("a").alias("payload")).write.json(
                             output_a, mode="overwrite")
    repartitioned.select("id",
                         F.base64("b").alias("payload")).write.json(
                             output_b, mode="overwrite")
Example #7
def partition_iterator(sdf):
    import pyspark.sql.functions as F
    sdf_part = sdf.withColumn('partition', F.spark_partition_id())
    sdf_part.cache()
    for part in range(sdf.rdd.getNumPartitions()):
        yield sdf_part.where(F.col('partition') == part).drop(
            'partition').rdd.toLocalIterator()
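A short usage sketch for the generator above (assuming an active SparkSession named `spark`):

# Assumed usage: pull each partition's rows to the driver, one partition at a time.
df = spark.range(100).repartition(4)
for i, rows in enumerate(partition_iterator(df)):
    print("partition", i, "has", sum(1 for _ in rows), "rows")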
Example #8
 def _launch_analysis(self, ds, df, udf, columns):
     histo_map_parts = (df.rdd.getNumPartitions() // 20) + 1
     return ds, df.select(udf(*columns).alias('histos')) \
                  .withColumn('hpid', fn.spark_partition_id() % histo_map_parts) \
                  .repartition(histo_map_parts, 'hpid') \
                  .groupBy('hpid').apply(reduce_histos) \
                  .groupBy().agg(agg_histos('histos')) \
                  .toPandas()
Example #9
    def attach_distributed_sequence_column(sdf, column_name):
        """
        This method attaches a Spark column that has a sequence in a distributed manner.
        This is equivalent to the column assigned when the default index type is 'distributed-sequence'.

        >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
        >>> sdf = _InternalFrame.attach_distributed_sequence_column(sdf, column_name="sequence")
        >>> sdf.sort("sequence").show()  # doctest: +NORMALIZE_WHITESPACE
        +--------+---+
        |sequence|  0|
        +--------+---+
        |       0|  a|
        |       1|  b|
        |       2|  c|
        +--------+---+
        """

        scols = [scol_for(sdf, column) for column in sdf.columns]

        # 1. Calculates counts per each partition ID. `counts` here is, for instance,
        #     {
        #         1: 83,
        #         6: 83,
        #         3: 83,
        #         ...
        #     }
        sdf = sdf.withColumn("__spark_partition_id", F.spark_partition_id())
        counts = map(
            lambda x: (x["key"], x["count"]),
            sdf.groupby(sdf["__spark_partition_id"].alias("key")).count().collect(),
        )

        # 2. Calculates cumulative sum in an order of partition id.
        #     Note that it does not matter if partition id guarantees its order or not.
        #     We just need a one-by-one sequential id.

        # sort by partition key.
        sorted_counts = sorted(counts, key=lambda x: x[0])
        # get cumulative sum in an order of partition key.
        cumulative_counts = [0] + list(accumulate(map(lambda count: count[1], sorted_counts)))
        # zip it with partition key.
        sums = dict(zip(map(lambda count: count[0], sorted_counts), cumulative_counts))

        # 3. Attach offset for each partition.
        @pandas_udf(LongType(), PandasUDFType.SCALAR)
        def offset(id):
            current_partition_offset = sums[id.iloc[0]]
            return pd.Series(current_partition_offset).repeat(len(id))

        sdf = sdf.withColumn("__offset__", offset("__spark_partition_id"))

        # 4. Calculate row_number in each partition.
        w = Window.partitionBy("__spark_partition_id").orderBy(F.monotonically_increasing_id())
        row_number = F.row_number().over(w)
        sdf = sdf.withColumn("__row_number__", row_number)

        # 5. Calculate the index.
        return sdf.select(F.expr("__offset__ + __row_number__ - 1").alias(column_name), *scols)
Example #10
def test_hash_repartition_exact(gen, num_parts):
    data_gen = gen[0]
    part_on = gen[1]
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : gen_df(spark, data_gen, length=1024)\
                    .repartition(num_parts, *part_on)\
                    .withColumn('id', f.spark_partition_id())\
                    .withColumn('hashed', f.hash(*part_on))\
                    .selectExpr('*', 'pmod(hashed, {})'.format(num_parts)))
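The extra `pmod(hashed, num_parts)` column works because `repartition(n, cols)` uses hash partitioning, so `spark_partition_id()` should match `pmod(hash(cols), n)`. A CPU-only sketch of the same check (assuming a local SparkSession named `spark`):

import pyspark.sql.functions as f

num_parts = 8
df = (spark.range(1024)
      .repartition(num_parts, "id")
      .withColumn("pid", f.spark_partition_id())
      .withColumn("expected", f.expr("pmod(hash(id), {})".format(num_parts))))
assert df.filter("pid != expected").count() == 0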
Example #11
    def _is_monotonic(self, order):
        assert order in ("increasing", "decreasing")

        sdf = self._internal.spark_frame

        sdf = (
            sdf.select(
                F.spark_partition_id().alias(
                    "__partition_id"
                ),  # Make sure we use the same partition id in the whole job.
                F.col(NATURAL_ORDER_COLUMN_NAME),
                self.spark_column.alias("__origin"),
            )
            .select(
                F.col("__partition_id"),
                F.col("__origin"),
                self._is_locally_monotonic_spark_column(order).alias(
                    "__comparison_within_partition"
                ),
            )
            .groupby(F.col("__partition_id"))
            .agg(
                F.min(F.col("__origin")).alias("__partition_min"),
                F.max(F.col("__origin")).alias("__partition_max"),
                F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True))).alias(
                    "__comparison_within_partition"
                ),
            )
        )

        # Now we're windowing the aggregation results without a partition specification.
        # The number of rows here equals the number of partitions, which is expected
        # to be small.
        window = Window.orderBy(F.col("__partition_id")).rowsBetween(-1, -1)
        if order == "increasing":
            comparison_col = F.col("__partition_min") >= F.lag(F.col("__partition_max"), 1).over(
                window
            )
        else:
            comparison_col = F.col("__partition_min") <= F.lag(F.col("__partition_max"), 1).over(
                window
            )

        sdf = sdf.select(
            comparison_col.alias("__comparison_between_partitions"),
            F.col("__comparison_within_partition"),
        )

        ret = sdf.select(
            F.min(F.coalesce(F.col("__comparison_between_partitions"), F.lit(True)))
            & F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True)))
        ).collect()[0][0]
        if ret is None:
            return True
        else:
            return ret
Example #12
def test_total_share(spark, root, args):
    raw = spark.read.json(str(root / "server_a" / "raw"))
    internal = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify2")
    )
    external = spark.read.json(
        str(root / "server_a" / "intermediate" / "external" / "verify2")
    )

    aggregates = (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        .repartition(2)
        .withColumn("pid", F.spark_partition_id())
        .groupBy("pid")
        .applyInPandas(
            lambda pdf: udf.aggregate(
                args.batch_id,
                args.n_data,
                args.server_id,
                args.private_key_hex,
                args.shared_secret,
                args.public_key_hex_internal,
                args.public_key_hex_external,
                pdf,
            ),
            schema="payload binary, error int, total int",
        )
    )
    aggregates.show()
    rows = aggregates.collect()
    assert len(rows) == 2
    assert {2, 3} == set(r.total for r in rows)
    assert all(r.error == 0 for r in rows)

    total_share = aggregates.groupBy().applyInPandas(
        lambda pdf: udf.total_share(
            args.batch_id,
            args.n_data,
            args.server_id,
            args.private_key_hex,
            args.shared_secret,
            args.public_key_hex_internal,
            args.public_key_hex_external,
            pdf,
        ),
        schema="payload binary, error int, total int",
    )
    total_share.show()

    rows = total_share.collect()
    assert len(rows) == 1
    assert len(rows[0].payload) > 0
    assert rows[0].total == 5
    assert rows[0].error == 0
Example #13
 def _launch_analysis(self, ds, df, udf, columns):
     histo_map_parts = (df.rdd.getNumPartitions() // 20) + 1
     return (
         ds,
         df.select(udf(*columns).alias("histos"))
         .withColumn("hpid", fn.spark_partition_id() % histo_map_parts)
         .repartition(histo_map_parts, "hpid")
         .groupBy("hpid")
         .apply(reduce_histos)
         .groupBy()
         .agg(agg_histos("histos"))
         .toPandas(),
     )
Example #14
    def __show_partitions_count(self, cardo_context, result):
        """
		:type result: list of CardoDataFrame
		"""
        for dataframe_index, cardo_dataframe in enumerate(result):
            cardo_context.logger.debug(
                'showing partitions of dataframe #{}'.format(dataframe_index))
            if cardo_dataframe.payload_type == 'dataframe':
                partitions = cardo_dataframe.dataframe.groupBy(
                    F.spark_partition_id()).count().collect()
                for partition in partitions:
                    cardo_context.logger.debug('partition id #{}: {}'.format(
                        partition[0], partition[1]))
            else:
                cardo_context.logger.debug(
                    'Cannot show partition status for {}'.format(
                        cardo_dataframe.payload_type))
Example #15
def aggregate(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    input_internal,
    input_external,
    output,
):
    """Generate an aggregate share from a batch of verified SNIPs"""
    click.echo("Running aggregate")
    spark = spark_session()
    shares = spark.read.json(input)
    internal = spark.read.json(input_internal)
    external = spark.read.json(input_external)

    args = [
        batch_id,
        n_data,
        server_id,
        private_key_hex,
        b64decode(shared_secret),
        public_key_hex_internal,
        public_key_hex_external,
    ]
    (shares.join(internal.withColumnRenamed("payload", "internal"),
                 on="id").join(
                     external.withColumnRenamed("payload", "external"),
                     on="id").select(
                         F.unbase64("payload").alias("shares"),
                         F.unbase64("internal").alias("internal"),
                         F.unbase64("external").alias("external"),
                         F.spark_partition_id().alias("pid"),
                     ).groupBy("pid").applyInPandas(
                         lambda pdf: udf.aggregate(*args, pdf),
                         schema="payload: binary, error: int, total: int",
                     ).groupBy().applyInPandas(
                         lambda pdf: udf.total_share(*args, pdf),
                         schema="payload: binary, error: int, total: int",
                     ).withColumn("payload", F.base64("payload"))).write.json(
                         output, mode="overwrite")
Example #16
def tf_serving_with_broadcasted_model(df, model_base_path=None, model_version=None, model_full_path=None, signature_def_key=None):
    import pyspark.sql.functions as F
    import pyspark.sql.types as T
    model = load_model(model_base_path, model_version,
                       model_full_path, signature_def_key)
    tf_output_schema = fetch_tensors_spark_schema(model.fetch_tensors)
    output_schema = T.StructType(df.schema.fields + tf_output_schema.fields)

    graph_def, feed_names, fetch_names, extra_ops = GraphDefPredictor.export_model(
        model)
    graph_def_serialized_bc = df.rdd.context.broadcast(graph_def)

    def func(pandas_df):
        """
        Batch inference on a panda dataframe
        """
        predictor_model = GraphDefPredictor(
            graph_def_serialized_bc.value, feed_names, fetch_names, extra_ops)
        return pandas_model_inference(predictor_model, pandas_df, output_schema.fieldNames())

    inference = F.pandas_udf(func, output_schema, F.PandasUDFType.GROUPED_MAP)
    return df.groupby(F.spark_partition_id()).apply(inference)
Example #17
from pyspark.sql import SparkSession
from lib.logger import Log4j
from pyspark.sql.functions import spark_partition_id

if __name__ == "__main__":
    conf = get_spark_app_config()
    spark = SparkSession.builder \
        .config(conf=conf) \
        .getOrCreate()

    logger = Log4j(spark)
    logger.info("Starting the pyspark application")

    logger.info("Reading data from csv files")
    movieCsvDf = load_movie_csv_df(spark)
    movieCsvDf.show(5)
    logger.info("Csv Schema: " + movieCsvDf.schema.simpleString())

    logger.info("Writing avro data to output path")
    logger.info("Number of partitions before :" +
                str(movieCsvDf.rdd.getNumPartitions()))
    movieCsvDf.groupBy(spark_partition_id()).count().show()

    partitionDf = movieCsvDf.repartition(5)
    logger.info("Number of partitions after :" +
                str(partitionDf.rdd.getNumPartitions()))
    partitionDf.groupBy(spark_partition_id()).count().show()
    write_movie_df(partitionDf)
    write_movie_json_df(partitionDf)
    logger.info("Completing the pyspark application")
Example #18
    def to_redshift(
        self,
        dataframe: DataFrame,
        path: str,
        connection: Any,
        schema: str,
        table: str,
        iam_role: str,
        diststyle: str = "AUTO",
        distkey: Optional[str] = None,
        sortstyle: str = "COMPOUND",
        sortkey: Optional[str] = None,
        min_num_partitions: int = 200,
        mode: str = "append",
    ) -> None:
        """
        Load Spark Dataframe as a Table on Amazon Redshift

        :param dataframe: Spark DataFrame
        :param path: S3 path to write temporary files (E.g. s3://BUCKET_NAME/ANY_NAME/)
        :param connection: A PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
        :param schema: The Redshift Schema for the table
        :param table: The name of the desired Redshift table
        :param iam_role: AWS IAM role with the related permissions
        :param diststyle: Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"] (https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html)
        :param distkey: Specifies a column name or positional number for the distribution key
        :param sortstyle: Sorting can be "COMPOUND" or "INTERLEAVED" (https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html)
        :param sortkey: List of columns to be sorted
        :param min_num_partitions: Minimal number of partitions
        :param mode: append or overwrite
        :return: None
        """
        logger.debug(f"Minimum number of partitions : {min_num_partitions}")
        if path[-1] != "/":
            path += "/"
        self._session.s3.delete_objects(path=path,
                                        procs_io_bound=self._procs_io_bound)
        spark: SparkSession = self._session.spark_session
        casts: Dict[str, str] = Spark._extract_casts(dataframe.dtypes)
        dataframe = Spark.date2timestamp(dataframe)
        dataframe.cache()
        num_rows: int = dataframe.count()
        logger.info(f"Number of rows: {num_rows}")
        num_partitions: int
        if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
            num_partitions = 1
        else:
            num_slices: int = self._session.redshift.get_number_of_slices(
                redshift_conn=connection)
            logger.debug(f"Number of slices on Redshift: {num_slices}")
            num_partitions = num_slices
            while num_partitions < min_num_partitions:
                num_partitions += num_slices
        logger.debug(f"Number of partitions calculated: {num_partitions}")
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        session_primitives = self._session.primitives
        par_col_name: str = "aws_data_wrangler_internal_partition_id"

        @pandas_udf(returnType="objects_paths string",
                    functionType=PandasUDFType.GROUPED_MAP)
        def write(pandas_dataframe: pd.DataFrame) -> pd.DataFrame:
            # Exporting ARROW_PRE_0_15_IPC_FORMAT environment variable for
            # a temporary workaround while waiting for Apache Arrow updates
            # https://stackoverflow.com/questions/58273063/pandasudf-and-pyarrow-0-15-0
            os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

            del pandas_dataframe[par_col_name]
            paths: List[str] = session_primitives.session.pandas.to_parquet(
                dataframe=pandas_dataframe,
                path=path,
                preserve_index=False,
                mode="append",
                procs_cpu_bound=1,
                procs_io_bound=1,
                cast_columns=casts)
            return pd.DataFrame.from_dict({"objects_paths": paths})

        df_objects_paths: DataFrame = dataframe.repartition(
            numPartitions=num_partitions)  # type: ignore
        df_objects_paths: DataFrame = df_objects_paths.withColumn(
            par_col_name, spark_partition_id())  # type: ignore
        df_objects_paths: DataFrame = df_objects_paths.groupby(
            par_col_name).apply(write)  # type: ignore

        objects_paths: List[str] = list(
            df_objects_paths.toPandas()["objects_paths"])
        dataframe.unpersist()
        num_files_returned: int = len(objects_paths)
        if num_files_returned != num_partitions:
            raise MissingBatchDetected(
                f"{num_files_returned} files returned. {num_partitions} expected."
            )
        logger.debug(f"List of objects returned: {objects_paths}")
        logger.debug(
            f"Number of objects returned from UDF: {num_files_returned}")
        manifest_path: str = f"{path}manifest.json"
        self._session.redshift.write_load_manifest(
            manifest_path=manifest_path,
            objects_paths=objects_paths,
            procs_io_bound=self._procs_io_bound)
        self._session.redshift.load_table(dataframe=dataframe,
                                          dataframe_type="spark",
                                          manifest_path=manifest_path,
                                          schema_name=schema,
                                          table_name=table,
                                          redshift_conn=connection,
                                          preserve_index=False,
                                          num_files=num_partitions,
                                          iam_role=iam_role,
                                          diststyle=diststyle,
                                          distkey=distkey,
                                          sortstyle=sortstyle,
                                          sortkey=sortkey,
                                          mode=mode,
                                          cast_columns=casts)
        self._session.s3.delete_objects(path=path,
                                        procs_io_bound=self._procs_io_bound)
Example #19
tot_Cnt =bizDF.count()
print("Total No of Rows: ", tot_Cnt)
unq_Cnt =bizDF.drop_duplicates().count()
print("Unique No of Rows: ", unq_Cnt)

# COMMAND ----------

# MAGIC %md ##### Check for Data Skewness

# COMMAND ----------

from pyspark.sql.functions import spark_partition_id
# get no of partitions
implicitPart = bizDF.rdd.getNumPartitions()
print("Implicit no of partitions:", implicitPart)
#get each partition size
partitions =bizDF.withColumn("Partition_id", spark_partition_id()).groupBy("Partition_id").count().orderBy("Partition_id")

# COMMAND ----------

# MAGIC %md ##### Convert Json to Parquet

# COMMAND ----------

# MAGIC %md Since JSON is storage-heavy, we convert the raw data to Parquet

# COMMAND ----------

outPath = "/mnt/preprocess_business"
bizDF.write.parquet(path=outPath, mode="overwrite",compression="snappy")
Example #20
if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("SparkSchemaDemo") \
        .getOrCreate()

    logger = Log4j(spark)

    flightTimeParquetDF = spark.read \
        .format("parquet") \
        .load("dataSource/flight*.parquet")

    logger.info("Num Partitions before: " +
                str(flightTimeParquetDF.rdd.getNumPartitions()))
    flightTimeParquetDF.groupBy(spark_partition_id()).count().show()

    partitionedDF = flightTimeParquetDF.repartition(5)
    logger.info("Num Partitions after: " +
                str(partitionedDF.rdd.getNumPartitions()))
    partitionedDF.groupBy(spark_partition_id()).count().show()

    partitionedDF.write \
        .format("avro") \
        .mode("overwrite") \
        .option("path", "dataSink/avro/") \
        .save()

    flightTimeParquetDF.write \
        .format("json") \
        .mode("overwrite") \
Example #21
def test_part_id():
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, short_gen, num_slices=8).select(
            f.col('a'), f.spark_partition_id()))
Example #22
def main():
    start_time = datetime.now()

    # copy gnaf tables to CSV
    pg_conn = psycopg2.connect(local_pg_connect_string)
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, state
                 FROM gnaf_202008.{}
             ) TO STDOUT WITH CSV"""
    # sql = """COPY (
    #              SELECT gnaf_pid, street_locality_pid, locality_pid, alias_principal, primary_secondary, building_name,
    #                     lot_number, flat_number, level_number, number_first, number_last, street_name, street_type,
    #                     street_suffix, address, locality_name, postcode, state, locality_postcode, confidence,
    #                     legal_parcel_id, mb_2011_code, mb_2016_code, latitude, longitude, geocode_type, reliability
    #              FROM gnaf_202008.{}
    #          ) TO STDOUT WITH CSV"""

    # address principals
    with open(os.path.join(output_path, "gnaf_light.csv"), 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)
        # pg_cur.copy_expert(sql.format("address_principals") + " HEADER", csv_file)

    # address aliases
    with open(os.path.join(output_path, "gnaf_light.csv"), 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_conn.close()

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() -
                                                              start_time))
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession.builder.master("local[*]").appName("query").config(
        "spark.sql.session.timeZone",
        "UTC").config("spark.sql.debug.maxToStringFields", 100).config(
            "spark.serializer", KryoSerializer.getName).config(
                "spark.kryo.registrator",
                GeoSparkKryoRegistrator.getName).config(
                    "spark.cores.max",
                    cpu_count()).config("spark.sql.adaptive.enabled",
                                        "true").config("spark.driver.memory",
                                                       "8g").getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # load gnaf points
    df = spark.read \
        .option("header", True) \
        .option("inferSchema", True) \
        .csv(input_file_name)
    # df.printSchema()
    # df.show()

    # # manually assign field types (not needed here as inferSchema works)
    # df2 = (df
    #        .withColumn("confidence", df.confidence.cast(t.ShortType()))
    #        .withColumn("mb_2011_code", df.mb_2011_code.cast(t.LongType()))
    #        .withColumn("mb_2016_code", df.mb_2016_code.cast(t.LongType()))
    #        .withColumn("reliability", df.reliability.cast(t.ShortType()))
    #        .withColumn("longitude", df.longitude.cast(t.DoubleType()))
    #        .withColumn("latitude", df.latitude.cast(t.DoubleType()))
    #        )
    # # df2.printSchema()
    # # df2.show()

    # add point geometries and partition by longitude into 400-500k row partitions
    gnaf_df = df.withColumn("geom", f.expr("ST_Point(longitude, latitude)"))
    # .withColumnRenamed("gnaf_pid", "id")
    # .withColumn("partition_id", (f.percent_rank().over(Window.partitionBy().orderBy("longitude")) * f.lit(100.0))
    #             .cast(t.ShortType())) \
    # .repartitionByRange(100, "partition_id") \
    # gnaf_df.printSchema()

    # check partition counts
    gnaf_df.groupBy(f.spark_partition_id()).count().show()

    # write gnaf to gzipped parquet
    export_to_parquet(gnaf_df, "gnaf")

    # export PG boundary tables to parquet
    export_bdys(spark, "commonwealth_electorates", "ce_pid")
    export_bdys(spark, "local_government_areas", "lga_pid")
    export_bdys(spark, "local_government_wards", "ward_pid")
    export_bdys(spark, "state_lower_house_electorates", "se_lower_pid")
    export_bdys(spark, "state_upper_house_electorates", "se_upper_pid")

    # cleanup
    spark.stop()

    logger.info(
        "\t - GNAF and boundaries exported to gzipped parquet files: {}".
        format(datetime.now() - start_time))
Example #23
    def _attach_distributed_sequence_column(sdf, column_name):
        """
        >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
        >>> sdf = InternalFrame._attach_distributed_sequence_column(sdf, column_name="sequence")
        >>> sdf.sort("sequence").show()  # doctest: +NORMALIZE_WHITESPACE
        +--------+---+
        |sequence|  0|
        +--------+---+
        |       0|  a|
        |       1|  b|
        |       2|  c|
        +--------+---+
        """
        scols = [scol_for(sdf, column) for column in sdf.columns]

        spark_partition_column = verify_temp_column_name(
            sdf, "__spark_partition_id__")
        offset_column = verify_temp_column_name(sdf, "__offset__")
        row_number_column = verify_temp_column_name(sdf, "__row_number__")

        # 1. Calculates counts per each partition ID. `counts` here is, for instance,
        #     {
        #         1: 83,
        #         6: 83,
        #         3: 83,
        #         ...
        #     }
        sdf = sdf.withColumn(spark_partition_column, F.spark_partition_id())

        # Checkpoint the DataFrame to fix the partition ID.
        sdf = sdf.localCheckpoint(eager=False)

        counts = map(
            lambda x: (x["key"], x["count"]),
            sdf.groupby(
                sdf[spark_partition_column].alias("key")).count().collect(),
        )

        # 2. Calculates cumulative sum in an order of partition id.
        #     Note that it does not matter if partition id guarantees its order or not.
        #     We just need a one-by-one sequential id.

        # sort by partition key.
        sorted_counts = sorted(counts, key=lambda x: x[0])
        # get cumulative sum in an order of partition key.
        cumulative_counts = [0] + list(
            accumulate(map(lambda count: count[1], sorted_counts)))
        # zip it with partition key.
        sums = dict(
            zip(map(lambda count: count[0], sorted_counts), cumulative_counts))

        # 3. Attach offset for each partition.
        @pandas_udf(LongType(), PandasUDFType.SCALAR)
        def offset(id):
            current_partition_offset = sums[id.iloc[0]]
            return pd.Series(current_partition_offset).repeat(len(id))

        sdf = sdf.withColumn(offset_column, offset(spark_partition_column))

        # 4. Calculate row_number in each partition.
        w = Window.partitionBy(spark_partition_column).orderBy(
            F.monotonically_increasing_id())
        row_number = F.row_number().over(w)
        sdf = sdf.withColumn(row_number_column, row_number)

        # 5. Calculate the index.
        return sdf.select((sdf[offset_column] + sdf[row_number_column] -
                           1).alias(column_name), *scols)
Example #24
def show_partition_id(df, col):
    return df.select(col, spark_partition_id().alias("partition_id")).show(1000, True)
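A usage sketch for the helper above (assumed data), comparing the partition layout before and after a hash repartition; it assumes `from pyspark.sql.functions import spark_partition_id` and an active SparkSession named `spark`:

df = spark.range(10).withColumnRenamed("id", "key")
show_partition_id(df, "key")                         # layout as generated
show_partition_id(df.repartition(3, "key"), "key")   # layout after hashing "key" into 3 partitions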
Example #25
def main():
    # Read SparkDriver.properties as ConfigParser() object
    appconfig = ConfigParser()
    appconfig.read(filenames='SparkDriver.properties')
    print(f'Properties file sections: {appconfig.sections()}')
    sanityChecks(config_file=appconfig)

    # Create Spark Session object
    spark = getSparkSessionObject(appconfig=appconfig)

    # Read XPATH Mappings given in Csv
    xpaths_mapping_df: DataFrame = readFromSource(
        spark=spark,
        opt={
            'location':
            f'{str(appconfig["Xml"]["XpathMappingsCsvFilePath"]).strip()}',
            'filetype': 'csv',
            'header': True,
            'inferSchema': True
        })

    # Use the mapper module to build Spark SQL queries from the XPath mappings CSV
    spark_sql_query = buildQueriesFromXpath(df=xpaths_mapping_df)

    # Alternatively, create an external table by building its DDL with the buildDdlFromXpath function
    # spark_sql_ddl = buildDdlFromXpath(appconfig=appconfig, df=xpaths_mapping_df)

    # Read the actual huge multi-line XML file; XmlInputFormat determines the row tag and eliminates
    # newlines so that every start and end tag ends up on a single line in the DataFrame
    xml_df: DataFrame = mapXmlAsHadoopFile(
        location=str(appconfig['Xml']['FileLocation']))

    # Determine revised partitions for the DataFrame, as the XML data is skewed
    total_records = xml_df.count()
    total_paritions = xml_df.rdd.getNumPartitions()
    total_records_per_partition = xml_df.groupBy(
        spark_partition_id()).count().select('count').collect()
    total_executors = int(
        spark.conf.get("spark.executor.instances", default="12").strip())
    total_cores = int(
        spark.conf.get("spark.executor.cores", default="3").strip())
    total_paritions_revised = total_cores * total_executors

    print(f"total_records = {total_records}")
    print(f"total_paritions = {total_paritions}")
    print(f"total_records_per_partition = {total_records_per_partition}")
    print(f"total_executors = {total_executors}")
    print(f"total_cores = {total_cores}")
    print(f"total_paritions_revised = {total_paritions_revised}")

    xml_df = xml_df.repartition(total_paritions_revised)

    # Execute the query in spark sql
    writedf: DataFrame = spark.sql(spark_sql_query['query'])

    # You can also use ddl and execute as spark SQL
    # spark.sql(spark_sql_ddl['ddl'])
    # spark.sql('LOAD DATA INPATH f"{str(appconfig['Xml']['FileLocation'])}" INTO xmltable')

    # Write data
    writedf.write.mode('overwrite').parquet(
        path=str(appconfig['Xml']['TargetWritePath']))
Example #26
        'uniq_key',
        trim(
            regexp_replace(
                regexp_replace(
                    upper(
                        concat_ws('\x00', coalesce('LNAME', lit('')),
                                  coalesce('Address', lit('')))),
                    r'[^\x00\s\w]+', ''), r'\s+', ' '))))

    # tweak the number of partitions N based on the real data size
    N = 5

    # use dense_rank to calculate the in-partition idx
    w1 = Window.partitionBy('partition_id').orderBy('uniq_key')
    df1 = df.repartition(N, 'uniq_key') \
            .withColumn('partition_id', spark_partition_id()) \
            .withColumn('idx', dense_rank().over(w1))

    # get the number of unique rows (based on Address+LNAME) per partition, which is max(idx),
    # and then grab the running SUM of this cnt -> rcnt
    # partition_id: spark partition id
    # idx: calculated in-partition id
    # cnt: number of unique ids in the same partition: fmax('idx')
    # rcnt: starting id for a partition (something like a running count): coalesce(fsum('cnt').over(w2), lit(0))
    # w2: WindowSpec to calculate the above rcnt
    w2 = Window.partitionBy().orderBy('partition_id').rowsBetween(
        Window.unboundedPreceding, -1)

    df2 = df1.groupby('partition_id') \
             .agg(fmax('idx').alias('cnt')) \
             .withColumn('rcnt', coalesce(fsum('cnt').over(w2),lit(0)))
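The snippet stops at `df2`; a hedged sketch of the remaining step implied by the comments, joining each partition's starting id (`rcnt`) back and adding the in-partition index to get a globally unique id (`broadcast` and `col` are assumed to be imported from `pyspark.sql.functions`):

    # assumed final step, not part of the original snippet
    df3 = df1.join(broadcast(df2.select('partition_id', 'rcnt')), on='partition_id') \
             .withColumn('uniq_id', col('rcnt') + col('idx'))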
Example #27
File: accessors.py  Project: Rasha27/koalas
    def apply_batch(self, func, args=(), **kwds):
        """
        Apply a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas
        DataFrame given to the function is of a batch used internally.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. Koalas internally
            splits the input frame into multiple batches and calls `func` with each batch multiple
            times. Therefore, operations such as global aggregations are impossible. See the example
            below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ks.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)])
            ...
            >>> df = ks.DataFrame({'A': range(1000)})
            >>> df.koalas.apply_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...
            10  83
            11  83

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ks.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas friendly style as below:

            >>> def plus_one(x) -> ks.DataFrame["a": float, "b": float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ks.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1


        Parameters
        ----------
        func : function
            Function to apply to each pandas frame.
        args : tuple
            Positional arguments to pass to `func` in addition to the
            array/series.
        **kwds
            Additional keyword arguments to pass as keywords arguments to
            `func`.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.apply: For row/columnwise operations.
        DataFrame.applymap: For elementwise operations.
        DataFrame.aggregate: Only perform aggregating type operations.
        DataFrame.transform: Only perform transforming type operations.
        Series.koalas.transform_batch: Transform the data in each pandas chunk.

        Examples
        --------
        >>> df = ks.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def query_func(pdf) -> ks.DataFrame[int, int]:
        ...     return pdf.query('A == 1')
        >>> df.koalas.apply_batch(query_func)
           c0  c1
        0   1   2

        >>> def query_func(pdf) -> ks.DataFrame["A": int, "B": int]:
        ...     return pdf.query('A == 1')
        >>> df.koalas.apply_batch(query_func)
           A  B
        0  1  2

        You can also omit the type hints so Koalas infers the return schema as below:

        >>> df.koalas.apply_batch(lambda pdf: pdf.query('A == 1'))
           A  B
        0  1  2

        You can also specify extra arguments.

        >>> def calculation(pdf, y, z) -> ks.DataFrame[int, int]:
        ...     return pdf ** y + z
        >>> df.koalas.apply_batch(calculation, args=(10,), z=20)
                c0        c1
        0       21      1044
        1    59069   1048596
        2  9765645  60466196

        You can also use ``np.ufunc`` and built-in functions as input.

        >>> df.koalas.apply_batch(np.add, args=(10,))
            A   B
        0  11  12
        1  13  14
        2  15  16

        >>> (df * -1).koalas.apply_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        """
        # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate?

        from databricks.koalas.groupby import GroupBy
        from databricks.koalas.frame import DataFrame
        from databricks import koalas as ks

        if not isinstance(func, types.FunctionType):
            assert callable(
                func), "the first argument should be a callable function."
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        should_use_map_in_pandas = LooseVersion(pyspark.__version__) >= "3.0"

        original_func = func
        func = lambda o: original_func(o, *args, **kwds)

        self_applied = DataFrame(self._kdf._internal.resolved_copy)

        if should_infer_schema:
            # Here we execute with the first 1000 to get the return type.
            # If the records were less than 1000, it uses pandas API directly for a shortcut.
            limit = ks.get_option("compute.shortcut_limit")
            pdf = self_applied.head(limit + 1)._to_internal_pandas()
            applied = func(pdf)
            if not isinstance(applied, pd.DataFrame):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(applied))
            kdf = ks.DataFrame(applied)
            if len(pdf) <= limit:
                return kdf

            return_schema = kdf._internal.to_internal_spark_frame.schema
            if should_use_map_in_pandas:
                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True)
                sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                    lambda iterator: map(output_func, iterator),
                    schema=return_schema)
            else:
                sdf = GroupBy._spark_group_map_apply(
                    self_applied,
                    func, (F.spark_partition_id(), ),
                    return_schema,
                    retain_index=True)

            # If schema is inferred, we can restore indexes too.
            internal = kdf._internal.with_new_sdf(sdf)
        else:
            return_type = infer_return_type(original_func)
            return_schema = return_type.tpe
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe:
                raise TypeError(
                    "The given function should specify a frame as its type "
                    "hints; however, the return type was %s." % return_sig)

            if should_use_map_in_pandas:
                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False)
                sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                    lambda iterator: map(output_func, iterator),
                    schema=return_schema)
            else:
                sdf = GroupBy._spark_group_map_apply(
                    self_applied,
                    func, (F.spark_partition_id(), ),
                    return_schema,
                    retain_index=False)

            # Otherwise, it loses index.
            internal = InternalFrame(spark_frame=sdf, index_map=None)

        return DataFrame(internal)
Example #28
# --------------------------------------------------------------------------------------------------

schema = StructType([
    StructField("id", IntegerType()),
    StructField("type", StringType()),
    StructField("quantity", IntegerType())
])

df = spark.read.option('header', 'true').csv(data_dir + 'dummy', schema=schema)
# --------------------------------------------------------------------------------------------------

# Partition
# TODO: foreachPartition
# TODO: sortWithinPartitions
df.rdd.getNumPartitions()
df.select('*', F.spark_partition_id().alias("pid")).show()
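# A minimal sketch of the two TODOs above (assumed usage, not from the original script):
# sortWithinPartitions orders rows inside each partition without triggering a global shuffle.
df.sortWithinPartitions('quantity').select('*', F.spark_partition_id().alias('pid')).show()
# foreachPartition runs a function once per partition on the executors
# (anything printed there lands in the executor logs, not the driver console).
df.foreachPartition(lambda rows: print('rows in this partition:', sum(1 for _ in rows)))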

df.where(df.type == 'apple').show()  # where = filter

# Select
df.select('*', 'id').show()
df.select(F.concat(df.id, F.lit('-'), df.type).alias('s')).show()
df.select(F.format_string('%d-%s', df.id, df.type).alias('test_format')).show()
df.select(df.id.cast(StringType())).printSchema()
df.select(
    "*",
    F.when(df.quantity > 50,
           'High').when(df.quantity < 30,
                        'Low').otherwise('Medium').name('Q')).show()

# Filter
Example #29
print("Total No of Rows: ", tot_Cnt)
unq_Cnt = userDF.drop_duplicates().count()
print("Unique No of Rows: ", unq_Cnt)

# COMMAND ----------

# MAGIC %md ##### Check for Data Skewness

# COMMAND ----------

from pyspark.sql.functions import spark_partition_id
# get no of partitions
implicitPart = userDF.rdd.getNumPartitions()
print("Implicit no of partitions:", implicitPart)
#get each partition size
partitions = userDF.withColumn("Partition_id", spark_partition_id()).groupBy(
    "Partition_id").count().orderBy("Partition_id")
#partitions =userDF.withColumn("Partition_id", spark_partition_id())
#distPartitions = partitions.select("partition_id").distinct()
display(partitions)

# COMMAND ----------

# MAGIC %md Here we can see the data is skewed, resulting in unevenly sized partitions.

# COMMAND ----------

# MAGIC %md ##### Repartitioning the data to avoid data skewness

# COMMAND ----------
Example #30
    def attach_default_index(sdf):
        """
        This method attaches a default index to a Spark DataFrame. Spark does not have the
        notion of an index, so a corresponding column has to be generated.

        There are three types of default index that can be controlled by the `DEFAULT_INDEX`
        environment variable.

        - one-by-one: It implements a one-by-one sequence using a Window function without
            specifying a partition. Therefore, it ends up with the whole data in a single
            partition on a single node. This index type should be avoided when the data is
            large. This is the default.

        - distributed-one-by-one: It implements a one-by-one sequence by a group-by and
            group-map approach, and still generates a sequential index globally.
            If the default index must be a one-by-one sequence in a large dataset, this
            index has to be used.
            Note that if more data are added to the data source after creating this index,
            the sequential index is not guaranteed.

        - distributed: It implements a monotonically increasing sequence simply by using
            Spark's `monotonically_increasing_id` function. If the index does not have to be
            a one-by-one sequence, this index should be used. Performance-wise, this index
            has almost no penalty compared to other index types.
            Note that this type of index cannot be used for combining two dataframes,
            because the two dataframes are not guaranteed to have the same indexes.

        """
        default_index_type = os.environ.get("DEFAULT_INDEX", "one-by-one")
        if default_index_type == "one-by-one":
            sequential_index = F.row_number().over(
                Window.orderBy(F.monotonically_increasing_id().asc())) - 1
            scols = [scol_for(sdf, column) for column in sdf.columns]
            return sdf.select(sequential_index.alias("__index_level_0__"),
                              *scols)
        elif default_index_type == "distributed-one-by-one":
            # 1. Calculates counts per each partition ID. `counts` here is, for instance,
            #     {
            #         1: 83,
            #         6: 83,
            #         3: 83,
            #         ...
            #     }
            counts = map(
                lambda x: (x["key"], x["count"]),
                sdf.groupby(
                    F.spark_partition_id().alias("key")).count().collect())

            # 2. Calculates cumulative sum in an order of partition id.
            #     Note that it does not matter if partition id guarantees its order or not.
            #     We just need a one-by-one sequential id.

            # sort by partition key.
            sorted_counts = sorted(counts, key=lambda x: x[0])
            # get cumulative sum in an order of partition key.
            cumulative_counts = accumulate(
                map(lambda count: count[1], sorted_counts))
            # zip it with partition key.
            sums = dict(
                zip(map(lambda count: count[0], sorted_counts),
                    cumulative_counts))

            # 3. Group by partition id and assign each range.
            def default_index(pdf):
                current_partition_max = sums[
                    pdf["__spark_partition_id"].iloc[0]]
                offset = len(pdf)
                pdf["__index_level_0__"] = list(
                    range(current_partition_max - offset,
                          current_partition_max))
                return pdf.drop(columns=["__spark_partition_id"])

            return_schema = StructType(
                [StructField("__index_level_0__", LongType())] +
                list(sdf.schema))
            grouped_map_func = pandas_udf(
                return_schema, PandasUDFType.GROUPED_MAP)(default_index)

            sdf = sdf.withColumn("__spark_partition_id",
                                 F.spark_partition_id())
            return sdf.groupBy("__spark_partition_id").apply(grouped_map_func)
        elif default_index_type == "distributed":
            scols = [scol_for(sdf, column) for column in sdf.columns]
            return sdf.select(
                F.monotonically_increasing_id().alias("__index_level_0__"),
                *scols)
        else:
            raise ValueError(
                "'DEFAULT_INDEX' environment variable should be one of 'one-by-one',"
                " 'distributed-one-by-one' and 'distributed'")