Example 1
import os

from pyspark.sql import SparkSession


def add_packaged_environment(ssb: SparkSession.Builder, archive: str):
    if archive.endswith('pex'):
        # Ship the PEX archive to the executors and point both the executors
        # and the driver at the packaged interpreter.
        _add_or_merge(ssb, "spark.yarn.dist.files", archive)
        ssb.config("spark.executorEnv.PEX_ROOT", "./.pex")
        os.environ['PYSPARK_PYTHON'] = './' + archive.split('/')[-1]
        os.environ['PYSPARK_DRIVER_PYTHON'] = archive.split('/')[-1]
    else:
        # Ship a conda environment archive, unpacked on the executors as "condaenv".
        _add_archive(ssb, f"{archive}#condaenv")
        os.environ['PYSPARK_PYTHON'] = "./condaenv/bin/python"
        os.environ['PYSPARK_DRIVER_PYTHON'] = 'python'
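A minimal usage sketch, assuming this function and its module-level helpers (_add_or_merge, _add_archive) are in scope; the archive path is purely illustrative. The function mutates both the builder's config and the PYSPARK_* environment variables, so it should run before getOrCreate():

from pyspark.sql import SparkSession

builder = SparkSession.builder.master("yarn").appName("packaged-env-demo")
add_packaged_environment(builder, "hdfs:///user/me/venv.pex")  # illustrative path
spark = builder.getOrCreate()  # executors now use the shipped interpreter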
Example 2
def configure_spark_with_delta_pip(
        spark_session_builder: SparkSession.Builder) -> SparkSession.Builder:
    """
    Utility function to configure a SparkSession builder such that the generated SparkSession
    will automatically download the required Delta Lake JARs from Maven. This function is
    required when you want to

    1. Install Delta Lake locally using pip, and

    2. Execute your Python code using Delta Lake + Pyspark directly, that is, not using
       `spark-submit --packages io.delta:...` or `pyspark --packages io.delta:...`.

        builder = SparkSession.builder \
            .master("local[*]") \
            .appName("test")

        spark = configure_spark_with_delta_pip(builder).getOrCreate()

    :param spark_session_builder: SparkSession.Builder object being used to configure and
                                  create a SparkSession.
    :return: Updated SparkSession.Builder object

    .. versionadded:: 1.0

    .. note:: Evolving
    """
    import importlib_metadata  # load this library only when this function is called

    if type(spark_session_builder) is not SparkSession.Builder:
        msg = f'''
This function must be called with a SparkSession builder as the argument.
The argument found is of type {str(type(spark_session_builder))}.
See the online documentation for the correct usage of this function.
        '''
        raise TypeError(msg)

    try:
        delta_version = importlib_metadata.version("delta_spark")
    except Exception as e:
        msg = '''
This function can be used only when Delta Lake has been locally installed with pip.
See the online documentation for the correct usage of this function.
        '''
        raise Exception(msg) from e

    scala_version = "2.12"
    maven_artifact = f"io.delta:delta-core_{scala_version}:{delta_version}"

    return spark_session_builder.config("spark.jars.packages", maven_artifact)
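A quick check of the result, offered as a sketch only: it assumes delta-spark is pip-installed and peeks at the builder's private _options dict (not a public API) to show where the Maven coordinate lands.

from pyspark.sql import SparkSession

builder = SparkSession.builder.master("local[*]").appName("test")
configured = configure_spark_with_delta_pip(builder)
# The coordinate is registered under spark.jars.packages,
# e.g. "io.delta:delta-core_2.12:<installed delta-spark version>".
print(configured._options["spark.jars.packages"])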
Example 3
def main(argv: List[str],
         cwd: Path_T,
         builder: SparkSession_T.Builder,
         driver_memory: str = '16g') -> None:
    [stats_fn, ddict_dir, out_fn] = argv[1:4]

    spark = (builder.appName(__file__)
             # 'driver-memory' is not a Spark config key; use spark.driver.memory
             .config('spark.driver.memory', driver_memory)
             .getOrCreate())

    ndd = DataDictionary.make_in(spark, cwd / ddict_dir)
    job = SyntheticData(spark)
    stats = spark.read.csv(stats_fn, header=True, inferSchema=True)
    records = job.synthesize_data(stats, ndd.record_layout)
    records.to_pickle(',syn_records_TMP.pkl')
    with (cwd / out_fn).open('w') as out:
        for line in flat_file.naaccr_make_fwf(records,
                                              ndd.record_layout.toPandas()):
            out.write(line)
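A hypothetical invocation of this entry point, assuming it lives in the module that defines DataDictionary, SyntheticData and flat_file, and that Path_T and SparkSession_T alias pathlib.Path and pyspark.sql.SparkSession; the file names and local master are illustrative only.

from pathlib import Path
from pyspark.sql import SparkSession

# argv[0] is the program name; argv[1:4] are the stats CSV, the data-dictionary
# directory, and the fixed-width output file, in that order.
main(['synthesize', 'nominal_stats.csv', 'ddict', 'naaccr_synthetic.fwf'],
     cwd=Path('.'),
     builder=SparkSession.builder.master('local[*]'))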
Example 4
def main(argv: List[str],
         cwd: Path_T,
         builder: SparkSession_T.Builder,
         driver_memory: str = '16g') -> None:
    [naaccr_file, sample_, ddict_dir, stats_out] = argv[1:5]
    sample = int(sample_)
    spark = (builder.appName(__file__)
             # 'driver-memory' is not a Spark config key; use spark.driver.memory
             .config('spark.driver.memory', driver_memory)
             .getOrCreate())

    ndd = DataDictionary.make_in(spark, cwd / ddict_dir)
    data_raw = naaccr_read_fwf(
        spark.read.text(str(cwd / naaccr_file)).sample(False, sample / 100),
        ndd.record_layout,
    )
    stats = DataSummary.nominal_stats(data_raw, spark, ndd)

    stats.to_csv(cwd / stats_out)
    print(stats.head(10))
Example 5
def configure_spark_with_delta_pip(
        spark_session_builder: SparkSession.Builder,
        extra_packages: Optional[List[str]] = None) -> SparkSession.Builder:
    """
    Utility function to configure a SparkSession builder such that the generated SparkSession
    will automatically download the required Delta Lake JARs from Maven. This function is
    required when you want to

    1. Install Delta Lake locally using pip, and

    2. Execute your Python code using Delta Lake + Pyspark directly, that is, not using
       `spark-submit --packages io.delta:...` or `pyspark --packages io.delta:...`.

        builder = SparkSession.builder \
            .master("local[*]") \
            .appName("test")

        spark = configure_spark_with_delta_pip(builder).getOrCreate()

    3. If you would like to add more packages, use the `extra_packages` parameter.

        builder = SparkSession.builder \
            .master("local[*]") \
            .appName("test")
        my_packages = ["org.apache.spark:spark-sql-kafka-0-10_2.12:x.y.z"]
        spark = configure_spark_with_delta_pip(builder, extra_packages=my_packages).getOrCreate()

    :param spark_session_builder: SparkSession.Builder object being used to configure and
                                  create a SparkSession.
    :param extra_packages: Additional Maven packages to add to the Spark session besides Delta Lake.
    :return: Updated SparkSession.Builder object

    .. versionadded:: 1.0

    .. note:: Evolving
    """
    import importlib_metadata  # load this library only when this function is called

    if type(spark_session_builder) is not SparkSession.Builder:
        msg = f'''
This function must be called with a SparkSession builder as the argument.
The argument found is of type {str(type(spark_session_builder))}.
See the online documentation for the correct usage of this function.
        '''
        raise TypeError(msg)

    try:
        delta_version = importlib_metadata.version("delta_spark")
    except Exception as e:
        msg = '''
This function can be used only when Delta Lake has been locally installed with pip.
See the online documentation for the correct usage of this function.
        '''
        raise Exception(msg) from e

    scala_version = "2.12"
    maven_artifact = f"io.delta:delta-core_{scala_version}:{delta_version}"

    extra_packages = extra_packages if extra_packages is not None else []
    all_artifacts = [maven_artifact] + extra_packages
    packages_str = ",".join(all_artifacts)

    return spark_session_builder.config("spark.jars.packages", packages_str)
Example 6
def _add_or_merge(ssb: SparkSession.Builder, key: str, value: str):
    # Append to an existing comma-separated config value, or set it if absent.
    # (Peeks at the builder's private _options dict; there is no public getter.)
    if key in ssb._options:
        old_value = ssb._options[key]
        ssb.config(key, f"{old_value},{value}")
    else:
        ssb.config(key, value)

def createSession(builder: SparkSession.Builder) -> SparkSession:
    if isinstance(builder, SparkSession.Builder):
        return builder.getOrCreate()
    raise TypeError(f"Expected a SparkSession.Builder, got {type(builder)}")
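A small illustration of the merge behaviour, relying on the builder's private _options dict just as the helper itself does; the key and file names are illustrative.

from pyspark.sql import SparkSession

ssb = SparkSession.builder
_add_or_merge(ssb, "spark.yarn.dist.files", "a.pex")
_add_or_merge(ssb, "spark.yarn.dist.files", "b.pex")
# Both values survive, comma-separated.
assert ssb._options["spark.yarn.dist.files"] == "a.pex,b.pex"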