import subprocess

import sagemaker_pyspark


def test_classpath_script_works():
    # By default the console script emits the jar paths comma-separated; they should match
    # the list returned by classpath_jars().
    p = subprocess.Popen(["sagemakerpyspark-jars"], stdout=subprocess.PIPE)
    output, errors = p.communicate()

    jars = sagemaker_pyspark.classpath_jars()
    script_jars = output.decode('utf-8').split(",")

    assert len(jars) == len(script_jars)
    assert jars[0] == script_jars[0]
def test_classpath_script_can_use_separators():
    # A separator passed as the first argument (here ":") replaces the default comma.
    p = subprocess.Popen("sagemakerpyspark-jars :".split(), stdout=subprocess.PIPE)
    output, errors = p.communicate()

    jars = sagemaker_pyspark.classpath_jars()
    script_jars = output.decode('utf-8').split(":")

    assert len(jars) == len(script_jars)
    assert jars[0] == script_jars[0]
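# Sketch, not part of the test suite: the same behaviour can be checked interactively.
# The ":" separator argument is assumed from the test above; the resulting output is in a
# form usable for classpath-style settings such as spark.driver.extraClassPath.
if __name__ == "__main__":
    out = subprocess.check_output(["sagemakerpyspark-jars", ":"]).decode("utf-8").strip()
    print(out)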
import os

import pytest
from pyspark import SparkConf, SparkContext

from sagemaker_pyspark import classpath_jars


@pytest.fixture
def with_spark_context():
    # Put the SageMaker Spark jars on the driver classpath before the context is created.
    os.environ['SPARK_CLASSPATH'] = ":".join(classpath_jars())
    conf = (SparkConf()
            .set("spark.driver.extraClassPath", os.environ['SPARK_CLASSPATH']))

    if SparkContext._active_spark_context is None:
        SparkContext(conf=conf)

    yield SparkContext._active_spark_context

    # TearDown
    SparkContext.stop(SparkContext._active_spark_context)
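# Minimal sketch of a test consuming the with_spark_context fixture above; the test name
# and assertions are illustrative, not from the original suite.
def test_spark_context_has_sagemaker_jars(with_spark_context):
    conf = dict(with_spark_context.getConf().getAll())
    assert with_spark_context is not None
    assert "spark.driver.extraClassPath" in conf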
import os

import boto3
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

import sagemaker
from sagemaker import get_execution_role

import sagemaker_pyspark

role = get_execution_role()

# Configure Spark to use the SageMaker Spark dependency jars.
jars = sagemaker_pyspark.classpath_jars()
classpath = ":".join(jars)

# See the SageMaker Spark GitHub repository to learn how to connect to EMR from a notebook instance.
spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath) \
    .master("local[*]").getOrCreate()

# Point the s3a filesystem at the correct regional endpoint (China regions use a different domain).
cn_regions = ['cn-north-1', 'cn-northwest-1']
region = boto3.Session().region_name
endpoint_domain = 'com.cn' if region in cn_regions else 'com'
spark._jsc.hadoopConfiguration().set(
    'fs.s3a.endpoint', 's3.{}.amazonaws.{}'.format(region, endpoint_domain))
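# Quick sanity check (a sketch, not from the original notebook): with the s3a endpoint set,
# the session can read data straight from S3. The bucket and key below are placeholders.
df = spark.read.csv("s3a://example-bucket/example-prefix/data.csv", header=True)
df.show(5)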