def test_serialize_deserialize_math_binary(self):
    add_transformer = self._new_add_math_binary()
    file_path = '{}{}'.format('jar:file:', os.path.join(self.tmp_dir, 'math_binary.zip'))
    add_transformer.serializeToBundle(file_path, self.input)

    deserialized_math_binary = SimpleSparkSerializer().deserializeFromBundle(file_path)
    result = deserialized_math_binary.transform(self.input).toPandas()[['add(f1, f2)']]
    assert_frame_equal(self.expected_add, result)
def _serialize_to_file(path, df_for_serializing, model):
    if os.path.exists(path):
        os.remove(path)
    path_dir = os.path.dirname(path)
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)
    SimpleSparkSerializer().serializeToBundle(model, _to_file_path(path), df_for_serializing)
def test_serialize_deserialize_pipeline(self):
    add_transformer = self._new_add_math_binary()
    mul_transformer = MathBinary(
        operation=BinaryOperation.Multiply,
        inputA="f1",
        inputB="add(f1, f2)",
        outputCol="mul(f1, add(f1, f2))",
    )
    expected = pd.DataFrame(
        [(float(i * (i + i * 2))) for i in range(1, 10)],
        columns=['mul(f1, add(f1, f2))'],
    )
    pipeline = Pipeline(stages=[add_transformer, mul_transformer])
    pipeline_model = pipeline.fit(self.input)

    file_path = '{}{}'.format('jar:file:', os.path.join(self.tmp_dir, 'math_binary_pipeline.zip'))
    pipeline_model.serializeToBundle(file_path, self.input)
    deserialized_pipeline = SimpleSparkSerializer().deserializeFromBundle(file_path)

    # Transform with the deserialized pipeline so the round trip is actually exercised.
    result = deserialized_pipeline.transform(self.input).toPandas()[['mul(f1, add(f1, f2))']]
    assert_frame_equal(expected, result)
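# Both tests above call self._new_add_math_binary(), which is not shown in this section.
# A minimal sketch of such a helper on the same test class, assuming the MathBinary /
# BinaryOperation API used in test_serialize_deserialize_pipeline (hypothetical, for illustration):
def _new_add_math_binary(self):
    # Add transformer over the f1 and f2 input columns, writing to 'add(f1, f2)'.
    return MathBinary(
        operation=BinaryOperation.Add,
        inputA="f1",
        inputB="f2",
        outputCol="add(f1, f2)",
    )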
def main():
    spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This is needed to save RDDs, which is the only way to write nested DataFrames in CSV format.
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")

    # Define the schema corresponding to the input data. The input data does not contain headers.
    schema = StructType([StructField("label", IntegerType(), True),
                         StructField("title", StringType(), True),
                         StructField("abstract", StringType(), True)])

    # Download the data from S3 into two separate DataFrames.
    traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                     'train.csv')),
                             header=False, schema=schema, encoding='UTF-8')
    validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                          'test.csv')),
                                  header=False, schema=schema, encoding='UTF-8')

    # Tokenize the abstract column, which contains the input text.
    tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract")

    # Save the transformed training data to CSV in S3 by converting it to an RDD.
    transformed_traindf = tokenizer.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_train_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'],
                                                            'train'))

    # Similar data processing for the validation dataset.
    transformed_validation = tokenizer.transform(validationdf)
    transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_validation_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'],
                                                            'validation'))

    # Serialize the tokenizer via MLeap and upload to S3.
    SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file.
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
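# The Glue jobs in this section map (label, features) tuples through csv_line, which is not
# defined here. A minimal sketch, assuming the second element is either a list of tokens or a
# Spark ML vector, i.e. anything iterable (hypothetical helper, for illustration only):
def csv_line(data):
    # Emit one CSV row: the label followed by the flattened feature values or tokens.
    return str(data[0]) + "," + ",".join(str(d) for d in data[1])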
def main():
    # Initialize Spark session and resolve job arguments.
    spark = SparkSession.builder.appName("PySparkAbalone").getOrCreate()
    args = getResolvedOptions(sys.argv, [
        'S3_INPUT_BUCKET', 'S3_INPUT_KEY_PREFIX', 'S3_OUTPUT_BUCKET',
        'S3_OUTPUT_KEY_PREFIX', 'S3_MODEL_BUCKET', 'S3_MODEL_KEY_PREFIX'
    ])

    # Save RDDs, which is the only way to write nested DataFrames in CSV format.
    spark.sparkContext._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class",
        "org.apache.hadoop.mapred.FileOutputCommitter")

    # Define the schema corresponding to the input data.
    schema = StructType([
        StructField("sex", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("diameter", DoubleType(), True),
        StructField("height", DoubleType(), True),
        StructField("whole_weight", DoubleType(), True),
        StructField("shucked_weight", DoubleType(), True),
        StructField("viscera_weight", DoubleType(), True),
        StructField("shell_weight", DoubleType(), True),
        StructField("rings", DoubleType(), True)
    ])

    # Download the data from S3 into a DataFrame.
    s3_path = 's3://' + os.path.join(
        args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'abalone.csv')
    total_df = spark.read.csv(s3_path, header=False, schema=schema)

    # Build a feature-preprocessing pipeline: string indexing of categorical values,
    # one-hot encoding and vector assembly.
    cols = [
        "sex_vec", "length", "diameter", "height", "whole_weight",
        "shucked_weight", "viscera_weight", "shell_weight"
    ]
    pipeline = Pipeline(stages=[
        StringIndexer(inputCol='sex', outputCol='indexed_sex'),
        OneHotEncoder(inputCol="indexed_sex", outputCol="sex_vec"),
        VectorAssembler(inputCols=cols, outputCol="features")
    ])

    # Fit the pipeline, transform the data and split into training and validation sets.
    etl = pipeline.fit(total_df)
    transformed_total_df = etl.transform(total_df)
    train_df, val_df = transformed_total_df.randomSplit([0.8, 0.2])

    # Convert the train and validation sets into RDDs, save as CSV and upload to S3.
    for df, name in [(train_df, 'train'), (val_df, 'valid')]:
        rdd = df.rdd.map(lambda x: (x.rings, x.features)).map(csv_line)
        rdd.saveAsTextFile('s3://' + os.path.join(
            args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], name))

    # Serialize the ETL pipeline with MLeap and repackage the bundle as tar.gz.
    SimpleSparkSerializer().serializeToBundle(etl, "jar:file:/tmp/model.zip", val_df)
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    # Upload the ETL pipeline in tar.gz format to S3 so that it can be used with SageMaker for inference later.
    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
def main():
    spark = SparkSession.builder.appName("PySparkTitanic").getOrCreate()

    args = getResolvedOptions(
        sys.argv,
        [
            "s3_input_data_location",
            "s3_output_bucket",
            "s3_output_bucket_prefix",
            "s3_model_bucket",
            "s3_model_bucket_prefix",
        ],
    )

    # This is needed to write RDDs to file, which is the only way to write nested DataFrames in CSV format.
    spark.sparkContext._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class",
        "org.apache.hadoop.mapred.FileOutputCommitter")

    train = spark.read.csv(args["s3_input_data_location"], header=False)

    # Rename the positional columns to meaningful names.
    oldColumns = train.schema.names
    newColumns = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "cat"]
    train = reduce(
        lambda train, idx: train.withColumnRenamed(oldColumns[idx], newColumns[idx]),
        range(len(oldColumns)),
        train,
    )

    # Drop null values.
    train = train.dropna()

    # Target label.
    catIndexer = StringIndexer(inputCol="cat", outputCol="label")
    labelIndexModel = catIndexer.fit(train)
    train = labelIndexModel.transform(train)

    converter = IndexToString(inputCol="label", outputCol="cat")

    # Split into train and validation sets. Beware: it sorts the dataset.
    (traindf, validationdf) = train.randomSplit([0.8, 0.2])

    # Index labels, adding metadata to the label column.
    # Fit on the whole dataset to include all labels in the index.
    buyingIndexer = StringIndexer(inputCol="buying", outputCol="indexedBuying")
    maintIndexer = StringIndexer(inputCol="maint", outputCol="indexedMaint")
    doorsIndexer = StringIndexer(inputCol="doors", outputCol="indexedDoors")
    personsIndexer = StringIndexer(inputCol="persons", outputCol="indexedPersons")
    lug_bootIndexer = StringIndexer(inputCol="lug_boot", outputCol="indexedLug_boot")
    safetyIndexer = StringIndexer(inputCol="safety", outputCol="indexedSafety")

    # One-hot encode the indexed features.
    buyingEncoder = OneHotEncoder(inputCol="indexedBuying", outputCol="buyingVec")
    maintEncoder = OneHotEncoder(inputCol="indexedMaint", outputCol="maintVec")
    doorsEncoder = OneHotEncoder(inputCol="indexedDoors", outputCol="doorsVec")
    personsEncoder = OneHotEncoder(inputCol="indexedPersons", outputCol="personsVec")
    lug_bootEncoder = OneHotEncoder(inputCol="indexedLug_boot", outputCol="lug_bootVec")
    safetyEncoder = OneHotEncoder(inputCol="indexedSafety", outputCol="safetyVec")

    # Create the vector-structured data (label, features(vector)).
    assembler = VectorAssembler(
        inputCols=["buyingVec", "maintVec", "doorsVec", "personsVec", "lug_bootVec", "safetyVec"],
        outputCol="features",
    )

    # Chain the featurizers in a Pipeline.
    pipeline = Pipeline(stages=[
        buyingIndexer,
        maintIndexer,
        doorsIndexer,
        personsIndexer,
        lug_bootIndexer,
        safetyIndexer,
        buyingEncoder,
        maintEncoder,
        doorsEncoder,
        personsEncoder,
        lug_bootEncoder,
        safetyEncoder,
        assembler,
    ])

    # Train the model. This also runs the indexers.
    model = pipeline.fit(traindf)

    # Delete previous data from the output location.
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(args["s3_output_bucket"])
    bucket.objects.filter(Prefix=args["s3_output_bucket_prefix"]).delete()

    # Save the transformed training data to CSV in S3 by converting it to an RDD.
    transformed_traindf = model.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.features))
    lines = transformed_train_rdd.map(toCSVLine)
    lines.saveAsTextFile("s3a://" + args["s3_output_bucket"] + "/" + args["s3_output_bucket_prefix"] + "/" + "train")

    # Similar data processing for the validation dataset.
    predictions = model.transform(validationdf)
    transformed_train_rdd = predictions.rdd.map(lambda x: (x.label, x.features))
    lines = transformed_train_rdd.map(toCSVLine)
    lines.saveAsTextFile("s3a://" + args["s3_output_bucket"] + "/" + args["s3_output_bucket_prefix"] + "/" + "validation")

    # Serialize and store the model via MLeap.
    SimpleSparkSerializer().serializeToBundle(model, "jar:file:/tmp/model.zip", predictions)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file.
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname="bundle.json")
        tar.add("/tmp/model/root", arcname="root")

    s3 = boto3.resource("s3")
    file_name = args["s3_model_bucket_prefix"] + "/" + "model.tar.gz"
    s3.Bucket(args["s3_model_bucket"]).upload_file("/tmp/model.tar.gz", file_name)

    os.remove("/tmp/model.zip")
    os.remove("/tmp/model.tar.gz")
    shutil.rmtree("/tmp/model")

    # Save the postprocessor.
    SimpleSparkSerializer().serializeToBundle(converter, "jar:file:/tmp/postprocess.zip", predictions)
    with zipfile.ZipFile("/tmp/postprocess.zip") as zf:
        zf.extractall("/tmp/postprocess")

    # Write back the content as a .tar.gz file.
    with tarfile.open("/tmp/postprocess.tar.gz", "w:gz") as tar:
        tar.add("/tmp/postprocess/bundle.json", arcname="bundle.json")
        tar.add("/tmp/postprocess/root", arcname="root")

    file_name = args["s3_model_bucket_prefix"] + "/" + "postprocess.tar.gz"
    s3.Bucket(args["s3_model_bucket"]).upload_file("/tmp/postprocess.tar.gz", file_name)

    os.remove("/tmp/postprocess.zip")
    os.remove("/tmp/postprocess.tar.gz")
    shutil.rmtree("/tmp/postprocess")
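# The PySparkTitanic job above maps (label, features) rows through toCSVLine, which is not
# defined in this section. A minimal sketch, assuming the second element is a Spark ML vector
# (hypothetical helper, for illustration only):
def toCSVLine(data):
    # Flatten the feature vector into comma-separated values, prefixed by the label.
    return str(data[0]) + "," + ",".join(str(d) for d in data[1].toArray())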
        args['S3_BUCKET']))
    logger.info('Save train file completed.')

    logger.info('Save validation file started...')
    # Convert the validation DataFrame to an RDD to save in CSV format and upload to S3.
    validation_rdd = validation_df.rdd.map(lambda x: (x.indexed_breakdown, x.features))
    validation_lines = validation_rdd.map(csv_line)
    validation_lines.saveAsTextFile('s3://{0}/data/preprocessed/val'.format(
        args['S3_BUCKET']))
    logger.info('Save validation file completed.')

    # Serialize and store the model via MLeap.
    timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    model_filename = '/tmp/model-' + timestamp + '.zip'
    SimpleSparkSerializer().serializeToBundle(model, 'jar:file:' + model_filename, df)

    # Unzip the model as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    with zipfile.ZipFile(model_filename) as zf:
        zf.extractall("/tmp/model-" + timestamp)

    # Write back the content as a .tar.gz file.
    with tarfile.open("/tmp/model-" + timestamp + ".tar.gz", "w:gz") as tar:
        tar.add("/tmp/model-" + timestamp + "/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model-" + timestamp + "/root", arcname='root')

    # Upload the model in tar.gz format to S3 so that it can be used with SageMaker for inference later.
    s3 = boto3.resource('s3')
    s3.Bucket(args['S3_BUCKET']).upload_file('/tmp/model-' + timestamp + '.tar.gz',
                                             'output/sparkml/model.tar.gz')
def main():
    spark = SparkSession.builder.appName("PySparkAbalone").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This is needed to save RDDs, which is the only way to write nested DataFrames in CSV format.
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")

    # Define the schema corresponding to the input data. The input data does not contain headers.
    schema = StructType([StructField("sex", StringType(), True),
                         StructField("length", DoubleType(), True),
                         StructField("diameter", DoubleType(), True),
                         StructField("height", DoubleType(), True),
                         StructField("whole_weight", DoubleType(), True),
                         StructField("shucked_weight", DoubleType(), True),
                         StructField("viscera_weight", DoubleType(), True),
                         StructField("shell_weight", DoubleType(), True),
                         StructField("rings", DoubleType(), True)])

    # Download the data from S3 into a DataFrame.
    total_df = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                      'abalone.csv')),
                              header=False, schema=schema)

    # StringIndexer on the sex column, which holds categorical values.
    sex_indexer = StringIndexer(inputCol="sex", outputCol="indexed_sex")

    # One-hot encoding is performed on the string-indexed sex column (indexed_sex).
    sex_encoder = OneHotEncoder(inputCol="indexed_sex", outputCol="sex_vec")

    # The VectorAssembler brings all the features into a single vector so we can easily save them in CSV format.
    assembler = VectorAssembler(inputCols=["sex_vec",
                                           "length",
                                           "diameter",
                                           "height",
                                           "whole_weight",
                                           "shucked_weight",
                                           "viscera_weight",
                                           "shell_weight"],
                                outputCol="features")

    # The pipeline comprises the steps added above.
    pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler])
    # This step trains the feature transformers. We need to serialize this model with MLeap and save it to S3.
    model = pipeline.fit(total_df)

    # This step transforms the dataset with information obtained from the previous fit.
    transformed_total_df = model.transform(total_df)

    # Split the overall dataset 80-20 into training and validation.
    (train_df, validation_df) = transformed_total_df.randomSplit([0.8, 0.2])

    # Convert the train DataFrame to an RDD to save in CSV format and upload to S3.
    train_rdd = train_df.rdd.map(lambda x: (x.rings, x.features))
    train_lines = train_rdd.map(csv_line)
    train_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'],
                                                      'train'))

    # Convert the validation DataFrame to an RDD to save in CSV format and upload to S3.
    validation_rdd = validation_df.rdd.map(lambda x: (x.rings, x.features))
    validation_lines = validation_rdd.map(csv_line)
    validation_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'],
                                                           'validation'))

    # Serialize and store the model via MLeap.
    SimpleSparkSerializer().serializeToBundle(model, "jar:file:/tmp/model.zip", validation_df)

    # Unzip the model as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file.
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    # Upload the model in tar.gz format to S3 so that it can be used with SageMaker for inference later.
    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
def _deserialize_from_file(path):
    return SimpleSparkSerializer().deserializeFromBundle(_to_file_path(path))
def _serialize_to_file(model, df_for_serializing):
    jar_file_path = _to_jar_file_path(
        os.path.join(tempfile.mkdtemp(), 'test_serialize_to_bundle-pipeline.zip'))
    SimpleSparkSerializer().serializeToBundle(model, jar_file_path, df_for_serializing)
    return jar_file_path
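# _serialize_to_file and _deserialize_from_file rely on _to_jar_file_path and _to_file_path,
# which are not shown in this section. One plausible sketch, assuming the same 'jar:file:'
# URI convention used in the tests above; the plain 'file:' prefix is an assumption:
def _to_jar_file_path(path):
    # MLeap .zip bundles are addressed via 'jar:file:' URIs.
    return 'jar:file:' + path


def _to_file_path(path):
    # Plain filesystem bundle locations are addressed via 'file:' URIs.
    return 'file:' + path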
def main():
    spark = SparkSession.builder.appName("AbcHeadlinesSpark").getOrCreate()

    # getResolvedOptions(args, options=argument names to retrieve) gives access to the arguments
    # passed to the SparkML script when running a job.
    args = getResolvedOptions(sys.argv, [
        'S3_INPUT_BUCKET', 'S3_INPUT_KEY_PREFIX', 'S3_INPUT_FILENAME',
        'S3_OUTPUT_BUCKET', 'S3_OUTPUT_KEY_PREFIX', 'S3_MODEL_BUCKET',
        'S3_MODEL_KEY_PREFIX'
    ])

    # Read the CSV file containing the ABC news headlines.
    abcnewsdf = spark.read.option("header", "true").csv(
        ('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                args['S3_INPUT_FILENAME'])))

    # The dataset contains 1,103,663 headlines in total; keep only 10% of them.
    hdl_cnt = abcnewsdf.count()
    hdl_fil_cnt = hdl_cnt * .1
    hdl_fil_cnt = int(hdl_fil_cnt)
    abcnewsdf = abcnewsdf.limit(hdl_fil_cnt)

    # Create features from text.
    # Tokenizer
    tok = Tokenizer(inputCol="headline_text", outputCol="words")
    # Stop-word removal
    swr = StopWordsRemover(inputCol="words", outputCol="filtered")
    # Term frequency
    ctv = CountVectorizer(inputCol="filtered", outputCol="tf", vocabSize=200, minDF=2)
    # Term frequency is weighted by the number of times the word appears across all documents in the corpus.
    # Words that are unique to a headline carry more weight, since they define the headline.
    idf = IDF(inputCol="tf", outputCol="features")

    # Build the pipeline.
    news_pl = Pipeline(stages=[tok, swr, ctv, idf])

    # Fit and transform the dataset.
    news_pl_fit = news_pl.fit(abcnewsdf)
    news_ftrs_df = news_pl_fit.transform(abcnewsdf)

    gen_str_udf = F.udf(gen_str, StringType())

    # Convert the sparse feature vector to a dense string representation.
    news_formatted = news_ftrs_df.withColumn('result', gen_str_udf(news_ftrs_df.features))

    # Save the dense vectors to a CSV file.
    news_save = news_formatted.select("result")
    news_save.write.option("delimiter", "\t").mode("append").csv(
        's3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX']))

    # Save the vocabulary file.
    vocab_list = news_pl_fit.stages[2].vocabulary
    vocab_df = spark.createDataFrame(vocab_list, StringType())
    vocab_df = vocab_df.coalesce(1)
    vocab_df.write.option("delimiter", "\n").format("text").mode("append").save(
        's3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX']))

    # Serialize the fitted pipeline via MLeap and upload to S3.
    SimpleSparkSerializer().serializeToBundle(news_pl_fit, "jar:file:/tmp/model.zip", news_ftrs_df)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file.
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
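# The AbcHeadlinesSpark job wraps gen_str in a UDF to turn the TF-IDF feature vector into a
# string, but gen_str itself is not defined in this section. A minimal sketch, assuming the
# intent is a comma-separated dense representation (hypothetical, for illustration only):
def gen_str(vector):
    # Densify the (possibly sparse) vector and join its values with commas.
    return ','.join(str(v) for v in vector.toArray())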
def main():
    spark = SparkSession.builder.appName("churn-analytics").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This is needed to save RDDs, which is the only way to write nested DataFrames in CSV format.
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")

    # Define the schema corresponding to the input data.
    callstats_schema = StructType([StructField('State', StringType(), True),
                                   StructField('AccountLength', IntegerType(), True),
                                   StructField('AreaCode', IntegerType(), True),
                                   StructField('Phone', StringType(), True),
                                   StructField('IntlPlan', StringType(), True),
                                   StructField('VMailPlan', StringType(), True),
                                   StructField('VMailMessage', IntegerType(), True),
                                   StructField('DayMins', FloatType(), True),
                                   StructField('DayCalls', IntegerType(), True),
                                   StructField('DayCharge', FloatType(), True),
                                   StructField('EveMins', FloatType(), True),
                                   StructField('EveCalls', IntegerType(), True),
                                   StructField('EveCharge', FloatType(), True),
                                   StructField('NightMins', FloatType(), True),
                                   StructField('NightCalls', IntegerType(), True),
                                   StructField('NightCharge', FloatType(), True),
                                   StructField('IntlMins', FloatType(), True),
                                   StructField('IntlCalls', IntegerType(), True),
                                   StructField('IntlCharge', FloatType(), True),
                                   StructField('CustServCalls', IntegerType(), True),
                                   StructField('Churn?', StringType(), True)])

    # Download the data from S3 into a DataFrame.
    raw_df = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                    'churn.csv')),
                            header=True, schema=callstats_schema)

    # Index and one-hot encode the categorical columns, building up the Pipeline stages.
    categoricalColumns = ["State", "AreaCode", "IntlPlan", "VMailPlan"]
    stages = []  # stages in our Pipeline
    for categoricalCol in categoricalColumns:
        idxName = categoricalCol + "Idx"
        stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=idxName)
        catVec = categoricalCol + "Vec"
        encoder = OneHotEncoder(inputCol=idxName, outputCol=catVec, dropLast=False)
        stages += [stringIndexer, encoder]

    numericCols = ["AccountLength", "VMailMessage", "DayMins", "DayCalls", "EveMins", "EveCalls",
                   "NightMins", "NightCalls", "IntlMins", "IntlCalls", "CustServCalls"]
    assemblerInputs = numericCols + [c + "Vec" for c in categoricalColumns]
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    stages += [assembler]

    mlPipeline = Pipeline().setStages(stages)
    pipelineModel = mlPipeline.fit(raw_df)
    dataset = pipelineModel.transform(raw_df).select(
        '*', col('Churn?').contains('True').cast('integer').alias('labels'))

    # Split the overall dataset 80-20 into training and test.
    (train_df, test_df) = dataset.randomSplit([0.8, 0.2])

    # Convert the train DataFrame to an RDD to save in CSV format and upload to S3.
    train_rdd = train_df.rdd.map(lambda r: (r.labels, r.features))
    train_lines = train_rdd.map(csv_line)
    train_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'],
                                                      'train'))

    # Convert the test DataFrame to an RDD to save in CSV format and upload to S3.
    test_rdd = test_df.rdd.map(lambda r: (r.labels, r.features))
    test_lines = test_rdd.map(csv_line)
    test_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'],
                                                     'test'))

    # Serialize and store the model via MLeap.
    SimpleSparkSerializer().serializeToBundle(pipelineModel, "jar:file:/tmp/model.zip", test_df)
    # Unzip the model as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file.
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    # Upload the model in tar.gz format to S3 so that it can be used with SageMaker for inference later.
    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)