import json

from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
# PredictiveConstants is assumed to be provided by the project's own constants module.


def dataTranform(self, dataInfo):
    featuresColm = dataInfo.get(PredictiveConstants.FEATURESCOLM)  # featureColmList - replaced
    labelColm = dataInfo.get(PredictiveConstants.LABELCOLM)
    modelSheetName = dataInfo.get(PredictiveConstants.MODELSHEETNAME)
    modelId = dataInfo.get(PredictiveConstants.MODELID)
    storageLocation = dataInfo.get(PredictiveConstants.LOCATIONADDRESS)
    indexerPathMapping = {}
    oneHotEncoderPathMapping = {}

    self.labelColm = None if labelColm is None else labelColm
    self.featuresColm = None if featuresColm is None else featuresColm
    dataset = self.dataset

    # drop any stale vectorized-features column left over from a previous run
    vectorizedFeaturescolmName = modelSheetName + PredictiveConstants.DMXFEATURE
    dataset = dataset.drop(vectorizedFeaturescolmName)

    schemaData = dataset.schema
    if self.labelColm is not None:
        for labelName in self.labelColm:
            label = labelName
    else:
        label = self.labelColm

    nonNumericData = self.nonNumericToString(schemaData=schemaData, dataset=dataset)
    categoricalFeatures = nonNumericData.get(PredictiveConstants.CATEGORICALFEATURES)
    numericalFeatures = nonNumericData.get(PredictiveConstants.NUMERICALFEATURES)
    dataset = nonNumericData.get(PredictiveConstants.DATASET)
    schemaData = dataset.schema

    # indexing of label column
    isLabelIndexed = "no"
    if self.labelColm is not None:
        labelIndexedInfo = self.isLabelIndexed(schemaData=schemaData, label=label,
                                               dataset=dataset)
        dataset = labelIndexedInfo.get(PredictiveConstants.DATASET)
        isLabelIndexed = labelIndexedInfo.get(PredictiveConstants.ISLABELINDEXED)
        labelIndexer = labelIndexedInfo.get(PredictiveConstants.LABELINDEXER)
        # store the label indexer here.
        if labelIndexer is not None:
            labelIndexerStoragepath = storageLocation + modelId.upper() + label.upper() \
                + PredictiveConstants.INDEXER.upper() + PredictiveConstants.PARQUETEXTENSION
            # overwrite-save so repeated runs do not fail on an existing path
            labelIndexer.write().overwrite().save(labelIndexerStoragepath)
            indexerPathMapping.update({label: labelIndexerStoragepath})

    oneHotEncodedFeaturesList = []
    indexedFeatures = []
    nonOneHotEncoded = []
    for colm in categoricalFeatures:
        indexedColmName = PredictiveConstants.INDEXED_ + colm
        oneHotEncodedColName = PredictiveConstants.ONEHOTENCODED_ + colm
        indexer = StringIndexer(inputCol=colm, outputCol=indexedColmName,
                                handleInvalid="skip").fit(dataset)
        dataset = indexer.transform(dataset)
        '''store the indexer here - saving mechanism should be
        modelId + colmName + indexer.parquet --> not storing the features indexer
        for now but keeping this for future use.'''
        featuresIndexerPath = storageLocation + modelId.upper() + colm.upper() \
            + PredictiveConstants.INDEXER.upper() + PredictiveConstants.PARQUETEXTENSION
        indexer.write().overwrite().save(featuresIndexerPath)
        indexerPathMapping.update({colm: featuresIndexerPath})
        rowNo = dataset.select(indexedColmName).distinct().count()
        '''In case of only one category in the colm, or more than one in case of
        training and prediction -- naming it with the onehotencoded colmName to
        avoid that uncertainty.'''
        if rowNo == 1:
            # avoid the problem when a single-value colm is passed at prediction time
            nonOneHotEncoded.append(oneHotEncodedColName)
        else:
            indexedFeatures.append(indexedColmName)
            oneHotEncodedFeaturesList.append(oneHotEncodedColName)

    oneHotEncoder = OneHotEncoderEstimator(inputCols=indexedFeatures,
                                           outputCols=oneHotEncodedFeaturesList,
                                           handleInvalid="error")
    oneHotEncoderPath = storageLocation + modelId.upper() \
        + PredictiveConstants.ONEHOTENCODED.upper() + PredictiveConstants.PARQUETEXTENSION
    oneHotEncoder.write().overwrite().save(oneHotEncoderPath)
    oneHotEncoderPathMapping.update({PredictiveConstants.ONEHOTENCODED: oneHotEncoderPath})
    oneHotEncoderFit = oneHotEncoder.fit(dataset)
    dataset = oneHotEncoderFit.transform(dataset)

    combinedFeatures = oneHotEncodedFeaturesList + numericalFeatures + nonOneHotEncoded

    # per-column category counts and value summaries
    categoryColmListDict = {}
    countOfCategoricalColmList = []
    for value in categoricalFeatures:
        listValue = []
        categoryColm = dataset.groupby(value).count()
        countOfCategoricalColmList.append(categoryColm.count())
        categoryColmJson = categoryColm.toJSON()
        for row in categoryColmJson.collect():
            categoryColmSummary = json.loads(row)
            listValue.append(categoryColmSummary)
        categoryColmListDict[value] = listValue

    self.numericalFeatures = numericalFeatures
    self.categoricalFeatures = categoricalFeatures

    if not categoricalFeatures:
        maxCategories = 5
    else:
        maxCategories = max(countOfCategoricalColmList)

    dataset = dataset.drop(vectorizedFeaturescolmName)
    featureassembler = VectorAssembler(inputCols=combinedFeatures,
                                       outputCol=vectorizedFeaturescolmName,
                                       handleInvalid="skip")
    dataset = featureassembler.transform(dataset)

    # retrieve the features colm names after onehotencoding
    indexOfFeatures = dataset.schema.names.index(vectorizedFeaturescolmName)
    oneHotEncodedFeaturesDict = dataset.schema.fields[indexOfFeatures] \
        .metadata['ml_attr']['attrs']
    idNameFeatures = {}

    if not oneHotEncodedFeaturesDict:
        idNameFeaturesOrderedTemp = None
    else:
        for attrType, value in oneHotEncodedFeaturesDict.items():
            for subKey in value:
                idNameFeatures[subKey.get("idx")] = subKey.get("name")
        idNameFeaturesOrderedTemp = {}
        for key in sorted(idNameFeatures):
            idNameFeaturesOrderedTemp[key] = \
                idNameFeatures[key].replace(PredictiveConstants.ONEHOTENCODED_, "")
    idNameFeaturesOrdered = None if idNameFeaturesOrderedTemp is None else idNameFeaturesOrderedTemp

    # retrieve the label colm name only after label encoding
    indexedLabelNameDict = {}
    if isLabelIndexed == "yes":
        indexOfLabel = dataset.schema.names.index(label)
        indexedLabel = dataset.schema.fields[indexOfLabel].metadata["ml_attr"]["vals"]
        for value in indexedLabel:
            indexedLabelNameDict[indexedLabel.index(value)] = value

    # this code was for the vector indexer; since it is not stable for now on the
    # Spark end, it is kept commented out for future use if needed.
    '''
    vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features',
                                maxCategories=maxCategories, handleInvalid="skip").fit(dataset)
    categorical_features = vec_indexer.categoryMaps
    print("Choose %d categorical features: %s" %
          (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))
    dataset = vec_indexer.transform(dataset)
    '''

    result = {
        PredictiveConstants.DATASET: dataset,
        PredictiveConstants.CATEGORICALFEATURES: categoricalFeatures,
        PredictiveConstants.NUMERICALFEATURES: numericalFeatures,
        PredictiveConstants.MAXCATEGORIES: maxCategories,
        PredictiveConstants.CATEGORYCOLMSTATS: categoryColmListDict,
        PredictiveConstants.INDEXEDFEATURES: indexedFeatures,
        PredictiveConstants.LABEL: label,
        PredictiveConstants.ONEHOTENCODEDFEATURESLIST: oneHotEncodedFeaturesList,
        PredictiveConstants.INDEXEDLABELNAMEDICT: indexedLabelNameDict,
        PredictiveConstants.ISLABELINDEXED: isLabelIndexed,
        PredictiveConstants.VECTORFEATURES: vectorizedFeaturescolmName,
        PredictiveConstants.IDNAMEFEATURESORDERED: idNameFeaturesOrdered,
        PredictiveConstants.INDEXERPATHMAPPING: indexerPathMapping,
        PredictiveConstants.ONEHOTENCODERPATHMAPPING: oneHotEncoderPathMapping
    }
    return result
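
# Hypothetical usage sketch (not part of the original module): shows how a caller
# might consume the result dict returned by dataTranform, assuming `transformer` is
# an instance of the enclosing class and `dataInfo` has already been populated.
# LinearRegression is only an illustrative downstream consumer.
#
# from pyspark.ml.regression import LinearRegression
#
# transformed = transformer.dataTranform(dataInfo)
# trainData = transformed[PredictiveConstants.DATASET]
# featuresCol = transformed[PredictiveConstants.VECTORFEATURES]
# labelCol = transformed[PredictiveConstants.LABEL]
# lrModel = LinearRegression(featuresCol=featuresCol, labelCol=labelCol).fit(trainData)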
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, dayofmonth, hour
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

# etl_conf and output_conf are assumed to be configuration dicts (S3 locations for
# the raw taxi data, the saved pipeline stages and the final output) loaded elsewhere.


def main():
    spark = SparkSession \
        .builder \
        .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0') \
        .getOrCreate()

    # use for only one file
    # filename = 'chicago_taxi_trips_2016_01.csv'

    # use for reading all files
    filename = '*'

    df = spark.read \
        .format('csv') \
        .options(header=True, inferSchema=True) \
        .load(os.path.join(etl_conf['s3_taxi_dir_path'], filename))

    # df.printSchema()

    # Take a look at the top rows
    # df.limit(5).toPandas()

    # Check initial number of records
    # df.count()

    # derive calendar features from the trip start timestamp
    df_with_hour = df.withColumn('year', year(df.trip_start_timestamp)) \
        .withColumn('month', month(df.trip_start_timestamp)) \
        .withColumn('day', dayofmonth(df.trip_start_timestamp)) \
        .withColumn('hour', hour(df.trip_start_timestamp))

    df_features = df_with_hour.select('year', 'month', 'day', 'hour',
                                      'pickup_community_area',
                                      'dropoff_community_area')

    df_no_nulls = df_features.dropna()

    # df_no_nulls.count()

    # Create StringIndexer and fit + transform pickup data
    pickup_indexer = StringIndexer(inputCol='pickup_community_area',
                                   outputCol='pickup_community_area_indexed')
    pickup_indexer_model = pickup_indexer.fit(df_no_nulls)
    df_pickup_indexed = pickup_indexer_model.transform(df_no_nulls)

    # Create StringIndexer and fit + transform dropoff data
    dropoff_indexer = StringIndexer(inputCol='dropoff_community_area',
                                    outputCol='dropoff_community_area_indexed')
    dropoff_indexer_model = dropoff_indexer.fit(df_pickup_indexed)
    df_dropoff_indexed = dropoff_indexer_model.transform(df_pickup_indexed)

    # Create OneHotEncoder and fit + transform pickup & dropoff data
    encoder = OneHotEncoderEstimator() \
        .setInputCols(['hour',
                       'pickup_community_area_indexed',
                       'dropoff_community_area_indexed']) \
        .setOutputCols(['hour_encoded',
                        'pickup_community_area_encoded',
                        'dropoff_community_area_encoded'])
    encoder_model = encoder.fit(df_dropoff_indexed)
    df_encoded = encoder_model.transform(df_dropoff_indexed)

    # df_encoded.printSchema()

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_model_key']

    # save the pickup StringIndexer and its model
    pickup_indexer_name = 'pickup_indexer_name'
    pickup_indexer_path = os.path.join(bucket, key, pickup_indexer_name)
    pickup_indexer.write().overwrite().save(pickup_indexer_path)

    pickup_indexer_model_name = 'pickup_indexer_model_name'
    pickup_indexer_model_name_path = os.path.join(bucket, key, pickup_indexer_model_name)
    pickup_indexer_model \
        .write() \
        .overwrite() \
        .save(pickup_indexer_model_name_path)

    # save the dropoff StringIndexer and its model
    dropoff_indexer_name = 'dropoff_indexer_name'
    dropoff_indexer_path = os.path.join(bucket, key, dropoff_indexer_name)
    dropoff_indexer.write().overwrite().save(dropoff_indexer_path)

    dropoff_indexer_model_name = 'dropoff_indexer_model_name'
    dropoff_indexer_model_name_path = os.path.join(bucket, key, dropoff_indexer_model_name)
    dropoff_indexer_model \
        .write() \
        .overwrite() \
        .save(dropoff_indexer_model_name_path)

    # save the one-hot encoder and its model
    encoder_name = 'encoder_name'
    encoder_name_path = os.path.join(bucket, key, encoder_name)
    encoder.write().overwrite().save(encoder_name_path)

    encoder_model_name = 'encoder_model_name'
    encoder_model_name_path = os.path.join(bucket, key, encoder_model_name)
    encoder_model.write().overwrite().save(encoder_model_name_path)

    # make the final dataframe and store it back to S3
    df_final = df_encoded.select('year', 'month', 'day',
                                 'hour_encoded',
                                 'pickup_community_area_encoded',
                                 'dropoff_community_area_encoded')

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_data_key']
    output_path = os.path.join(bucket, key)
    df_final.write.partitionBy('year', 'month', 'day') \
        .parquet(output_path, mode='overwrite')
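
# Assumed script entry point (not shown in the original excerpt): run the ETL job
# when the module is executed directly, e.g. via spark-submit.
if __name__ == '__main__':
    main()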