# Imports required by this method (an excerpt from a larger class):
import json

from pyspark.ml.feature import (OneHotEncoderEstimator, StringIndexer,
                                VectorAssembler)


def dataTranform(self, dataInfo):
        featuresColm = dataInfo.get(
            PredictiveConstants.FEATURESCOLM)  # formerly featureColmList
        labelColm = dataInfo.get(PredictiveConstants.LABELCOLM)
        modelSheetName = dataInfo.get(PredictiveConstants.MODELSHEETNAME)
        modelId = dataInfo.get(PredictiveConstants.MODELID)
        storageLocation = dataInfo.get(PredictiveConstants.LOCATIONADDRESS)

        indexerPathMapping = {}
        oneHotEncoderPathMapping = {}

        self.labelColm = labelColm
        self.featuresColm = featuresColm
        dataset = self.dataset
        vectorizedFeaturescolmName = modelSheetName + PredictiveConstants.DMXFEATURE
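        # drop any stale assembled-feature column left over from a previous run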
        dataset = dataset.drop(vectorizedFeaturescolmName)

        schemaData = dataset.schema

        # labelColm is a list; only its last entry is used as the label name
        label = self.labelColm[-1] if self.labelColm else None

        nonNumericData = self.nonNumericToString(schemaData=schemaData,
                                                 dataset=dataset)
        categoricalFeatures = nonNumericData.get(
            PredictiveConstants.CATEGORICALFEATURES)
        numericalFeatures = nonNumericData.get(
            PredictiveConstants.NUMERICALFEATURES)
        dataset = nonNumericData.get(PredictiveConstants.DATASET)
        schemaData = dataset.schema

        # indexing of label column
        isLabelIndexed = "no"
        if self.labelColm is not None:
            labelIndexedInfo = self.isLabelIndexed(schemaData=schemaData,
                                                   label=label,
                                                   dataset=dataset)
            dataset = labelIndexedInfo.get(PredictiveConstants.DATASET)
            isLabelIndexed = labelIndexedInfo.get(
                PredictiveConstants.ISLABELINDEXED)
            labelIndexer = labelIndexedInfo.get(
                PredictiveConstants.LABELINDEXER)
            # store the label indexer here.
            if labelIndexer is not None:
                labelIndexerStoragepath = storageLocation + modelId.upper(
                ) + label.upper() + PredictiveConstants.INDEXER.upper(
                ) + PredictiveConstants.PARQUETEXTENSION
                labelIndexer.write().overwrite().save(labelIndexerStoragepath)
                indexerPathMapping.update({label: labelIndexerStoragepath})

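        # index every categorical column; one-hot encode only those with more
        # than one distinct category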
        oneHotEncodedFeaturesList = []
        indexedFeatures = []
        nonOneHotEncoded = []
        for colm in categoricalFeatures:
            indexedColmName = PredictiveConstants.INDEXED_ + colm
            oneHotEncodedColName = PredictiveConstants.ONEHOTENCODED_ + colm
            indexer = StringIndexer(inputCol=colm,
                                    outputCol=indexedColmName,
                                    handleInvalid="skip").fit(dataset)
            dataset = indexer.transform(dataset)
            '''Store the indexer here; the saving convention is modelId + colmName + indexer.parquet,
            so the same category-to-index mapping can be reloaded at prediction time.'''
            featuresIndexerPath = storageLocation + modelId.upper(
            ) + colm.upper() + PredictiveConstants.INDEXER.upper(
            ) + PredictiveConstants.PARQUETEXTENSION
            indexer.write().overwrite().save(featuresIndexerPath)
            indexerPathMapping.update({colm: featuresIndexerPath})
            rowNo = dataset.select(indexedColmName).distinct().count()
            '''A column may expose only one distinct category (e.g. at prediction time) even though
            it had several during training; it is recorded under the one-hot-encoded column name so
            the combined feature list stays consistent between the two runs.'''
            if rowNo == 1:
                # a single-category column cannot be meaningfully one-hot encoded
                nonOneHotEncoded.append(oneHotEncodedColName)
            else:
                indexedFeatures.append(indexedColmName)
                oneHotEncodedFeaturesList.append(oneHotEncodedColName)

        oneHotEncoder = OneHotEncoderEstimator(
            inputCols=indexedFeatures,
            outputCols=oneHotEncodedFeaturesList,
            handleInvalid="error")
        oneHotEncoderPath = storageLocation + modelId.upper(
        ) + PredictiveConstants.ONEHOTENCODED.upper(
        ) + PredictiveConstants.PARQUETEXTENSION
        oneHotEncoder.write().overwrite().save(oneHotEncoderPath)
        oneHotEncoderPathMapping.update(
            {PredictiveConstants.ONEHOTENCODED: oneHotEncoderPath})
        oneHotEncoderFit = oneHotEncoder.fit(dataset)
        dataset = oneHotEncoderFit.transform(dataset)

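        # final feature list: one-hot encoded categoricals, numerical columns,
        # and the single-category columns recorded above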
        combinedFeatures = oneHotEncodedFeaturesList + numericalFeatures + nonOneHotEncoded
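        # collect per-category frequencies and the distinct-category count for
        # every categorical column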
        categoryColmListDict = {}
        countOfCategoricalColmList = []
        for value in categoricalFeatures:
            listValue = []
            categoryColm = dataset.groupby(value).count()
            countOfCategoricalColmList.append(categoryColm.count())
            categoryColmJson = categoryColm.toJSON()
            for row in categoryColmJson.collect():
                categoryColmSummary = json.loads(row)
                listValue.append(categoryColmSummary)
            categoryColmListDict[value] = listValue

        self.numericalFeatures = numericalFeatures
        self.categoricalFeatures = categoricalFeatures
        # fall back to a default when there are no categorical columns
        if not categoricalFeatures:
            maxCategories = 5
        else:
            maxCategories = max(countOfCategoricalColmList)
        dataset = dataset.drop(vectorizedFeaturescolmName)
        featureassembler = VectorAssembler(
            inputCols=combinedFeatures,
            outputCol=vectorizedFeaturescolmName,
            handleInvalid="skip")
        dataset = featureassembler.transform(dataset)

        # retrieve the feature column names after one-hot encoding
        indexOfFeatures = dataset.schema.names.index(
            vectorizedFeaturescolmName)
        oneHotEncodedFeaturesDict = dataset.schema.fields[
            indexOfFeatures].metadata['ml_attr']['attrs']
        idNameFeatures = {}

        if not oneHotEncodedFeaturesDict:
            idNameFeaturesOrderedTemp = None
        else:
            # collect idx -> name for every vector attribute, then order by index
            for attrType, attrList in oneHotEncodedFeaturesDict.items():
                for attr in attrList:
                    idNameFeatures[attr.get("idx")] = attr.get("name")
            idNameFeaturesOrderedTemp = {}
            for key in sorted(idNameFeatures):
                idNameFeaturesOrderedTemp[key] = idNameFeatures[key].replace(
                    PredictiveConstants.ONEHOTENCODED_, "")

        idNameFeaturesOrdered = idNameFeaturesOrderedTemp

        # retrieve the original label values only after label indexing
        indexedLabelNameDict = {}
        if isLabelIndexed == "yes":
            indexOfLabel = dataset.schema.names.index(label)
            indexedLabel = dataset.schema.fields[indexOfLabel].metadata[
                "ml_attr"]["vals"]

            for value in indexedLabel:
                indexedLabelNameDict[indexedLabel.index(value)] = value

        # This block used VectorIndexer; it is not stable on the Spark side for now,
        # so it is kept here in case it is needed in the future.
        '''
        vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features',
        maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))
        dataset= vec_indexer.transform(dataset)
        '''

        result = {
            PredictiveConstants.DATASET: dataset,
            PredictiveConstants.CATEGORICALFEATURES: categoricalFeatures,
            PredictiveConstants.NUMERICALFEATURES: numericalFeatures,
            PredictiveConstants.MAXCATEGORIES: maxCategories,
            PredictiveConstants.CATEGORYCOLMSTATS: categoryColmListDict,
            PredictiveConstants.INDEXEDFEATURES: indexedFeatures,
            PredictiveConstants.LABEL: label,
            PredictiveConstants.ONEHOTENCODEDFEATURESLIST:
            oneHotEncodedFeaturesList,
            PredictiveConstants.INDEXEDLABELNAMEDICT: indexedLabelNameDict,
            PredictiveConstants.ISLABELINDEXED: isLabelIndexed,
            PredictiveConstants.VECTORFEATURES: vectorizedFeaturescolmName,
            PredictiveConstants.IDNAMEFEATURESORDERED: idNameFeaturesOrdered,
            PredictiveConstants.INDEXERPATHMAPPING: indexerPathMapping,
            PredictiveConstants.ONEHOTENCODERPATHMAPPING:
            oneHotEncoderPathMapping
        }
        return result
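
The paths collected in indexerPathMapping and oneHotEncoderPathMapping are meant to be
reloaded when the model is applied to new data. A minimal sketch of that reload step,
assuming a plain PySpark session (the helper below is illustrative, not part of the
original class):

from pyspark.ml.feature import StringIndexerModel


def applySavedIndexers(indexerPathMapping, dataset):
    # Re-apply each persisted StringIndexerModel so prediction-time data is
    # encoded with the same category-to-index mapping learned in training.
    for colmName, path in indexerPathMapping.items():
        indexerModel = StringIndexerModel.load(path)
        dataset = indexerModel.transform(dataset)
    return dataset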
Example #2
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import dayofmonth, hour, month, year
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer

# etl_conf and output_conf are assumed to be defined elsewhere (e.g. loaded
# from a configuration file) and to hold the S3 paths referenced below.


def main():
    spark = SparkSession \
        .builder \
        .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0') \
        .getOrCreate()

    # use for only one file
    # filename = 'chicago_taxi_trips_2016_01.csv'

    # use for reading all files
    filename = '*'

    df = spark.read \
        .format('csv') \
        .options(header=True, inferSchema=True) \
        .load(os.path.join(etl_conf['s3_taxi_dir_path'], filename))
    # df.printSchema()

    # Take a look at the top rows
    # df.limit(5).toPandas()

    # Check initial number of records
    # df.count()

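    # derive calendar and hour-of-day features from the trip start timestamp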
    df_with_hour = df.withColumn('year', year(df.trip_start_timestamp))\
                     .withColumn('month', month(df.trip_start_timestamp))\
                     .withColumn('day', dayofmonth(df.trip_start_timestamp))\
                     .withColumn('hour', hour(df.trip_start_timestamp))

    df_features = df_with_hour.select('year', 'month', 'day', 'hour',
                                      'pickup_community_area',
                                      'dropoff_community_area')

    df_no_nulls = df_features.dropna()

    # df_no_nulls.count()

    # Create StringIndexer and fit + transform pickup data
    pickup_indexer = StringIndexer(inputCol='pickup_community_area',
                                   outputCol='pickup_community_area_indexed')

    pickup_indexer_model = pickup_indexer.fit(df_no_nulls)
    df_pickup_indexed = pickup_indexer_model.transform(df_no_nulls)

    # Create StringIndexer and fit + transform dropoff data
    dropoff_indexer = StringIndexer(inputCol='dropoff_community_area',
                                    outputCol='dropoff_community_area_indexed')

    dropoff_indexer_model = dropoff_indexer.fit(df_pickup_indexed)
    df_dropoff_indexed = dropoff_indexer_model.transform(df_pickup_indexed)

    # Create OneHotEncoder and fit + transform pickup & dropoff data
    encoder = OneHotEncoderEstimator() \
        .setInputCols(['hour',
                       'pickup_community_area_indexed',
                       'dropoff_community_area_indexed']) \
        .setOutputCols(['hour_encoded',
                        'pickup_community_area_encoded',
                        'dropoff_community_area_encoded'])

    encoder_model = encoder.fit(df_dropoff_indexed)
    df_encoded = encoder_model.transform(df_dropoff_indexed)

    # df_encoded.printSchema()

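    # both the unfitted estimators and the fitted models are persisted below,
    # so a later job can either refit on new data or reuse the learned mappings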
    bucket = output_conf['s3_bucket']
    key = output_conf['s3_model_key']

    # save the pickup StringIndexer and its fitted model
    pickup_indexer_name = 'pickup_indexer_name'
    pickup_indexer_path = os.path.join(bucket, key, pickup_indexer_name)
    pickup_indexer.write().overwrite().save(pickup_indexer_path)

    pickup_indexer_model_name = 'pickup_indexer_model_name'
    pickup_indexer_model_name_path = os.path.join(bucket, key,
                                                  pickup_indexer_model_name)
    pickup_indexer_model \
        .write() \
        .overwrite() \
        .save(pickup_indexer_model_name_path)

    # save the dropoff StringIndexer and its fitted model
    dropoff_indexer_name = 'dropoff_indexer_name'
    dropoff_indexer_path = os.path.join(bucket, key, dropoff_indexer_name)
    dropoff_indexer.write().overwrite().save(dropoff_indexer_path)

    dropoff_indexer_model_name = 'dropoff_indexer_model_name'
    dropoff_indexer_model_name_path = os.path.join(bucket, key,
                                                   dropoff_indexer_model_name)
    dropoff_indexer_model \
        .write() \
        .overwrite() \
        .save(dropoff_indexer_model_name_path)

    # save the one-hot encoder and model
    encoder_name = 'encoder_name'
    encoder_name_path = os.path.join(bucket, key, encoder_name)
    encoder.write().overwrite().save(encoder_name_path)

    encoder_model_name = 'encoder_model_name'
    encoder_model_name_path = os.path.join(bucket, key, encoder_model_name)
    encoder_model.write().overwrite().save(encoder_model_name_path)

    # make final dataframe and store back to S3
    df_final = df_encoded.select('year', 'month', 'day', 'hour_encoded',
                                 'pickup_community_area_encoded',
                                 'dropoff_community_area_encoded')

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_data_key']

    output_path = os.path.join(bucket, key)

    df_final.write.partitionBy('year', 'month', 'day') \
            .parquet(output_path, mode='overwrite')
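
A downstream prediction job would reload the persisted models rather than refit them.
A minimal sketch under the same bucket/key layout (the loader function is an
assumption, not part of the original script):

from pyspark.ml.feature import OneHotEncoderModel, StringIndexerModel


def load_feature_models(bucket, key):
    # Reload the fitted models saved by main(); the object names match the
    # ones written above.
    pickup_model = StringIndexerModel.load(
        os.path.join(bucket, key, 'pickup_indexer_model_name'))
    dropoff_model = StringIndexerModel.load(
        os.path.join(bucket, key, 'dropoff_indexer_model_name'))
    encoder_model = OneHotEncoderModel.load(
        os.path.join(bucket, key, 'encoder_model_name'))
    return pickup_model, dropoff_model, encoder_model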