import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.feature import (OneHotEncoderEstimator, StandardScaler,
                                StringIndexer, VectorAssembler)
from pyspark.sql import SparkSession

# Note: OneHotEncoderEstimator is the Spark 2.3/2.4 API; it was renamed
# OneHotEncoder in Spark 3.0.


def get_parcels_to_spark(parcels_filepath='data/EXTR_Parcel.csv'):
    # Commented out to only use the initial module-level SparkSession
    # spark = SparkSession\
    #     .builder\
    #     .master('local[4]')\
    #     .appName("Get_Parcel_Data")\
    #     .config("spark.master", "local")\
    #     .getOrCreate()

    # Read the pre-cleaned Pandas DataFrame into a Spark DataFrame
    parcel_pd = get_parcels(parcels_filepath)
    parcel = spark.createDataFrame(parcel_pd)

    # Normalize numerical data
    numerical_cols = [
        'PcntUnusable',
        'WfntFootage',
    ]
    numerical_assembler = VectorAssembler(inputCols=numerical_cols,
                                          outputCol='num_features')
    parcel = numerical_assembler.transform(parcel)
    parcel = StandardScaler(
        inputCol='num_features',
        outputCol='num_features_std').fit(parcel).transform(parcel)

    # Build index and dummy_vector column names for the categorical columns;
    # the categorical and index columns are dropped after encoding
    cat_cols = [
        'Range', 'Township', 'Section', 'QuarterSection', 'Area', 'SubArea',
        'LevyCode', 'CurrentZoning', 'PresentUse', 'SqFtLot', 'WaterSystem',
        'SewerSystem', 'Access', 'Topography', 'StreetSurface',
        'InadequateParking', 'MtRainier', 'Olympics', 'Cascades',
        'Territorial', 'SeattleSkyline', 'PugetSound', 'LakeWashington',
        'SmallLakeRiverCreek', 'OtherView', 'WfntLocation', 'WfntBank',
        'WfntPoorQuality', 'WfntRestrictedAccess', 'WfntAccessRights',
        'TidelandShoreland', 'LotDepthFactor', 'TrafficNoise',
        'NbrBldgSites', 'Contamination',
    ]
    cat_index = [col + '_index' for col in cat_cols]
    dummies = [col + '_dummy_vector' for col in cat_cols]

    # Create and populate categorical index columns
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(parcel)
        for column in cat_cols
    ]
    cat_pipeline = Pipeline(stages=indexers)
    parcel = cat_pipeline.fit(parcel).transform(parcel)

    # Encode dummy_vector columns from the categorical indices
    encoder = OneHotEncoderEstimator(inputCols=cat_index, outputCols=dummies)
    model = encoder.fit(parcel)
    parcel = model.transform(parcel)

    # Drop the categorical, index, and raw numerical columns
    parcel = parcel.drop(*cat_cols)
    parcel = parcel.drop(*cat_index)
    parcel = parcel.drop(*numerical_cols)

    # Combine all features into a single vector
    ignore = ['PIN']
    assembler = VectorAssembler(
        inputCols=[col for col in parcel.columns if col not in ignore],
        outputCol='parcel_features')
    parcel = assembler.transform(parcel)

    # Drop all columns that are now in the features column
    ignore.append('parcel_features')
    parcel = parcel.drop(*[col for col in parcel.columns if col not in ignore])

    # Write to parquet - not sure if I will eventually open from this,
    # but that's the idea
    # parcel.write.parquet('data/parcel_parquet', mode='overwrite')

    return parcel
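
# A minimal, self-contained sketch of the index -> one-hot -> assemble
# pattern used in get_parcels_to_spark() above, run on a toy DataFrame.
# The column names ('zone', 'sqft'), the sample rows, and the local
# SparkSession here are illustrative assumptions, not part of the real
# parcel data or pipeline.
def _demo_encode_and_assemble():
    demo_spark = SparkSession.builder \
        .master('local[2]') \
        .appName('encode_demo') \
        .getOrCreate()
    df = demo_spark.createDataFrame(
        [('R1', 1200.0), ('C2', 800.0), ('R1', 950.0)],
        ['zone', 'sqft'])
    # Map each category to a numeric index
    indexed = StringIndexer(
        inputCol='zone', outputCol='zone_index').fit(df).transform(df)
    # One-hot encode the index into a sparse dummy vector
    encoded = OneHotEncoderEstimator(
        inputCols=['zone_index'],
        outputCols=['zone_dummy_vector']).fit(indexed).transform(indexed)
    # Pack the dummy vector and the numeric column into one feature vector
    assembled = VectorAssembler(
        inputCols=['zone_dummy_vector', 'sqft'],
        outputCol='features').transform(encoded)
    assembled.show(truncate=False)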

def gis_data_to_spark(
        numFolds,
        gis_filepath='data/Parcels_for_King_County_with_Address_with_Property_Information__parcel_address_area.csv'
):
    # Read the pre-cleaned Pandas DataFrame into a Spark DataFrame,
    # assigning each row to a random fold for cross-validation
    gis_pd = get_gis_data(gis_filepath)
    gis_pd['fold'] = np.random.randint(0, numFolds, gis_pd.shape[0])
    gis = spark.createDataFrame(gis_pd)

    # Normalize numerical data
    numerical_cols = [
        'LAT', 'LON', 'LOTSQFT', 'APPRLNDVAL', 'APPR_IMPR', 'TAX_LNDVAL',
        'TAX_IMPR', 'Shape_Length', 'Shape_Area', 'value_per_area',
        'improvement_over_land'
    ]
    numerical_assembler = VectorAssembler(inputCols=numerical_cols,
                                          outputCol='num_features')
    gis = numerical_assembler.transform(gis)
    gis = StandardScaler(
        inputCol='num_features',
        outputCol='num_features_std').fit(gis).transform(gis)

    # Build index and dummy_vector column names for the categorical columns;
    # the categorical and index columns are dropped after encoding
    cat_cols = [
        'KCTP_STATE', 'SITETYPE', 'LEVYCODE', 'NEW_CONSTR', 'TAXVAL_RSN',
        'QTS', 'SEC', 'TWP', 'RNG', 'KCA_ZONING', 'PROPTYPE', 'PREUSE_DESC'
    ]
    cat_index = [col + '_index' for col in cat_cols]
    dummies = [col + '_dummy_vector' for col in cat_cols]

    # Create and populate categorical index columns
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(gis)
        for column in cat_cols
    ]
    cat_pipeline = Pipeline(stages=indexers)
    gis = cat_pipeline.fit(gis).transform(gis)

    # Encode dummy_vector columns from the categorical indices
    encoder = OneHotEncoderEstimator(inputCols=cat_index, outputCols=dummies)
    model = encoder.fit(gis)
    gis = model.transform(gis)

    # Drop the categorical, index, and raw numerical columns
    gis = gis.drop(*cat_cols)
    gis = gis.drop(*cat_index)
    gis = gis.drop(*numerical_cols)

    # Combine all features into a single vector
    ignore = ['PIN', 'MAJOR', 'MINOR', 'ADDR_FULL', 'TARGET', 'fold']
    assembler = VectorAssembler(
        inputCols=[col for col in gis.columns if col not in ignore],
        outputCol='gis_features')
    gis = assembler.transform(gis)

    # Drop all columns that are now in the features column
    ignore.append('gis_features')
    gis = gis.drop(*[col for col in gis.columns if col not in ignore])

    # Write to parquet - not sure if I will eventually open from this,
    # but that's the idea
    # gis.write.parquet('data/gis_parquet', mode='overwrite')

    return gis
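
# A usage sketch, assuming the module-level SparkSession `spark` and the
# get_parcels()/get_gis_data() helpers defined elsewhere in this module.
# Joining the two feature tables on PIN and splitting on the random `fold`
# column is one way these outputs could feed cross-validation; the join and
# the fold filter below are illustrative, not a fixed part of the pipeline.
if __name__ == '__main__':
    parcel = get_parcels_to_spark()
    gis = gis_data_to_spark(numFolds=5)
    # Combine parcel and GIS feature vectors row-wise on the shared PIN key
    features = gis.join(parcel, on='PIN', how='inner')
    train = features.filter(features.fold != 0)  # folds 1..4 for training
    test = features.filter(features.fold == 0)   # hold out fold 0
    print(train.count(), test.count())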