def discrete(self):
    """Demonstrate Bucketizer (fixed splits) and QuantileDiscretizer (data-driven splits)."""
    # BUG FIX: QuantileDiscretizer was used below but never imported.
    from pyspark.ml.feature import Bucketizer, QuantileDiscretizer

    # Bucketizer: user-supplied split points.
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
    data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
    dataFrame = self.session.createDataFrame(data, ["features"])
    bucketizer = Bucketizer(splits=splits, inputCol="features",
                            outputCol="bucketedFeatures")
    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets"
          % (len(bucketizer.getSplits()) - 1))
    bucketedData.show()

    # QuantileDiscretizer: split points derived from data quantiles.
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    # BUG FIX: createDataFrame lives on the SparkSession (self.session),
    # not on self — consistent with the call above.
    df = self.session.createDataFrame(data, ["id", "hour"])
    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour",
                                      outputCol="result")
    result = discretizer.fit(df).transform(df)
    result.show()
def _compute_hist(sdf, bins):
    """Histogram counts for the single column selected in `sdf`.

    `bins` must be a NumPy array of bin edges. Returns a pandas Series
    (named after the internal bucket column) with one count per bin;
    empty bins are filled with zero.
    """
    assert isinstance(bins, (np.ndarray, np.generic))

    colname = sdf.columns[-1]
    bucket_name = "__{}_bucket".format(colname)

    # Bucketizer maps each value to the index of its bin.
    bucketizer = Bucketizer(
        splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
    )

    # Count the rows that landed in each bucket.
    counted = (
        bucketizer.transform(sdf)
        .select(bucket_name)
        .groupby(bucket_name)
        .agg(F.count("*").alias("count"))
        .toPandas()
        .sort_values(by=bucket_name)
    )

    # Spark omits empty buckets, so left-join the counts against the full
    # bin index and fill the gaps with zero.
    all_bins = pd.DataFrame(
        {bucket_name: np.arange(0, len(bins) - 1), "bucket": bins[:-1]}
    )
    merged = all_bins.merge(counted, how="left", on=[bucket_name]).fillna(0)[["count"]]
    merged.columns = [bucket_name]
    return merged[bucket_name]
def create_buckets(percentage_of_missing_ctus_per_partyid):
    """
    Divide party ids by percentage of missing ctus into a list of 5 buckets
    > 0 < 0.25
    > 0.25 < 0.5
    > 0.5 < 0.75
    > 0.75 < 0.99
    > 0.99

    Output:
    +--------+-----------------------+-------+
    |party_id|percentage_missing_ctus|buckets|
    +--------+-----------------------+-------+
    |       1|                    0.2|    0.0|
    |       2|                   0.33|    1.0|
    |       3|                    1.0|    4.0|
    |       4|                   0.75|    3.0|
    |       5|                    0.6|    2.0|
    |       6|                    0.6|    2.0|
    +--------+-----------------------+-------+
    """
    bucketizer = Bucketizer(
        splits=[0, 0.25, 0.5, 0.75, 0.99, float('Inf')],
        inputCol="percentage_missing_ctus",
        outputCol="buckets",
    )
    # "keep" routes invalid (null/NaN) values into an extra bucket
    # instead of raising.
    return bucketizer.setHandleInvalid("keep").transform(
        percentage_of_missing_ctus_per_partyid)
def get_binned_stat(self, df, colname, col_stat, n_split=10):
    """Bin `colname` into `n_split` equal steps between the given min/max.

    Returns a list of {"name": "<lo> to <hi>", "value": count} dicts
    ordered by bin index; rows with a null bin index are labelled "null".
    """
    edges = sorted(CommonUtils.frange(col_stat["min"], col_stat["max"],
                                      num_steps=n_split))
    edge_pairs = [(lo, hi) for lo, hi in zip(edges, edges[1:])]

    as_double = df.withColumn(colname, df[colname].cast(DoubleType()))
    bucketizer = Bucketizer(inputCol=colname, outputCol="BINNED_INDEX")
    bucketizer.setSplits(edges)
    counts_pdf = (bucketizer.transform(as_double)
                  .groupBy("BINNED_INDEX").count().toPandas())

    # Human-readable label per bin; index n_split is reserved for nulls.
    labels = {idx: " to ".join([str(lo), str(hi)])
              for idx, (lo, hi) in enumerate(edge_pairs)}
    labels[n_split] = "null"

    counts_pdf["orderIndex"] = counts_pdf["BINNED_INDEX"].apply(
        lambda v: n_split if pd.isnull(v) else v)
    counts_pdf["bins"] = counts_pdf["orderIndex"].apply(
        lambda v: labels[int(v)])

    rows = counts_pdf[["bins", "count", "orderIndex"]].T.to_dict().values()
    return [{"name": row["bins"], "value": row["count"]}
            for row in sorted(rows, key=lambda r: r["orderIndex"])]
def test_bucketizer(self):
    """Round-trip a Bucketizer through ONNX and compare the bucket outputs."""
    rows = [(0.1, ), (0.4, ), (1.2, ), (1.5, )]
    data = self.spark.createDataFrame(rows, ["features"])
    model = Bucketizer(
        splits=[-float("inf"), 0.5, 1.4, float("inf")],
        inputCol="features", outputCol="buckets")

    n_features = len(data.select('features').first())
    model_onnx = convert_sparkml(
        model, 'Sparkml Bucketizer',
        [('features', FloatTensorType([1, n_features]))])
    self.assertTrue(model_onnx is not None)

    # Run the Spark model and compare against the ONNX runtime output.
    predicted = model.setHandleInvalid("error").transform(data)
    expected = predicted.select("buckets").toPandas().values.astype(numpy.float32)
    data_np = [data.toPandas().values.astype(numpy.float32)]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBucketizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['buckets'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def calc_histogram(self, bins):
    """Per-bin counts of this column's values; empty bins are zero-filled."""
    bucket_name = '__{}_bucket'.format(self.colname)
    # Bucketizer maps each value to the index of its bin.
    bucketizer = Bucketizer(splits=bins, inputCol=self.colname,
                            outputCol=bucket_name, handleInvalid="skip")
    counted = (bucketizer.transform(self.data._kdf._sdf)
               .select(bucket_name)
               .groupby(bucket_name)
               .agg(F.count('*').alias('count'))
               .toPandas()
               .sort_values(by=bucket_name))
    # Spark drops empty buckets, so join the counts back onto the full
    # bin index and fill missing counts with zero.
    all_bins = pd.DataFrame({
        bucket_name: np.arange(0, len(bins) - 1),
        'bucket': bins[:-1],
    })
    data = all_bins.merge(counted, how='left',
                          on=[bucket_name]).fillna(0)[['count']]
    data.columns = [bucket_name]
    return data
def get_column_hist(self, column, bins):
    """Return a list of counts corresponding to `bins` for `column`.

    The bin list is padded with -/+ infinity if needed so out-of-range
    values can be bucketed, counted, and then discarded with a warning.
    """
    bins = list(copy.deepcopy(bins))  # take a copy since we are inserting and popping

    # NOTE: -np.inf == -float("inf"), so a single comparison suffices
    # (the original tested both forms redundantly).
    if bins[0] == -float("inf"):
        added_min = False
        bins[0] = -float("inf")
    else:
        added_min = True
        bins.insert(0, -float("inf"))

    if bins[-1] == float("inf"):
        added_max = False
        bins[-1] = float("inf")
    else:
        added_max = True
        bins.append(float("inf"))

    temp_column = self.spark_df.select(column).where(col(column).isNotNull())
    bucketizer = Bucketizer(splits=bins, inputCol=column, outputCol="buckets")
    bucketed = bucketizer.setHandleInvalid("skip").transform(temp_column)

    # Bucketizer follows the numpy convention lower_bound <= bin < upper_bound
    # for all but the last bin. Since our last bin is often +infinity, we
    # must separately count values exactly equal to the original upper bound.
    if added_max:  # IDIOM FIX: was `== True`
        upper_bound_count = temp_column.select(column).filter(
            col(column) == bins[-2]).count()
    else:
        upper_bound_count = 0

    hist_rows = bucketed.groupBy("buckets").count().collect()
    # Spark only returns buckets that have nonzero counts.
    hist = [0] * (len(bins) - 1)
    for row in hist_rows:
        hist[int(row["buckets"])] = row["count"]
    hist[-2] += upper_bound_count

    if added_min:
        below_bins = hist.pop(0)
        bins.pop(0)
        if below_bins > 0:
            logger.warning("Discarding histogram values below lowest bin.")

    if added_max:
        above_bins = hist.pop(-1)
        bins.pop(-1)
        if above_bins > 0:
            logger.warning("Discarding histogram values above highest bin.")

    return hist
def model_train(zipcode, complaint, day):
    """Train the 311 resolution-time pipeline and predict for one request.

    Returns a human-readable prediction string for the given zipcode,
    complaint group and day-of-week.
    """
    print("Loading Data ...")
    data311 = spark.read.format("csv").option("header", "true").load("Data_Final/*.csv")
    data311.registerTempTable("data311")

    # Derive numeric columns from the raw CSV strings.
    data311 = data311.withColumn("ResTimeH", data311.Resolution_Time_Hours.cast('int'))
    data311 = data311.withColumn('day_of_week', dayofweek(data311['Created Date']))
    data311 = data311.withColumn("Zip", data311["Incident Zip"].cast('int'))
    # Keep only plausible resolution times (positive and below 99 hours).
    data311 = data311.filter(data311.ResTimeH > 0)
    data311 = data311.filter(data311.ResTimeH < 99)

    # Label buckets: [0, 2) -> 0.0, [2, 6) -> 1.0, [6, inf) -> 2.0
    bucketizer = Bucketizer(splits=[0, 2, 6, float('Inf')],
                            inputCol="ResTimeH", outputCol="categories")
    data311 = bucketizer.setHandleInvalid("keep").transform(data311)

    X = data311['Zip', 'Complaint_Type_Groups', 'day_of_week', 'categories']
    X = X.filter(X["Zip"].isNotNull())
    X = X.filter(X["Complaint_Type_Groups"].isNotNull())
    X = X.filter(X["day_of_week"].isNotNull())

    stage_1 = StringIndexer(inputCol="Complaint_Type_Groups", outputCol="categoryIndex")
    stage_2 = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                     outputCols=["categoryVec"])
    stage_3 = VectorAssembler(inputCols=['Zip', 'day_of_week', 'categoryVec'],
                              outputCol="features")
    stage_4 = StandardScaler().setInputCol("features").setOutputCol(
        "Scaled_ip_features")
    stage_5 = LogisticRegression(labelCol="categories",
                                 featuresCol="Scaled_ip_features")

    # setup the pipeline and fit it
    pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4, stage_5])
    pipeline_model = pipeline.fit(X)

    # Score the single incoming request.
    input_variables = pd.DataFrame(
        [[int(zipcode), complaint, int(day)]],
        columns=['Zip', 'Complaint_Type_Groups', 'day_of_week'])
    input_variables = spark.createDataFrame(input_variables)
    transformed = pipeline_model.transform(input_variables)
    ans = transformed.select(collect_list('prediction')).first()[0]

    if ans[0] == 0.0:
        prediction = "Your complaint will be resolved within 2 hours."
    elif ans[0] == 1.0:
        prediction = "Your complaint will be resolved within 2-6 hours."
    else:
        prediction = "Your complaint will be resolved after 6 hours"
    return prediction
def transform_spark(data, columns, args, transformed_column_name):
    """Bucketize the configured numeric column and cast the bin index to int."""
    from pyspark.ml.feature import Bucketizer
    import pyspark.sql.functions as F

    bucketizer = Bucketizer(
        splits=args["bucket_boundaries"],
        inputCol=columns["num"],
        outputCol=transformed_column_name,
    )
    bucketized = bucketizer.transform(data)
    return bucketized.withColumn(
        transformed_column_name,
        F.col(transformed_column_name).cast("int"),
    )
def get_binned_dataframe(df, bin_name, variable_name, edges):
    '''
    Produces a dataframe with a new column `bin_name` corresponding to the
    variable `variable_name` binned with the given `edges`.
    '''
    # Pad the edges with -/+ infinity so every value lands in some bucket.
    padded_edges = [-float('inf'), *edges, float('inf')]
    bucketizer = Bucketizer(splits=padded_edges, inputCol=variable_name,
                            outputCol=bin_name)
    return bucketizer.transform(df)
def buckert(self, df, column):
    """Discretize `column` into fixed buckets with a Bucketizer."""
    edges = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
    # `splits` fixes the bucket boundaries for the discretization.
    bucketizer = Bucketizer(splits=edges, inputCol=column,
                            outputCol=column + '_bucketed')
    result = bucketizer.transform(df)
    print('Bucketizer output with %d buckets'
          % (len(bucketizer.getSplits()) - 1))
    return result
def test_save_and_load_on_nested_list_params(self):
    """splitsArray (a list of lists) must survive a save/load round trip."""
    temp_path = tempfile.mkdtemp()
    splitsArray = [
        [-float("inf"), 0.5, 1.4, float("inf")],
        [-float("inf"), 0.1, 1.2, float("inf")],
    ]
    bucketizer = Bucketizer(splitsArray=splitsArray,
                            inputCols=["values", "values"],
                            outputCols=["b1", "b2"])
    savePath = temp_path + "/bk"
    bucketizer.write().overwrite().save(savePath)
    loaded = Bucketizer.load(savePath)
    assert loaded.getSplitsArray() == splitsArray
def bucketizer_splits(dataFrame, inputCol,
                      splits=None):
    """Bucketize `inputCol` by the given split edges into a new
    `<inputCol>_bucketizer` column, printing the bucket count.

    `splits=None` selects the default edges
    [-inf, -0.5, 0.0, 0.5, inf] — using None avoids the original
    mutable-default-argument pitfall while keeping callers unchanged.
    """
    if splits is None:
        splits = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
    # `splits` fixes the bucket boundaries for the discretization.
    bucketizer = Bucketizer(splits=splits, inputCol=inputCol,
                            outputCol='%s_bucketizer' % (inputCol))
    bucketedData = bucketizer.transform(dataFrame)
    print('Bucketizer output with %d buckets'
          % (len(bucketizer.getSplits()) - 1))
    return bucketedData
def pre_processing(dataFrame):
    """Bucketize the "features" column and print/show the bucketed frame."""
    edges = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
    bucketizer = Bucketizer(splits=edges, inputCol="features",
                            outputCol="bucketedFeatures")
    # Transform original data into its bucket index.
    bucketed = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets"
          % (len(bucketizer.getSplits()) - 1))
    bucketed.show()
def add_age_id(spark, df, logger):
    """Calculate the age_id by splitting the visitor age into buckets"""
    age_edges = [float('-Inf'), 0, 2, 11, 16, 21, 26, 36, 46, 56, 66,
                 float('Inf')]
    bucketizer = Bucketizer(splits=age_edges, inputCol="i94bir",
                            outputCol="agebuckets")
    bucketed = bucketizer.setHandleInvalid("keep").transform(df)
    # Ages of -1 get the sentinel id 999 (presumably "unknown age" —
    # confirm against the upstream data dictionary).
    age_id_df = bucketed.withColumn(
        "age_id",
        when(col("i94bir") == -1, 999)
        .otherwise(col("agebuckets").cast(IntegerType())))
    logger.info("Added age_id")
    age_id_df.persist()
    return age_id_df
def main_emm_recode_demos(emm_raw_sdf):
    """Build the demographic-recode pipeline for the raw EMM frame.

    NOTE(review): the pipeline below is constructed but never fitted,
    transformed, or returned — the function returns None exactly as the
    original did; confirm whether the pipeline should be returned.
    """
    recode_demo_pipeline = Pipeline(stages=[
        # Age recodes at three granularities.
        Bucketizer(splits=[0, 2, 6, 12, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age', outputCol="age1"),
        Bucketizer(splits=[0, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age', outputCol="age7"),
        Bucketizer(splits=[0, 12, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age', outputCol="age8"),
        # Bucketizer(splits=[-25, 0, 25., 50., 75., 100., float('Inf')], inputCol='income_amt', outputCol="income1"),
        # Bucketizer(splits=[-25, 0, 25., 35., 50., 75., 100., float('Inf')], inputCol='income_amt', outputCol="income9"),
        IfElseTransformer(vals=[83], inputCol='hispanicid', outputCol='hispanic'),
        IfElseTransformer(vals=['M'], inputCol='gender_char', outputCol='gender'),
        IfElseTransformer(vals=[86], inputCol='raceid', outputCol='race_back'),
        IfElseTransformer(vals=[88], inputCol='raceid', outputCol='race_asian'),
        YesNoTransformer(inputCol='dvr_flag', outputCol='dvr'),
        YesNoTransformer(inputCol='cable_plus_flag', outputCol='cableplus'),
        YesNoTransformer(inputCol='video_game_owner_flag', outputCol='video_game'),
        YesNoTransformer(inputCol='internet_access_flag', outputCol='internet'),
        YesNoTransformer(inputCol='pay_cable_flag', outputCol='paycable'),
        YesNoTransformer(
            inputCol='television_high_definition_display_capability_flag',
            outputCol='hdtv'),
        YesNoTransformer(inputCol='alternative_delivery_flag',
                         outputCol='satellite'),
        IsInTransformer(isin_bins=[[0, 1], [2], [3, 4, 5, 6, 7], [8]],
                        inputCol='nielsen_occupation_code',
                        outputCol='occupation1'),
        IsInTransformer(isin_bins=[[0, 8, 9, 10, 11, 12], [13, 14, 15], [16],
                                   [18, 19, 20]],
                        inputCol='education_level_number',
                        outputCol='education7'),
        IsInTransformer(isin_bins=[[16, 18, 19, 20],
                                   [0, 8, 9, 10, 11, 12, 13, 14, 15]],
                        inputCol='education_level_number',
                        outputCol='education2'),
        IsInTransformer(isin_bins=[['A'], ['B'], ['C'], ['D']],
                        inputCol='county_size_code',
                        outputCol='county_size'),
    ])
    return None
def _bucketize_age_column(
        self, dataframe: DataFrame, input_col: str,
        output_col: str) -> Tuple[DataFrame, int, List[str]]:
    """Bucketize `input_col` using self.age_groups as split edges.

    Returns (bucketized dataframe, number of age groups, list of
    "[lo, hi)" labels — one per group).
    """
    bucketizer = Bucketizer(splits=self.age_groups, inputCol=input_col,
                            outputCol=output_col)
    # "keep" routes invalid values into an extra bucket instead of raising.
    bucketed = bucketizer.setHandleInvalid("keep").transform(dataframe)

    edges = list(bucketizer.getSplits())
    mapping = ["[{}, {})".format(lo, hi)
               for lo, hi in zip(edges, edges[1:])]
    return bucketed, len(mapping), mapping
def add_duration_id(spark, df, logger):
    """Calculate the visitduration_id by splitting the visit duration into buckets"""
    with_days = df.withColumn("duration_days", datediff("depdate", "arrdate"))
    bucketizer = Bucketizer(
        splits=[float('-Inf'), 0, 4, 8, 11, 15, 22, 29, float('Inf')],
        inputCol="duration_days", outputCol="ddbuckets")
    bucketed = bucketizer.setHandleInvalid("keep").transform(with_days)
    # A missing arrival or departure date maps to the sentinel id 999.
    dur_id_df = bucketed.withColumn(
        "visitduration_id",
        when(isnull(col("arrdate")) | isnull(col("depdate")), 999)
        .otherwise(col("ddbuckets").cast(IntegerType())))
    logger.info("Added duration_id")
    return dur_id_df
def _transform_data(self, data):
    """Apply the configured interactions, binning, per-feature
    transformations and column drops to `data`, returning the new frame."""
    data_handling = self.data_settings.get('data_handling', {})

    # Pairwise interactions: (a + 1) * (b + 1) for every unordered pair of
    # feature columns (the target column is excluded).
    if data_handling.get('interactions', False):
        feature_cols = list(data.columns)
        feature_cols.remove(self.model_settings['variable_to_predict'])
        for first in feature_cols:
            for second in feature_cols:
                if first == second:
                    continue
                name = str(first) + '_' + str(second)
                reverse_name = str(second) + '_' + str(first)
                # Skip if the symmetric interaction was already added.
                if reverse_name not in list(data.columns):
                    data = data.withColumn(
                        name, (F.col(first) + 1) * (F.col(second) + 1))

    # Binning: bucketize each configured feature; the outer edges are
    # widened by 1 so the observed min/max fall inside a bucket.
    for feature_to_bin in data_handling.get("features_to_bin", []):
        min_val = data.agg({feature_to_bin['name']: "min"}).collect()[0][0]
        max_val = data.agg({feature_to_bin['name']: "max"}).collect()[0][0]
        full_bins = [(min_val - 1)] + feature_to_bin['bins'] + [(max_val + 1)]
        bucketizer = Bucketizer(splits=full_bins,
                                inputCol=feature_to_bin['name'],
                                outputCol=feature_to_bin['name'] + '_binned')
        data = bucketizer.transform(data)

    # Per-feature transformations, e.g. "log" -> F.log(column).
    # NOTE(review): eval() on config-supplied method names — ensure the
    # configuration source is trusted.
    for feature_name in data_handling.get("features_handling", {}).keys():
        methods = data_handling["features_handling"][feature_name].get(
            "transformation", [])
        for method_name in methods:
            data = data.withColumn(
                feature_name + '_' + method_name,
                eval('F.' + method_name)(feature_name))

    # Drop the configured features that are actually present.
    features_to_remove = data_handling.get('features_to_remove', [])
    if len(features_to_remove) > 0:
        data = data.drop(*[feature for feature in features_to_remove
                           if feature in data.columns])
    return data
def bucketize(self, df, field):
    """Bucketize `field` into buckets roughly one standard deviation wide.

    Returns the dataframe with an added `<field>_bucketized` column.
    """
    df = df.withColumn(field, df[field].cast("double"))
    # Renamed from `max`/`min` to avoid shadowing the builtins.
    max_val = df.agg({field: "max"}).collect()[0][0]
    min_val = df.agg({field: "min"}).collect()[0][0]
    stddev = df.agg({field: "stddev"}).collect()[0][0]

    number_of_buckets = 1
    if stddev != 0:
        number_of_buckets = ((max_val - min_val) // (stddev))

    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float produces the same float64 dtype.
    buckets = np.arange(number_of_buckets, dtype=float).tolist()
    buckets = [-float('inf')] + buckets + [float('inf')]

    bucketizer = Bucketizer(splits=buckets, inputCol=field,
                            outputCol=field + '_bucketized')
    print("Bucketizing column: ", field)
    return bucketizer.transform(df)
def generateGroupedMeasureDataDict(self, measure_column):
    """Bin `measure_column` into 4 splits and return
    {"bins": {bin index -> split range}, "data": binned dataframe}."""
    splits_data = self.get_measure_column_splits(self._data_frame,
                                                 measure_column, 4)
    splits = splits_data["splits"]

    double_df = self._data_frame.withColumn(
        measure_column,
        self._data_frame[measure_column].cast(DoubleType()))
    bucketizer = Bucketizer(inputCol=measure_column, outputCol="BINNED_INDEX")
    bucketizer.setSplits(splits)
    binned_df = bucketizer.transform(double_df)

    # NOTE(review): distinct() imposes no ordering, so the pairing of bin
    # ids with split ranges below follows the collected order — same as
    # the original implementation; confirm this is intended.
    bin_ids = [int(row[0]) for row in
               binned_df.select("BINNED_INDEX").distinct().collect()]
    bin_ranges = dict(zip(bin_ids, splits_data["splits_range"]))
    return {"bins": bin_ranges, "data": binned_df}
def OneHotEncoder(self):
    """
    Converts string-type categories to indexes, splits continuous data
    interval to indexes, encodes the categorical data using One-Hot encoding.
    """
    # The same split points are applied to all three time columns.
    time_splits = [-float("inf"), 500, 1200, 1700, float("inf")]
    self.bucketizer = Bucketizer(
        splitsArray=[time_splits, time_splits, time_splits],
        inputCols=["CRSDepTime", "CRSArrTime", "DepTime"],
        outputCols=["CatCRSDepTime", "CatCRSArrTime", "CatDepTime"])

    # Index the route string; unseen labels are skipped.
    self.varIdxer = StringIndexer(
        inputCol="OrigDest", outputCol="IndOrigDest").setHandleInvalid("skip")

    self.oneHot = OneHotEncoder(
        inputCols=['Month', 'DayOfWeek', 'CatCRSDepTime', 'CatCRSArrTime',
                   'IndOrigDest', 'CatDepTime'],
        outputCols=['HotMonth', 'HotDayOfWeek', 'HotCRSCatDepTime',
                    'HotCRSCatArrTime', 'HotIndOrigDest', 'HotDepTime']
    ).setHandleInvalid("keep")
def transform_data(content_items):
    """Derive receive_date from time and replace days_from_eula with a
    character-encoded bin index."""
    content_items = content_items.withColumn(
        'receive_date', F.to_date(F.col('time'))).drop('time')

    bucketizer = Bucketizer(splits=DAYS_FROM_EULA_BINS,
                            inputCol='days_from_eula',
                            outputCol='days_from_eula_bin',
                            handleInvalid='skip')
    bucketized = bucketizer.transform(content_items).drop('days_from_eula')

    # Shift the bin index by the baseline before converting it to a character.
    content_items = bucketized.withColumn(
        'days_from_eula_bin',
        convert_to_char(
            F.col('days_from_eula_bin').astype('int') + INT_TO_CHAR_BASELINE))
    print('content item data transformed')
    return content_items
def test_measures(self, targetDimension, testMeasure):
    """Chi-square test of `testMeasure` against `targetDimension`.

    Measures with more than 10 rows are bucketed into 5 equal-width bins
    first; smaller measures are cross-tabbed directly. Returns a populated
    ChiSquareResult (params, contingency table, Cramer's V, split values).
    """
    chisquare_result = ChiSquareResult()
    df = self._data_frame.withColumn(
        testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
    measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)

    # BUG FIX: `splits` was only assigned on the >10-rows branch but used
    # unconditionally below, raising NameError for small measures.
    splits = None
    if float(measureSummaryDict["count"]) > 10:
        maxval = float(measureSummaryDict["max"])
        minval = float(measureSummaryDict["min"])
        step = (maxval - minval) / 5.0
        splits = [math.floor(minval), minval + step, minval + (step * 2),
                  minval + (step * 3), minval + (step * 4), math.ceil(maxval)]
        bucketizer = Bucketizer(splits=splits, inputCol=testMeasure,
                                outputCol="bucketedColumn")
        # Drop nulls before bucketing (Bucketizer default rejects them).
        bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
        pivot_table = bucketedData.stat.crosstab(
            "{}".format(targetDimension), 'bucketedColumn')
    else:
        pivot_table = df.stat.crosstab("{}".format(targetDimension),
                                       testMeasure)

    # Flatten the pivot counts column-major into a dense matrix.
    rdd = list(
        chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(),
                                 len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)

    freq_table = self._get_contingency_table_of_freq(pivot_table)
    if splits is not None:
        freq_table.update_col2_names(splits)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)

    # Cramer's V: sqrt(chi2 / (n * (min(rows, cols)))).
    stat_value = result.statistic
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    if splits is not None:
        chisquare_result.set_split_values([float(x) for x in splits])
    return chisquare_result
def bucketize(self, splits, target_col):
    """Bucketize `target_col` by `splits` on self._data_frame.

    The frame is replaced with the bucketized one; returns the name of the
    new bucket column.
    """
    self._bucket_name = 'bucket_' + target_col
    bucketizer = Bucketizer(inputCol=target_col, outputCol=self._bucket_name)
    # BUG FIX: sort a copy — splits.sort() mutated the caller's list.
    bucketizer.setSplits(sorted(splits))

    column_data_types = {
        field.name: field.dataType
        for field in self._data_frame.schema.fields
    }
    # BUG FIX: field.dataType is a DataType *instance*, so comparing it to
    # the DoubleType class was always unequal and forced a cast every time.
    if not isinstance(column_data_types[target_col], DoubleType):
        self._data_frame = self._data_frame.select(*[
            col(target_col).cast('double').alias(target_col)
            if column == target_col else column
            for column in self._data_frame.columns
        ])
    self._data_frame = bucketizer.transform(self._data_frame)
    return self._bucket_name
def discretize(self, test=False):
    """
    Discretize a continuous feature into a discrete one.

    Applies a Bucketizer to every configured feature whose
    config["discretize"]["apply"] flag is set, writing to
    `<feature>_discretized`; `test` selects test_data over train_data.
    """
    for feature in list(self.config_dict.keys()):
        # Only discretize features whose configuration asks for it.
        if not self.config_dict[feature]["discretize"]["apply"]:
            continue
        # Pad the configured edges with -/+ infinity.
        edges = ([-math.inf]
                 + self.config_dict[feature]["discretize"]["value"]
                 + [math.inf])
        bucketizer = Bucketizer(splits=edges, inputCol=feature,
                                outputCol="{}_discretized".format(feature))
        if test:
            self.test_data = bucketizer.transform(self.test_data)
        else:
            self.train_data = bucketizer.transform(self.train_data)
def bin_columns(self, colsToBin):
    """Replace each column in `colsToBin` with its 10-way binned labels.

    Works on either a pandas frame (self._pandas_flag) or a Spark frame.
    Failures are reported per column and skipped — best-effort by design.
    """
    for bincol in colsToBin:
        if self._pandas_flag:
            try:
                minval = float(min(self._data_frame[bincol]))
                maxval = float(max(self._data_frame[bincol]))
                n_split = 10
                splitsData = CommonUtils.get_splits(minval, maxval, n_split)
                self._data_frame[bincol] = pd.cut(
                    self._data_frame[bincol],
                    bins=splitsData["splits"],
                    labels=list(splitsData['bin_mapping'].values()),
                    right=True,
                    include_lowest=True)
            except Exception:
                print("Binning failed for : ", bincol)
        else:
            try:
                # BUG FIX: the select lists max() first, so the original
                # `minval, maxval = ...` unpacked the values swapped,
                # passing (max, min) to get_splits.
                maxval, minval = self._data_frame.select([
                    FN.max(bincol).alias("max"),
                    FN.min(bincol).alias("min")
                ]).collect()[0]
                n_split = 10
                splitsData = CommonUtils.get_splits(minval, maxval, n_split)
                splits = splitsData["splits"]

                self._data_frame = self._data_frame.withColumn(
                    bincol, self._data_frame[bincol].cast(DoubleType()))
                bucketizer = Bucketizer(inputCol=bincol,
                                        outputCol="BINNED_INDEX")
                bucketizer.setSplits(splits)
                self._data_frame = bucketizer.transform(self._data_frame)

                # Map each bin index back to its human-readable label.
                mapping_expr = create_map([
                    lit(x) for x in chain(
                        *list(splitsData["bin_mapping"].items()))
                ])
                self._data_frame = self._data_frame.withColumn(
                    bincol, mapping_expr.getItem(col("BINNED_INDEX")))
                self._data_frame = self._data_frame.select(self.columns)
            except Exception:
                print("Binning failed for : ", bincol)
def strat_histogram(sdf, colname, bins=10, categorical=False):
    """Histogram of `colname`, stratified by the frame's strata columns.

    Returns (start_values, result): the bin start values (or category
    values when categorical) and the per-(stratum, bin) counts.
    """
    if categorical:
        result = sdf.cols[colname]._value_counts(dropna=False, raw=True)
        if hasattr(result.index, 'levels'):
            # Multi-level index: complete the cartesian product so every
            # (stratum, category) pair appears, filling absences with 0.
            full_index = pd.MultiIndex.from_product(
                result.index.levels[:-1] +
                [result.reset_index()[colname].unique().tolist()],
                names=result.index.names)
            result = (pd.DataFrame(index=full_index)
                      .join(result.to_frame(), how='left')
                      .fillna(0)[result.name]
                      .astype(result.dtype))
        start_values = result.index.tolist()
    else:
        bucket_name = '__{}_bucket'.format(colname)
        strata = sdf._handy.strata_colnames
        colnames = strata + ensure_list(bucket_name)

        # Equal-width edges spanning the observed min..max.
        start_values = np.linspace(
            *sdf.agg(F.min(colname), F.max(colname))
            .rdd.map(tuple).collect()[0],
            bins + 1)
        bucketizer = Bucketizer(splits=start_values, inputCol=colname,
                                outputCol=bucket_name, handleInvalid="skip")
        result = (bucketizer.transform(sdf)
                  .select(colnames)
                  .groupby(colnames)
                  .agg(F.count('*').alias('count'))
                  .toPandas()
                  .sort_values(by=colnames))

        # Spark drops empty buckets — rebuild the full bin index.
        indexes = pd.DataFrame({bucket_name: np.arange(0, bins),
                                'bucket': start_values[:-1]})
        if len(strata):
            # Cross-join the bins with every distinct stratum combination.
            indexes = (indexes.assign(key=1)
                       .merge(result[strata].drop_duplicates().assign(key=1),
                              on='key')
                       .drop(columns=['key']))
        result = indexes.merge(
            result, how='left', on=strata + [bucket_name]
        ).fillna(0)[strata + [bucket_name, 'count']]
    return start_values, result
def test_list_list_float(self):
    """splitsArray coerces int entries to float and rejects non-numeric ones."""
    b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]])
    self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]])
    for inner in b.getSplitsArray():
        self.assertTrue(type(inner) == list)
        self.assertTrue(all(type(v) == float for v in inner))
    # Non-numeric entries must raise, whether top-level or nested.
    self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0]))
    self.assertRaises(TypeError,
                      lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]]))
def calc(df, column: str, bins=50, bin_width=None):
    """
    Calculate the buckets and weights for a histogram

    Returns
    -------
    (buckets, weights): tuple of two lists
    """
    # Validate the bin arguments: exactly one of `bins` / `bin_width`.
    if bins is None and bin_width is None:
        raise ValueError("Must indicate bins or bin_width")
    # BUG FIX: the original tested `bins is None and bin_width is not None`,
    # which raised for the VALID only-bin_width case and silently accepted
    # both being set. The conflict is when both are provided.
    if bins is not None and bin_width is not None:
        raise ValueError("bins and bin_width arguments are mutually exclusive")

    data = df[[column]]
    int_types = (IntegerType, LongType, FloatType, DoubleType, DecimalType)
    col_type = data.schema.fields[0].dataType
    if not isinstance(col_type, int_types):
        raise ValueError(
            "hist method requires numerical or datetime columns, nothing to plot."
        )

    # Calculate buckets
    buckets = utils.spark_buckets(data, column, bins=bins, bin_width=bin_width)

    # Calculate counts based on the buckets
    bucketizer = Bucketizer(splits=buckets, inputCol=column, outputCol="bucket")
    buckets_df = bucketizer.transform(data)
    histogram = buckets_df.groupby("bucket").agg(F.count(column).alias("count"))
    histogram = histogram.orderBy("bucket", ascending=True)

    # Create weights (locally)
    hist_pd = histogram.toPandas()

    # Create a new DF with complete buckets and empty counts if needed
    full_buckets = pd.DataFrame(columns=["bucket"])
    full_buckets["bucket"] = np.arange(len(buckets))
    full_buckets = full_buckets.merge(hist_pd, on="bucket", how="left")
    weights = full_buckets["count"]

    return buckets, weights
# Add a category column via pyspark.sql.DataFrame.withColumn manual_bucketized_features = features_with_route.withColumn( "ArrDelayBucket", dummy_function_udf(features['ArrDelay']) ) manual_bucketized_features.select("ArrDelay", "ArrDelayBucket").show() # # Use pysmark.ml.feature.Bucketizer to bucketize ArrDelay # from pyspark.ml.feature import Bucketizer splits = [-float("inf"), -15.0, 0, 30.0, float("inf")] bucketizer = Bucketizer( splits=splits, inputCol="ArrDelay", outputCol="ArrDelayBucket" ) ml_bucketized_features = bucketizer.transform(features_with_route) # Check the buckets out ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show() # # Extract features tools in with pyspark.ml.feature # from pyspark.ml.feature import StringIndexer, VectorAssembler # Turn category fields into categoric feature vectors, then drop intermediate fields for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "Route"]:
def main(base_path):
    """Streaming flight-delay classifier: read prediction requests from
    Kafka, score them with pre-trained Spark ML models, store to Mongo."""
    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        # Reuse an existing SparkContext/StreamingContext when the driver
        # (e.g. a notebook) already created them.
        sc and ssc
    except NameError:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)

    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load one saved StringIndexerModel per categorical column
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_models[column] = StringIndexerModel.load(
            string_indexer_model_path)

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = (
        "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
            base_path))
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(
        ssc,
        [PREDICTION_TOPIC],
        {
            "metadata.broker.list": BROKERS,
            "group.id": "0",
        }
    )

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(
            FlightDate=iso8601.parse_date(x['FlightDate']),
            Origin=x['Origin'],
            Distance=x['Distance'],
            DayOfMonth=x['DayOfMonth'],
            DayOfYear=x['DayOfYear'],
            UUID=x['UUID'],
            DepDelay=x['DepDelay'],
            DayOfWeek=x['DayOfWeek'],
            FlightNum=x['FlightNum'],
            Dest=x['Dest'],
            Timestamp=iso8601.parse_date(x['Timestamp']),
            Carrier=x['Carrier']
        )
    )
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #
    def classify_prediction_requests(rdd):
        """Score one micro-batch of prediction requests and persist them."""
        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        # Add a Route variable to replace FlightNum
        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest)
        )
        prediction_requests_with_route.show(6)

        # Turn category fields into categoric feature vectors using the
        # saved indexer models, then drop the intermediate fields.
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)
        final_vectorized_features.show()

        # Drop the individual index columns
        for column in ["Carrier_index", "Origin_index",
                       "Dest_index", "Route_index"]:
            final_vectorized_features = final_vectorized_features.drop(column)
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the
        # original fields back.
        predictions = predictions.drop("Features_vec")
        final_predictions = (predictions.drop("indices").drop("values")
                             .drop("rawPrediction").drop("probability"))
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
def main(base_path):
    """Train and evaluate the flight-delay random forest classifier.

    Reads flight features from ``{base_path}/data/...json``, bucketizes
    ArrDelay into classes, indexes categorical columns, assembles a feature
    vector, cross-validates a RandomForestClassifier, and persists the models
    plus pickled score/feature-importance logs under ``{base_path}/models/``.
    """
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Example input record:
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    # Explicit schema so spark.read.json does not infer types.
    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path
    )
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn(
        "CRSDepHourOfDay",
        hour(features.CRSDepTime)
    )
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay",
        hour(features.CRSArrTime)
    )
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer: buckets bounded at -15, 0 and 30 minutes of delay
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",
        outputCol="ArrDelayBucket"
    )

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features tools in with pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes; one fitted StringIndexer per column
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(
            inputCol=column,
            outputCol=column + "_index"
        )
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

        # Save the pipeline model so the prediction side can reuse it
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path,
            column
        )
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear",
        "CRSDepHourOfDay", "CRSArrHourOfDay"]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns now that they are inside the feature vector
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate classifier: loop split_count times for 4 metrics
    #
    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".format(
            i, split_count,
        ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])

        # Instantiate and fit random forest classifier on all the data
        from pyspark.ml.classification import RandomForestClassifier
        # maxBins must exceed the cardinality of every categorical feature
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path
        )
        model.write().overwrite().save(model_output_path)

        # Evaluate model using test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name
            )
            score = evaluator.evaluate(predictions)
            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names, feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]
        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy
        std_accuracy = np.std(metric_scores)
        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the score to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    # NOTE(review): pickle.load(open(...)) never closes the file handle, and
    # IOError will not catch a corrupt pickle — consider `with` + broader except.
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name] for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    # (first run: compare the entry with itself, so every delta is 0)
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature across the cross-validation runs
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(
        feature_importance_entry.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    # (first run: seed the "previous" log with this run's values, deltas are 0)
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(
        feature_deltas.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    # Display sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
from pyspark.ml.feature import VectorAssembler

# Combine the three integer columns into a single vector column.
va = VectorAssembler(inputCols=["int1", "int2", "int3"])
va.transform(fakeIntDF).show()

# COMMAND ----------

# A continuous column to discretize: ids 0..19 cast to double.
contDF = spark.range(20).selectExpr("cast(id as double)")

# COMMAND ----------

from pyspark.ml.feature import Bucketizer

# Hand-picked bucket boundaries for the Bucketizer.
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer(splits=bucketBorders, inputCol="id")
bucketer.transform(contDF).show()

# COMMAND ----------

from pyspark.ml.feature import QuantileDiscretizer

# Let Spark choose the boundaries: five quantile-based buckets.
bucketer = QuantileDiscretizer(numBuckets=5, inputCol="id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()

# COMMAND ----------

from pyspark.ml.feature import StandardScaler

# Scaler for the "features" vector column (fitted in a later cell).
sScaler = StandardScaler(inputCol="features")
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    spark_context = SparkContext(appName="BucketizerExample")
    sql_context = SQLContext(spark_context)

    # $example on$
    # Four buckets, each bounded by a consecutive pair of split points.
    split_points = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    rows = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
    input_frame = sql_context.createDataFrame(rows, ["features"])

    bucketizer = Bucketizer(splits=split_points, inputCol="features", outputCol="bucketedFeatures")

    # Replace each value in "features" with the index of its bucket.
    bucketed_frame = bucketizer.transform(input_frame)
    bucketed_frame.show()
    # $example off$

    spark_context.stop()
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    session = SparkSession.builder.appName("BucketizerExample").getOrCreate()

    # $example on$
    # Four buckets, each bounded by a consecutive pair of split points;
    # the infinite end-points catch the out-of-range values below.
    split_points = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    rows = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
    input_frame = session.createDataFrame(rows, ["features"])

    bucketizer = Bucketizer(splits=split_points, inputCol="features", outputCol="bucketedFeatures")

    # Replace each value in "features" with the index of its bucket.
    bucketed_frame = bucketizer.transform(input_frame)

    bucket_count = len(bucketizer.getSplits()) - 1
    print("Bucketizer output with %d buckets" % bucket_count)
    bucketed_frame.show()
    # $example off$

    session.stop()
def test_list_float(self):
    """Integer split points are coerced to floats; non-numeric ones raise."""
    bucketizer = Bucketizer(splits=[1, 4])
    splits = bucketizer.getSplits()
    self.assertEqual(splits, [1.0, 4.0])
    self.assertTrue(all(type(v) == float for v in splits))
    self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0]))