def bucketize(self, df, c):
    bucketizer4 = Bucketizer(
        splits=[-float("inf"), 0, 0.25, 0.5, 0.75, 1.0, float("inf")],
        inputCol=c, outputCol="B4_" + c)
    bucketizer10 = Bucketizer(
        splits=[-float("inf"), 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, float("inf")],
        inputCol=c, outputCol="B10_" + c)
    df = bucketizer4.transform(df.select('snapshotDate', 'ID', c))
    df = bucketizer10.transform(df)
    return df.select('snapshotDate', 'ID', 'B4_' + c, 'B10_' + c)
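# A minimal usage sketch for the two-granularity bucketize method above. Everything
# here is illustrative: it assumes an active SparkSession named `spark`, that
# `featurizer` is an instance of the class defining bucketize, and that the ratio
# column holds values in [0, 1].
sample = spark.createDataFrame(
    [("2020-01-31", 1, 0.42), ("2020-01-31", 2, 0.87)],
    ["snapshotDate", "ID", "utilization"])
featurizer.bucketize(sample, "utilization").show()
# B4_utilization / B10_utilization hold the quartile- and decile-style bucket indices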
def calc(df, col_x: str, col_y: str, bins=50, bin_width=None):
    """
    Calculate the buckets and weights for a 2D histogram

    Returns
    -------
    (buckets_x, buckets_y, weights): two lists of bucket edges and a masked weights matrix
    """
    data = df[[col_x, col_y]]

    # Only numerical column types are supported
    int_types = (IntegerType, LongType, FloatType, DoubleType, DecimalType)
    col_type = data.schema.fields[0].dataType
    if not isinstance(col_type, int_types):
        raise ValueError(
            "hist2d method requires numerical or datetime columns, nothing to plot."
        )

    # Calculate buckets
    buckets_x = utils.spark_buckets(data, col_x, bins=bins, bin_width=bin_width)
    buckets_y = utils.spark_buckets(data, col_y, bins=bins, bin_width=bin_width)

    # Generate DF with buckets
    bucketizer = Bucketizer(splits=buckets_x, inputCol=col_x, outputCol="bucket_x")
    buckets_df = bucketizer.transform(data)
    bucketizer = Bucketizer(splits=buckets_y, inputCol=col_y, outputCol="bucket_y")
    buckets_df = bucketizer.transform(buckets_df)

    histogram = buckets_df.groupby("bucket_x", "bucket_y").agg(
        F.count(col_x).alias("count"))

    # Create weights matrix (locally)
    hist_pd = histogram.toPandas()
    weights = np.zeros((bins, bins))
    for index, row in hist_pd.iterrows():
        weights[int(row["bucket_x"]), int(row["bucket_y"])] = row["count"]

    # Mask values that are zero so they look transparent
    weights = np.ma.masked_where(weights == 0, weights)

    return buckets_x, buckets_y, weights
def calc_histogram(self, bins):
    bucket_name = '__{}_bucket'.format(self.colname)
    # creates a Bucketizer to get corresponding bin of each value
    bucketizer = Bucketizer(splits=bins, inputCol=self.colname,
                            outputCol=bucket_name, handleInvalid="skip")
    # after bucketing values, groups and counts them
    result = (bucketizer.transform(self.data._kdf._sdf)
              .select(bucket_name)
              .groupby(bucket_name)
              .agg(F.count('*').alias('count'))
              .toPandas()
              .sort_values(by=bucket_name))

    # generates a pandas DF with one row for each bin
    # we need this as some of the bins may be empty
    indexes = pd.DataFrame({
        bucket_name: np.arange(0, len(bins) - 1),
        'bucket': bins[:-1]
    })
    # merges the bins with counts on it and fills remaining ones with zeros
    data = indexes.merge(result, how='left', on=[bucket_name]).fillna(0)[['count']]
    data.columns = [bucket_name]
    return data
def _compute_hist(sdf, bins):
    # 'sdf' is a Spark DataFrame that selects one column.
    assert isinstance(bins, (np.ndarray, np.generic))

    colname = sdf.columns[-1]
    bucket_name = "__{}_bucket".format(colname)
    # creates a Bucketizer to get corresponding bin of each value
    bucketizer = Bucketizer(
        splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
    )
    # after bucketing values, groups and counts them
    result = (
        bucketizer.transform(sdf)
        .select(bucket_name)
        .groupby(bucket_name)
        .agg(F.count("*").alias("count"))
        .toPandas()
        .sort_values(by=bucket_name)
    )

    # generates a pandas DF with one row for each bin
    # we need this as some of the bins may be empty
    indexes = pd.DataFrame({bucket_name: np.arange(0, len(bins) - 1), "bucket": bins[:-1]})

    # merges the bins with counts on it and fills remaining ones with zeros
    pdf = indexes.merge(result, how="left", on=[bucket_name]).fillna(0)[["count"]]
    pdf.columns = [bucket_name]

    return pdf[bucket_name]
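# A minimal usage sketch for _compute_hist (assumes an active SparkSession named
# `spark`; the single-column DataFrame and the bin edges below are illustrative):
import numpy as np

values = spark.createDataFrame([(1.0,), (2.5,), (3.0,), (7.5,)], ["value"])
edges = np.linspace(0.0, 10.0, 6)        # 5 equal-width buckets over [0, 10]
counts = _compute_hist(values, edges)    # pandas Series with one count per bucket
print(counts.tolist())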
def discrete(self):
    # Bucketizer
    from pyspark.ml.feature import Bucketizer, QuantileDiscretizer

    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
    data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
    dataFrame = self.session.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
    bucketedData.show()

    # QuantileDiscretizer
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    df = self.session.createDataFrame(data, ["id", "hour"])
    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
    result = discretizer.fit(df).transform(df)
    result.show()
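# For reference, with the splits above the sample values land in these buckets
# (a sketch of the expected result, not output captured from a real run):
#   -999.9 -> 0.0, -0.5 -> 1.0, -0.3 -> 1.0, 0.0 -> 2.0, 0.2 -> 2.0, 999.9 -> 3.0
# Bucketizer assigns each value to the half-open interval [splits[i], splits[i+1]),
# with the last bucket also including its upper bound.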
def get_binned_stat(self, df, colname, col_stat, n_split=10):
    splits = CommonUtils.frange(col_stat["min"], col_stat["max"], num_steps=n_split)
    splits = sorted(splits)
    splits_range = [(splits[idx], splits[idx + 1]) for idx in range(len(splits) - 1)]

    splits_data = {"splits": splits, "splits_range": splits_range}
    splits = splits_data["splits"]

    double_df = df.withColumn(colname, df[colname].cast(DoubleType()))
    bucketizer = Bucketizer(inputCol=colname, outputCol="BINNED_INDEX")
    bucketizer.setSplits(splits)
    binned_df = bucketizer.transform(double_df)
    histogram_df = binned_df.groupBy("BINNED_INDEX").count().toPandas()

    str_splits_range = [" to ".join([str(x[0]), str(x[1])]) for x in splits_range]
    bin_name_dict = dict(zip(range(len(splits_range)), str_splits_range))
    bin_name_dict[n_split] = "null"

    histogram_df["orderIndex"] = histogram_df["BINNED_INDEX"].apply(
        lambda x: n_split if pd.isnull(x) else x)
    histogram_df["bins"] = histogram_df["orderIndex"].apply(
        lambda x: bin_name_dict[int(x)])
    relevant_df = histogram_df[["bins", "count", "orderIndex"]]
    histogram_dict = relevant_df.T.to_dict().values()
    histogram_dict = sorted(histogram_dict, key=lambda x: x["orderIndex"])
    output = []
    for val in histogram_dict:
        output.append({"name": val["bins"], "value": val["count"]})
    return output
def discretize(self, test=False):
    """
    Discretize a continuous feature into a discrete one
    """
    for col in list(self.config_dict.keys()):
        # check if the discretizer transformation needs to be applied
        if self.config_dict[col]["discretize"]["apply"]:
            splits = self.config_dict[col]["discretize"]["value"]
            splits = [-math.inf] + splits + [math.inf]
            bucketizer = Bucketizer(splits=splits, inputCol=col,
                                    outputCol="{}_discretized".format(col))
            if test:
                self.test_data = bucketizer.transform(self.test_data)
            else:
                self.train_data = bucketizer.transform(self.train_data)
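# A hypothetical config_dict entry that the discretize() above would act on; the
# schema is inferred from the keys accessed in the method, not from any project spec:
# config_dict = {
#     "age": {
#         "discretize": {"apply": True, "value": [18, 35, 50, 65]},
#     },
# }
# With that entry, "age" is bucketed into (-inf, 18), [18, 35), [35, 50), [50, 65)
# and [65, +inf) in a new "age_discretized" column.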
def get_binned_dataframe(df, bin_name, variable_name, edges):
    '''
    Produces a dataframe with a new column `bin_name` corresponding to the
    variable `variable_name` binned with the given `edges`.
    '''
    splits = [-float('inf')] + list(edges) + [float('inf')]
    bucketizer = Bucketizer(splits=splits, inputCol=variable_name, outputCol=bin_name)
    binnedDF = bucketizer.transform(df)
    return binnedDF
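# Minimal usage sketch for get_binned_dataframe (assumes an active SparkSession
# named `spark`; the column name and edges are illustrative):
df = spark.createDataFrame([(12.0,), (45.0,), (70.0,)], ["pt"])
binned = get_binned_dataframe(df, bin_name="pt_bin", variable_name="pt", edges=[20.0, 50.0])
binned.show()  # pt_bin is 0.0 for pt < 20, 1.0 for 20 <= pt < 50, 2.0 otherwise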
def transform_spark(data, columns, args, transformed_column_name):
    from pyspark.ml.feature import Bucketizer
    import pyspark.sql.functions as F

    new_b = Bucketizer(
        splits=args["bucket_boundaries"],
        inputCol=columns["num"],
        outputCol=transformed_column_name,
    )
    return new_b.transform(data).withColumn(
        transformed_column_name, F.col(transformed_column_name).cast("int")
    )
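# Sketch of how transform_spark might be called; only the "num" and
# "bucket_boundaries" keys come from the function above, while the dictionary
# values, the input DataFrame and the SparkSession `spark` are assumptions:
bucketed = transform_spark(
    data=spark.createDataFrame([(3.0,), (17.0,)], ["price"]),
    columns={"num": "price"},
    args={"bucket_boundaries": [0.0, 10.0, 100.0]},
    transformed_column_name="price_bucket",
)
bucketed.show()  # price_bucket holds the integer bucket index (0 or 1 here)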
def buckert(self, df, column):
    """
    Bucketize a column using the specified boundaries (Bucketizer).
    """
    splits = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
    # Discretize by bucketing with the given boundaries; splits sets the bucket edges
    bucketizer = Bucketizer(splits=splits, inputCol=column,
                            outputCol=column + '_bucketed')
    bucketedData = bucketizer.transform(df)
    print('Bucketizer output with %d buckets' % (len(bucketizer.getSplits()) - 1))
    return bucketedData
def bucketizer_splits(dataFrame, inputCol,
                      splits=[-float('inf'), -0.5, 0.0, 0.5, float('inf')]):
    # Discretize by bucketing with the given boundaries; splits sets the bucket edges
    bucketizer = Bucketizer(splits=splits, inputCol=inputCol,
                            outputCol='%s_bucketizer' % (inputCol))
    bucketedData = bucketizer.transform(dataFrame)
    print('Bucketizer output with %d buckets' % (len(bucketizer.getSplits()) - 1))
    return bucketedData
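# Minimal usage sketch for bucketizer_splits with its default splits (assumes an
# active SparkSession named `spark`; the data is illustrative):
df = spark.createDataFrame([(-1.2,), (0.1,), (0.7,)], ["score"])
bucketizer_splits(df, "score").show()
# score_bucketizer: -1.2 -> 0.0, 0.1 -> 2.0, 0.7 -> 3.0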
def pre_processing(dataFrame):
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
    bucketedData.show()
def _transform_data(self, data):
    data_handling = self.data_settings.get('data_handling', {})

    # interactions
    if data_handling.get('interactions', False):
        columns_list = list(data.columns)
        columns_list.remove(self.model_settings['variable_to_predict'])
        for col1 in columns_list:
            for col2 in columns_list:
                if col1 != col2:
                    name = str(col1) + '_' + str(col2)
                    reverse_name = str(col2) + '_' + str(col1)
                    if reverse_name not in list(data.columns):
                        data = data.withColumn(name, (F.col(col1) + 1) * (F.col(col2) + 1))

    # binning
    for feature_to_bin in data_handling.get("features_to_bin", []):
        min_val = data.agg({feature_to_bin['name']: "min"}).collect()[0][0]
        max_val = data.agg({feature_to_bin['name']: "max"}).collect()[0][0]
        full_bins = [(min_val - 1)] + feature_to_bin['bins'] + [(max_val + 1)]
        bucketizer = Bucketizer(splits=full_bins,
                                inputCol=feature_to_bin['name'],
                                outputCol=feature_to_bin['name'] + '_binned')
        data = bucketizer.transform(data)

    # transformation
    for col in data_handling.get("features_handling", {}).keys():
        transformation_array = data_handling["features_handling"][col].get(
            "transformation", [])
        # applying transformations
        for feature_transformation_method in transformation_array:
            data = data.withColumn(
                col + '_' + feature_transformation_method,
                eval('F.' + feature_transformation_method)(col))

    # dropping features
    features_to_remove = data_handling.get('features_to_remove', [])
    if len(features_to_remove) > 0:
        data = data.drop(*[
            feature for feature in features_to_remove if feature in data.columns
        ])
    return data
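# A hypothetical data_handling settings block that would exercise every branch of
# _transform_data above; the key names mirror the lookups in the method and are an
# assumption, not the project's documented schema:
# data_handling = {
#     "interactions": True,
#     "features_to_bin": [{"name": "income", "bins": [30000, 60000, 90000]}],
#     "features_handling": {"income": {"transformation": ["log", "sqrt"]}},
#     "features_to_remove": ["raw_id"],
# }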
def generateGroupedMeasureDataDict(self, measure_column):
    splits_data = self.get_measure_column_splits(self._data_frame, measure_column, 4)
    splits = splits_data["splits"]
    double_df = self._data_frame.withColumn(
        measure_column, self._data_frame[measure_column].cast(DoubleType()))
    bucketizer = Bucketizer(inputCol=measure_column, outputCol="BINNED_INDEX")
    bucketizer.setSplits(splits)
    binned_df = bucketizer.transform(double_df)
    unique_bins = binned_df.select("BINNED_INDEX").distinct().collect()
    unique_bins = [int(x[0]) for x in unique_bins]
    binned_index_dict = dict(zip(unique_bins, splits_data["splits_range"]))
    output = {"bins": binned_index_dict, "data": binned_df}
    return output
def bucketize(self, df, field):
    df = df.withColumn(field, df[field].cast("double"))
    max = df.agg({field: "max"}).collect()[0][0]
    min = df.agg({field: "min"}).collect()[0][0]
    stddev = df.agg({field: "stddev"}).collect()[0][0]

    number_of_buckets = 1
    if stddev != 0:
        number_of_buckets = (max - min) // stddev

    # np.float was deprecated in NumPy 1.20 and later removed; plain float is equivalent
    buckets = np.arange(number_of_buckets, dtype=float).tolist()
    buckets = [-float('inf')] + buckets + [float('inf')]

    bucketizer = Bucketizer(splits=buckets, inputCol=field,
                            outputCol=field + '_bucketized')
    print("Bucketizing column: ", field)
    bucketized_features = bucketizer.transform(df)
    return bucketized_features
def transform_data(content_items):
    content_items = content_items.withColumn(
        'receive_date', F.to_date(F.col('time'))).drop('time')

    bucketizer = Bucketizer(splits=DAYS_FROM_EULA_BINS,
                            inputCol='days_from_eula',
                            outputCol='days_from_eula_bin',
                            handleInvalid='skip')
    content_items = bucketizer.transform(content_items) \
        .drop('days_from_eula') \
        .withColumn(
            'days_from_eula_bin',
            convert_to_char(F.col('days_from_eula_bin').astype('int') + INT_TO_CHAR_BASELINE)
        )
    print('content item data transformed')
    return content_items
def bucketize(self, splits, target_col):
    self._bucket_name = 'bucket_' + target_col
    bucketizer = Bucketizer(inputCol=target_col, outputCol=self._bucket_name)
    splits.sort()
    bucketizer.setSplits(splits)
    column_data_types = {
        field.name: field.dataType
        for field in self._data_frame.schema.fields
    }
    # field.dataType is an instance, so compare against DoubleType(), not the class
    if column_data_types[target_col] != DoubleType():
        self._data_frame = self._data_frame.select(*[
            col(target_col).cast('double').alias(target_col)
            if column == target_col else column
            for column in self._data_frame.columns
        ])
    self._data_frame = bucketizer.transform(self._data_frame)
    return self._bucket_name
def test_measures(self, targetDimension, testMeasure):
    chisquare_result = ChiSquareResult()
    df = self._data_frame.withColumn(
        testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
    measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
    if float(measureSummaryDict["count"]) > 10:
        maxval = float(measureSummaryDict["max"])
        minval = float(measureSummaryDict["min"])
        step = (maxval - minval) / 5.0
        splits = [
            math.floor(minval), minval + step, minval + (step * 2),
            minval + (step * 3), minval + (step * 4), math.ceil(maxval)
        ]
        bucketizer = Bucketizer(splits=splits,
                                inputCol=testMeasure,
                                outputCol="bucketedColumn")
        # bucketedData = bucketizer.transform(df)
        bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
        pivot_table = bucketedData.stat.crosstab(
            "{}".format(targetDimension), 'bucketedColumn')
    else:
        pivot_table = df.stat.crosstab("{}".format(targetDimension), testMeasure)

    rdd = list(chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(),
                                 len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)

    freq_table = self._get_contingency_table_of_freq(pivot_table)
    freq_table.update_col2_names(splits)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)

    # Cramer's V calculation
    stat_value = result.statistic
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values), len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    chisquare_result.set_split_values([float(x) for x in splits])
    # chisquare_result.set_buckeddata(bucketedData)
    return chisquare_result
def bin_columns(self, colsToBin):
    for bincol in colsToBin:
        if self._pandas_flag:
            try:
                minval, maxval = float(min(self._data_frame[bincol])), float(
                    max(self._data_frame[bincol]))
                n_split = 10
                splitsData = CommonUtils.get_splits(minval, maxval, n_split)
                self._data_frame[bincol] = pd.cut(
                    self._data_frame[bincol],
                    bins=splitsData["splits"],
                    labels=list(splitsData['bin_mapping'].values()),
                    right=True,
                    include_lowest=True)
            except Exception as e:
                print("Binning failed for : ", bincol)
        else:
            try:
                # max is aliased first in the select, so unpack in that order
                maxval, minval = self._data_frame.select([
                    FN.max(bincol).alias("max"),
                    FN.min(bincol).alias("min")
                ]).collect()[0]
                n_split = 10
                splitsData = CommonUtils.get_splits(minval, maxval, n_split)
                splits = splitsData["splits"]
                self._data_frame = self._data_frame.withColumn(
                    bincol, self._data_frame[bincol].cast(DoubleType()))
                bucketizer = Bucketizer(inputCol=bincol, outputCol="BINNED_INDEX")
                bucketizer.setSplits(splits)
                self._data_frame = bucketizer.transform(self._data_frame)
                mapping_expr = create_map([
                    lit(x) for x in chain(*list(splitsData["bin_mapping"].items()))
                ])
                # self._data_frame = self._data_frame.withColumnRenamed("bincol", bincol + "JJJLLLLKJJ")
                self._data_frame = self._data_frame.withColumn(
                    bincol, mapping_expr.getItem(col("BINNED_INDEX")))
                self._data_frame = self._data_frame.select(self.columns)
            except Exception as e:
                print("Binning failed for : ", bincol)
def strat_histogram(sdf, colname, bins=10, categorical=False):
    if categorical:
        result = sdf.cols[colname]._value_counts(dropna=False, raw=True)
        if hasattr(result.index, 'levels'):
            indexes = pd.MultiIndex.from_product(
                result.index.levels[:-1] +
                [result.reset_index()[colname].unique().tolist()],
                names=result.index.names)
            result = (pd.DataFrame(index=indexes)
                      .join(result.to_frame(), how='left')
                      .fillna(0)[result.name]
                      .astype(result.dtype))
        start_values = result.index.tolist()
    else:
        bucket_name = '__{}_bucket'.format(colname)
        strata = sdf._handy.strata_colnames
        colnames = strata + ensure_list(bucket_name)
        start_values = np.linspace(
            *sdf.agg(F.min(colname), F.max(colname)).rdd.map(tuple).collect()[0],
            bins + 1)
        bucketizer = Bucketizer(splits=start_values,
                                inputCol=colname,
                                outputCol=bucket_name,
                                handleInvalid="skip")
        result = (bucketizer.transform(sdf)
                  .select(colnames)
                  .groupby(colnames)
                  .agg(F.count('*').alias('count'))
                  .toPandas()
                  .sort_values(by=colnames))
        indexes = pd.DataFrame({
            bucket_name: np.arange(0, bins),
            'bucket': start_values[:-1]
        })
        if len(strata):
            indexes = (indexes.assign(key=1)
                       .merge(result[strata].drop_duplicates().assign(key=1), on='key')
                       .drop(columns=['key']))
        result = indexes.merge(result, how='left',
                               on=strata + [bucket_name]).fillna(0)[strata + [bucket_name, 'count']]
    return start_values, result
def calc(df, column: str, bins=50, bin_width=None):
    """
    Calculate the buckets and weights for a histogram

    Returns
    -------
    (buckets, weights): tuple of two lists
    """
    if bins is None and bin_width is None:
        raise ValueError("Must indicate bins or bin_width")
    elif bins is not None and bin_width is not None:
        # both supplied explicitly: the two ways of sizing buckets cannot be combined
        raise ValueError("bins and bin_width arguments are mutually exclusive")

    # Calculate buckets
    data = df[[column]]

    int_types = (IntegerType, LongType, FloatType, DoubleType, DecimalType)
    col_type = data.schema.fields[0].dataType
    if not isinstance(col_type, int_types):
        raise ValueError(
            "hist method requires numerical or datetime columns, nothing to plot."
        )

    # Calculate buckets
    buckets = utils.spark_buckets(data, column, bins=bins, bin_width=bin_width)

    # Calculate counts based on the buckets
    bucketizer = Bucketizer(splits=buckets, inputCol=column, outputCol="bucket")
    buckets_df = bucketizer.transform(data)
    histogram = buckets_df.groupby("bucket").agg(F.count(column).alias("count"))
    histogram = histogram.orderBy("bucket", ascending=True)

    # Create weights (locally)
    hist_pd = histogram.toPandas()

    # Create a new DF with complete buckets and empty counts if needed
    full_buckets = pd.DataFrame(columns=["bucket"])
    full_buckets["bucket"] = np.arange(len(buckets))
    full_buckets = full_buckets.merge(hist_pd, on="bucket", how="left")
    weights = full_buckets["count"]

    return buckets, weights
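# Sketch of how the (buckets, weights) pair could feed a bar chart; matplotlib and
# the `sdf`/"amount" inputs are illustrative assumptions, not part of the code above:
import matplotlib.pyplot as plt

buckets, weights = calc(sdf, "amount", bins=20)
widths = [hi - lo for lo, hi in zip(buckets[:-1], buckets[1:])]
plt.bar(buckets[:-1], weights[:len(widths)], width=widths, align="edge")
plt.show()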
def biVariate(self, columns=None, buckets=5):
    if not columns:
        columns = self.data.drop(self.targetColumn, self.idColumn).columns

    # Implements the functionality of pd.melt: transforms the dataframe from wide to long.
    # Create and explode an array of (column_name, column_value) structs
    melter = explode(
        array([
            struct(lit(colnames).alias("key"), col(colnames).alias("val"))
            for colnames in columns
        ])).alias("kvs")

    long_data = self.data.select(melter, self.targetColumn) \
        .selectExpr(self.targetColumn, "kvs.key AS key", "kvs.val AS val")

    observations = self.count
    split_val = [
        i / buckets
        for i in range(buckets, (observations * buckets) + 1, observations - 1)
    ]
    bucketizer = Bucketizer(splits=split_val, inputCol="row", outputCol="bucket")

    biv = bucketizer.transform(
        long_data.select(
            self.targetColumn, 'key', 'val',
            row_number().over(Window.partitionBy('key').orderBy('val')).alias('row')
        )
    ) \
        .groupby('key', 'bucket') \
        .agg(
            count('*').alias('num_records'),
            min('val').alias('bucket_min'),
            max('val').alias('bucket_max'),
            sum('target').alias('ones')
        ) \
        .withColumn('event_rate', 100 * col('ones') / col('num_records')) \
        .orderBy('key', 'bucket')

    return biv.toPandas()
def bin_columns(self, colsToBin):
    for bincol in colsToBin:
        # max is aliased first in the select, so unpack in that order
        maxval, minval = self._data_frame.select(
            [FN.max(bincol).alias("max"), FN.min(bincol).alias("min")]).collect()[0]
        n_split = 10
        splitsData = CommonUtils.get_splits(minval, maxval, n_split)
        splits = splitsData["splits"]
        self._data_frame = self._data_frame.withColumn(
            bincol, self._data_frame[bincol].cast(DoubleType()))
        bucketizer = Bucketizer(inputCol=bincol, outputCol="BINNED_INDEX")
        bucketizer.setSplits(splits)
        self._data_frame = bucketizer.transform(self._data_frame)
        mapping_expr = create_map(
            [lit(x) for x in chain(*splitsData["bin_mapping"].items())])
        # no-op unless a column literally named "bincol" exists
        self._data_frame = self._data_frame.withColumnRenamed(
            "bincol", bincol + "JJJLLLLKJJ")
        self._data_frame = self._data_frame.withColumn(
            bincol, mapping_expr.getItem(col("BINNED_INDEX")))
        self._data_frame = self._data_frame.select(self.columns)
def spark_cut(df, col_name, bins, labels):
    """
    Turns a continuous variable into a categorical one.

    :param df: a spark dataframe
    :param col_name: the continuous column to be categorized
    :param bins: lower and upper bounds; must be sorted ascending and encompass
        the column's entire range
    :param labels: labels for each category; should be len(bins) - 1
    :return: a spark dataframe with the specified column binned and labeled as specified
    """
    bucketizer = Bucketizer(splits=bins, inputCol=col_name, outputCol=col_name + '_binned')
    df = bucketizer.transform(df)
    label_array = F.array(*(F.lit(label) for label in labels))
    df = df.withColumn(
        col_name,
        label_array.getItem(F.col(col_name + '_binned').cast('integer')))
    df = df.drop(col_name + '_binned')
    return df
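# Minimal usage sketch for spark_cut (assumes an active SparkSession named `spark`;
# the ages, bin edges and labels are illustrative):
people = spark.createDataFrame([(15.0,), (34.0,), (72.0,)], ["age"])
people = spark_cut(
    people, "age",
    bins=[0.0, 18.0, 65.0, 120.0],
    labels=["minor", "adult", "senior"],
)
people.show()  # age now holds "minor", "adult" or "senior"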
def main(self, sc, *args):
    """
    For each input file initiated (train and test), apply the same set of transformations.
    """
    sqlContext = SQLContext(sc)

    # For each key in the output dictionary of the Initiate task, i.e. train and test
    for inputFile in Initiate(self.input_file, self.output_path).output():
        df = sqlContext.read.csv(
            Initiate(self.input_file, self.output_path).output()[inputFile].path,
            sep=",", header=True, inferSchema=True)

        # Select final list of features
        list_features = ["Age", "Sex_indexed", "Fare", "Survived"]
        df = df.select(*list_features)

        # Replace missing values
        cols_missing = ["Age"]
        for col in cols_missing:
            imputer = Imputer(
                inputCols=[col],
                outputCols=["{}_replace_missings".format(col)]).setMissingValue(26.0)
            df = imputer.fit(df).transform(df)

        # Discretize
        cols_disc = {
            "Age_replace_missings": [-math.inf, 0.83, 21.0, 26.0, 33.0, 71.0, math.inf],
            "Fare": [-math.inf, 7.225, 8.122, 26.0, 83.475, math.inf],
        }
        for col in cols_disc:
            bucketizer = Bucketizer(splits=cols_disc[col],
                                    inputCol=col,
                                    outputCol="{}_discretized".format(col))
            df = bucketizer.transform(df)

        df.write.csv(self.output()[inputFile].path, header=True)
def bucketize(self, splits, target_col):
    self._bucket_name = 'bucket_' + target_col
    if self._pandas_flag:
        '''TO DO: this method is not being used anywhere'''
        pass
    else:
        bucketizer = Bucketizer(inputCol=target_col, outputCol=self._bucket_name)
        splits.sort()
        bucketizer.setSplits(splits)
        column_data_types = {
            field.name: field.dataType
            for field in self._data_frame.schema.fields
        }
        # field.dataType is an instance, so compare against DoubleType(), not the class
        if column_data_types[target_col] != DoubleType():
            self._data_frame = self._data_frame.select(*[
                col(target_col).cast('double').alias(target_col)
                if column == target_col else column
                for column in self._data_frame.columns
            ])
        self._data_frame = bucketizer.transform(self._data_frame)
    return self._bucket_name
def main(base_path): APP_NAME = "train_spark_mllib_model.py" # If there is no SparkSession, create the environment try: sc and spark except NameError as e: import findspark findspark.init() import pyspark import pyspark.sql sc = pyspark.SparkContext() spark = pyspark.sql.SparkSession(sc).builder.appName( APP_NAME).getOrCreate() # # { # "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00", # "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0, # "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS" # } # from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField from pyspark.sql.functions import udf schema = StructType([ StructField("ArrDelay", DoubleType(), True), StructField("CRSArrTime", TimestampType(), True), StructField("CRSDepTime", TimestampType(), True), StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Route", StringType(), True), StructField("TailNum", StringType(), True), StructField("EngineManufacturer", StringType(), True), StructField("EngineModel", StringType(), True), StructField("Manufacturer", StringType(), True), StructField("ManufacturerYear", StringType(), True), StructField("OwnerState", StringType(), True), ]) input_path = "{}/data/simple_flight_delay_features_airplanes.json".format( base_path) features = spark.read.json(input_path, schema=schema) features.first() # # Add the hour of day of scheduled arrival/departure # from pyspark.sql.functions import hour features_with_hour = features.withColumn("CRSDepHourOfDay", hour(features.CRSDepTime)) features_with_hour = features_with_hour.withColumn( "CRSArrHourOfDay", hour(features.CRSArrTime)) features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show() # # Check for nulls in features before using Spark ML # null_counts = [ (column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns ] cols_with_nulls = filter(lambda x: x[1] > 0, null_counts) print("\nNull Value Report") print("-----------------") print(tabulate(cols_with_nulls, headers=["Column", "Nulls"])) # # Use pysmark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2) # from pyspark.ml.feature import Bucketizer # Setup the Bucketizer splits = [-float("inf"), -15.0, 0, 30.0, float("inf")] arrival_bucketizer = Bucketizer(splits=splits, inputCol="ArrDelay", outputCol="ArrDelayBucket") # Save the model arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path) # Apply the model ml_bucketized_features = arrival_bucketizer.transform(features_with_hour) ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show() # # Extract features tools in with pyspark.ml.feature # from pyspark.ml.feature import StringIndexer, VectorAssembler # Turn category fields into indexes string_columns = ["Carrier", "Origin", 
"Dest", "Route", "TailNum"] for column in string_columns: string_indexer = StringIndexer(inputCol=column, outputCol=column + "_index") string_indexer_model = string_indexer.fit(ml_bucketized_features) ml_bucketized_features = string_indexer_model.transform( ml_bucketized_features) # Save the pipeline model string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format( base_path, column) string_indexer_model.write().overwrite().save( string_indexer_output_path) # Combine continuous, numeric fields with indexes of nominal ones # ...into one feature vector numeric_columns = [ "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay", "CRSArrHourOfDay" ] index_columns = [column + "_index" for column in string_columns] vector_assembler = VectorAssembler(inputCols=numeric_columns + index_columns, outputCol="Features_vec") final_vectorized_features = vector_assembler.transform( ml_bucketized_features) # Save the numeric vector assembler vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format( base_path) vector_assembler.write().overwrite().save(vector_assembler_path) # Drop the index columns for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # # Cross validate, train and evaluate classifier: loop 5 times for 4 metrics # from collections import defaultdict scores = defaultdict(list) feature_importances = defaultdict(list) metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"] split_count = 3 for i in range(1, split_count + 1): print("\nRun {} out of {} of test/train splits in cross validation...". format( i, split_count, )) # Test/train split training_data, test_data = final_vectorized_features.randomSplit( [0.8, 0.2]) # Instantiate and fit random forest classifier on all the data from pyspark.ml.classification import RandomForestClassifier rfc = RandomForestClassifier( featuresCol="Features_vec", labelCol="ArrDelayBucket", predictionCol="Prediction", maxBins=4896, ) model = rfc.fit(training_data) # Save the new model over the old one model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format( base_path) model.write().overwrite().save(model_output_path) # Evaluate model using test data predictions = model.transform(test_data) # Evaluate this split's results for each metric from pyspark.ml.evaluation import MulticlassClassificationEvaluator for metric_name in metric_names: evaluator = MulticlassClassificationEvaluator( labelCol="ArrDelayBucket", predictionCol="Prediction", metricName=metric_name) score = evaluator.evaluate(predictions) scores[metric_name].append(score) print("{} = {}".format(metric_name, score)) # # Collect feature importances # feature_names = vector_assembler.getInputCols() feature_importance_list = model.featureImportances for feature_name, feature_importance in zip(feature_names, feature_importance_list): feature_importances[feature_name].append(feature_importance) # # Evaluate average and STD of each metric and print a table # import numpy as np score_averages = defaultdict(float) # Compute the table data average_stds = [] # ha for metric_name in metric_names: metric_scores = scores[metric_name] average_accuracy = sum(metric_scores) / len(metric_scores) score_averages[metric_name] = average_accuracy std_accuracy = np.std(metric_scores) average_stds.append((metric_name, average_accuracy, std_accuracy)) # Print the table print("\nExperiment Log") print("--------------") 
print(tabulate(average_stds, headers=["Metric", "Average", "STD"])) # # Persist the score to a sccore log that exists between runs # import pickle # Load the score log or initialize an empty one try: score_log_filename = "{}/models/score_log.pickle".format(base_path) score_log = pickle.load(open(score_log_filename, "rb")) if not isinstance(score_log, list): score_log = [] except IOError: score_log = [] # Compute the existing score log entry score_log_entry = { metric_name: score_averages[metric_name] for metric_name in metric_names } # Compute and display the change in score for each metric try: last_log = score_log[-1] except (IndexError, TypeError, AttributeError): last_log = score_log_entry experiment_report = [] for metric_name in metric_names: run_delta = score_log_entry[metric_name] - last_log[metric_name] experiment_report.append((metric_name, run_delta)) print("\nExperiment Report") print("-----------------") print(tabulate(experiment_report, headers=["Metric", "Score"])) # Append the existing average scores to the log score_log.append(score_log_entry) # Persist the log for next run pickle.dump(score_log, open(score_log_filename, "wb")) # # Analyze and report feature importance changes # # Compute averages for each feature feature_importance_entry = defaultdict(float) for feature_name, value_list in feature_importances.items(): average_importance = sum(value_list) / len(value_list) feature_importance_entry[feature_name] = average_importance # Sort the feature importances in descending order and print import operator sorted_feature_importances = sorted(feature_importance_entry.items(), key=operator.itemgetter(1), reverse=True) print("\nFeature Importances") print("-------------------") print(tabulate(sorted_feature_importances, headers=['Name', 'Importance'])) # # Compare this run's feature importances with the previous run's # # Load the feature importance log or initialize an empty one try: feature_log_filename = "{}/models/feature_log.pickle".format(base_path) feature_log = pickle.load(open(feature_log_filename, "rb")) if not isinstance(feature_log, list): feature_log = [] except IOError: feature_log = [] # Compute and display the change in score for each feature try: last_feature_log = feature_log[-1] except (IndexError, TypeError, AttributeError): last_feature_log = defaultdict(float) for feature_name, importance in feature_importance_entry.items(): last_feature_log[feature_name] = importance # Compute the deltas feature_deltas = {} for feature_name in feature_importances.keys(): run_delta = feature_importance_entry[feature_name] - last_feature_log[ feature_name] feature_deltas[feature_name] = run_delta # Sort feature deltas, biggest change first import operator sorted_feature_deltas = sorted(feature_deltas.items(), key=operator.itemgetter(1), reverse=True) # Display sorted feature deltas print("\nFeature Importance Delta Report") print("-------------------------------") print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"])) # Append the existing average deltas to the log feature_log.append(feature_importance_entry) # Persist the log for next run pickle.dump(feature_log, open(feature_log_filename, "wb"))
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler().setInputCols(["int1", "int2", "int3"])
va.transform(fakeIntDF).show()

# COMMAND ----------

contDF = spark.range(20).selectExpr("cast(id as double)")

# COMMAND ----------

from pyspark.ml.feature import Bucketizer
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer().setSplits(bucketBorders).setInputCol("id")
bucketer.transform(contDF).show()

# COMMAND ----------

from pyspark.ml.feature import QuantileDiscretizer
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()

# COMMAND ----------

from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()
def compute_hist(psdf, bins): # 'data' is a Spark DataFrame that selects one column. assert isinstance(bins, (np.ndarray, np.generic)) sdf = psdf._internal.spark_frame scols = [] input_column_names = [] for label in psdf._internal.column_labels: input_column_name = name_like_string(label) input_column_names.append(input_column_name) scols.append( psdf._internal.spark_column_for(label).alias( input_column_name)) sdf = sdf.select(*scols) # 1. Make the bucket output flat to: # +----------+-------+ # |__group_id|buckets| # +----------+-------+ # |0 |0.0 | # |0 |0.0 | # |0 |1.0 | # |0 |2.0 | # |0 |3.0 | # |0 |3.0 | # |1 |0.0 | # |1 |1.0 | # |1 |1.0 | # |1 |2.0 | # |1 |1.0 | # |1 |0.0 | # +----------+-------+ colnames = sdf.columns bucket_names = ["__{}_bucket".format(colname) for colname in colnames] output_df = None for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)): # creates a Bucketizer to get corresponding bin of each value bucketizer = Bucketizer(splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip") bucket_df = bucketizer.transform(sdf) if output_df is None: output_df = bucket_df.select( F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")) else: output_df = output_df.union( bucket_df.select( F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket"))) # 2. Calculate the count based on each group and bucket. # +----------+-------+------+ # |__group_id|buckets| count| # +----------+-------+------+ # |0 |0.0 |2 | # |0 |1.0 |1 | # |0 |2.0 |1 | # |0 |3.0 |2 | # |1 |0.0 |2 | # |1 |1.0 |3 | # |1 |2.0 |1 | # +----------+-------+------+ result = (output_df.groupby("__group_id", "__bucket").agg( F.count("*").alias("count")).toPandas().sort_values( by=["__group_id", "__bucket"])) # 3. Fill empty bins and calculate based on each group id. From: # +----------+--------+------+ # |__group_id|__bucket| count| # +----------+--------+------+ # |0 |0.0 |2 | # |0 |1.0 |1 | # |0 |2.0 |1 | # |0 |3.0 |2 | # +----------+--------+------+ # +----------+--------+------+ # |__group_id|__bucket| count| # +----------+--------+------+ # |1 |0.0 |2 | # |1 |1.0 |3 | # |1 |2.0 |1 | # +----------+--------+------+ # # to: # +-----------------+ # |__values1__bucket| # +-----------------+ # |2 | # |1 | # |1 | # |2 | # |0 | # +-----------------+ # +-----------------+ # |__values2__bucket| # +-----------------+ # |2 | # |3 | # |1 | # |0 | # |0 | # +-----------------+ output_series = [] for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)): current_bucket_result = result[result["__group_id"] == i] # generates a pandas DF with one row for each bin # we need this as some of the bins may be empty indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)}) # merges the bins with counts on it and fills remaining ones with zeros pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[["count"]] pdf.columns = [input_column_name] output_series.append(pdf[input_column_name]) return output_series
def main(base_path): # Default to "." try: base_path except NameError: base_path = "." if not base_path: base_path = "." APP_NAME = "train_spark_mllib_model.py" # If there is no SparkSession, create the environment try: sc and spark except (NameError, UnboundLocalError) as e: import findspark findspark.init() import pyspark import pyspark.sql sc = pyspark.SparkContext() spark = pyspark.sql.SparkSession(sc).builder.appName( APP_NAME).getOrCreate() # # { # "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00", # "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0, # "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS" # } # from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField from pyspark.sql.functions import udf schema = StructType([ StructField("ArrDelay", DoubleType(), True), # "ArrDelay":5.0 StructField("CRSArrTime", TimestampType(), True), # "CRSArrTime":"2015-12-31T03:20:00.000-08:00" StructField("CRSDepTime", TimestampType(), True), # "CRSDepTime":"2015-12-31T03:05:00.000-08:00" StructField("Carrier", StringType(), True), # "Carrier":"WN" StructField("DayOfMonth", IntegerType(), True), # "DayOfMonth":31 StructField("DayOfWeek", IntegerType(), True), # "DayOfWeek":4 StructField("DayOfYear", IntegerType(), True), # "DayOfYear":365 StructField("DepDelay", DoubleType(), True), # "DepDelay":14.0 StructField("Dest", StringType(), True), # "Dest":"SAN" StructField("Distance", DoubleType(), True), # "Distance":368.0 StructField("FlightDate", DateType(), True), # "FlightDate":"2015-12-30T16:00:00.000-08:00" StructField("FlightNum", StringType(), True), # "FlightNum":"6109" StructField("Origin", StringType(), True), # "Origin":"TUS" ]) input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format( base_path) features = spark.read.json(input_path, schema=schema) features.first() # # Check for nulls in features before using Spark ML # null_counts = [(column, features.where(features[column].isNull()).count()) for column in features.columns] cols_with_nulls = filter(lambda x: x[1] > 0, null_counts) print(list(cols_with_nulls)) # # Add a Route variable to replace FlightNum # from pyspark.sql.functions import lit, concat features_with_route = features.withColumn( 'Route', concat(features.Origin, lit('-'), features.Dest)) features_with_route.show(6) # # Use pysmark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2) # from pyspark.ml.feature import Bucketizer # Setup the Bucketizer splits = [-float("inf"), -15.0, 0, 30.0, float("inf")] arrival_bucketizer = Bucketizer(splits=splits, inputCol="ArrDelay", outputCol="ArrDelayBucket") # Save the bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path) # Apply the bucketizer ml_bucketized_features = arrival_bucketizer.transform(features_with_route) ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show() # # Extract features tools in with pyspark.ml.feature # from pyspark.ml.feature import StringIndexer, VectorAssembler # Turn category fields into indexes for column in [ "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "Route" ]: string_indexer = StringIndexer(inputCol=column, outputCol=column + "_index") string_indexer_model = 
string_indexer.fit(ml_bucketized_features) ml_bucketized_features = string_indexer_model.transform( ml_bucketized_features) # Drop the original column ml_bucketized_features = ml_bucketized_features.drop(column) # Save the pipeline model string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column) string_indexer_model.write().overwrite().save( string_indexer_output_path) # Combine continuous, numeric fields with indexes of nominal ones # ...into one feature vector numeric_columns = ["DepDelay", "Distance"] index_columns = [ "Carrier_index", "DayOfMonth_index", "DayOfWeek_index", "DayOfYear_index", "Origin_index", "Origin_index", "Dest_index", "Route_index" ] vector_assembler = VectorAssembler(inputCols=numeric_columns + index_columns, outputCol="Features_vec") final_vectorized_features = vector_assembler.transform( ml_bucketized_features) # Save the numeric vector assembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format( base_path) vector_assembler.write().overwrite().save(vector_assembler_path) # Drop the index columns for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # Instantiate and fit random forest classifier on all the data from pyspark.ml.classification import RandomForestClassifier rfc = RandomForestClassifier(featuresCol="Features_vec", labelCol="ArrDelayBucket", predictionCol="Prediction", maxBins=4657, maxMemoryInMB=1024) model = rfc.fit(final_vectorized_features) # Save the new model over the old one model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path) model.write().overwrite().save(model_output_path) # Evaluate model using test data predictions = model.transform(final_vectorized_features) from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction", labelCol="ArrDelayBucket", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Accuracy = {}".format(accuracy)) # Check the distribution of predictions predictions.groupBy("Prediction").count().show() # Check a sample predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
    dummy_function_udf(features['ArrDelay'])
)
manual_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

#
# Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay
#
from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
)
ml_bucketized_features = bucketizer.transform(features_with_route)

# Check the buckets out
ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

#
# Feature extraction tools in pyspark.ml.feature
#
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Turn category fields into categoric feature vectors, then drop intermediate fields
for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
               "Origin", "Dest", "Route"]:
    string_indexer = StringIndexer(
        inputCol=column,
        outputCol=column + "_index"
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="BucketizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
    dataFrame = sqlContext.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    bucketedData.show()
    # $example off$

    sc.stop()
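# Expected bucket indices for the sample data above (a sketch, not output captured
# from a real run): -0.5 -> 1.0, -0.3 -> 1.0, 0.0 -> 2.0, 0.2 -> 2.0, since each
# value falls into the half-open bucket [splits[i], splits[i+1]).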