def discrete(self):
        # Bucketizer
        from pyspark.ml.feature import Bucketizer

        splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

        data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
        dataFrame = self.session.createDataFrame(data, ["features"])

        bucketizer = Bucketizer(splits=splits,
                                inputCol="features",
                                outputCol="bucketedFeatures")

        # Transform original data into its bucket index.
        bucketedData = bucketizer.transform(dataFrame)

        print("Bucketizer output with %d buckets" %
              (len(bucketizer.getSplits()) - 1))
        bucketedData.show()

        # QuantileDiscretizer
        from pyspark.ml.feature import QuantileDiscretizer

        data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
        df = self.session.createDataFrame(data, ["id", "hour"])

        discretizer = QuantileDiscretizer(numBuckets=3,
                                          inputCol="hour",
                                          outputCol="result")

        result = discretizer.fit(df).transform(df)
        result.show()
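For reference, a minimal self-contained sketch (independent of the class above) of the Bucketizer convention these examples rely on: each value falls in the bucket [splits[i], splits[i+1]), only the last bucket also includes its upper bound, and NaN handling is governed by handleInvalid. The app name and column names here are made up for illustration.

from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.appName("BucketizerConventionSketch").getOrCreate()

edge_df = spark.createDataFrame([(-0.5,), (0.0,), (0.5,)], ["features"])
edge_bucketizer = Bucketizer(
    splits=[-float("inf"), -0.5, 0.0, 0.5, float("inf")],
    inputCol="features",
    outputCol="bucket",
    handleInvalid="keep",  # NaN rows would land in an extra bucket instead of raising
)
edge_bucketizer.transform(edge_df).show()
# Each boundary value lands in the bucket it opens: -0.5 -> 1.0, 0.0 -> 2.0, 0.5 -> 3.0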
Example #2
    def _compute_hist(sdf, bins):
        # 'sdf' is a Spark DataFrame that selects one column.
        assert isinstance(bins, (np.ndarray, np.generic))

        colname = sdf.columns[-1]

        bucket_name = "__{}_bucket".format(colname)
        # creates a Bucketizer to get corresponding bin of each value
        bucketizer = Bucketizer(
            splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
        )
        # after bucketing values, groups and counts them
        result = (
            bucketizer.transform(sdf)
            .select(bucket_name)
            .groupby(bucket_name)
            .agg(F.count("*").alias("count"))
            .toPandas()
            .sort_values(by=bucket_name)
        )

        # generates a pandas DF with one row for each bin
        # we need this as some of the bins may be empty
        indexes = pd.DataFrame({bucket_name: np.arange(0, len(bins) - 1), "bucket": bins[:-1]})
        # merges the bins with counts on it and fills remaining ones with zeros
        pdf = indexes.merge(result, how="left", on=[bucket_name]).fillna(0)[["count"]]
        pdf.columns = [bucket_name]

        return pdf[bucket_name]
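A hypothetical usage sketch for the helper above, treating it as a plain function and assuming an active SparkSession named spark plus the module's usual imports (numpy as np, pandas as pd, pyspark.sql.functions as F, Bucketizer); the bin edges come from np.linspace, as in the surrounding plotting code.

import numpy as np

sdf = spark.createDataFrame([(x,) for x in [1.0, 2.5, 3.0, 7.5, 9.0]], ["value"])
bins = np.linspace(0.0, 10.0, num=6)   # five equal-width bins over [0, 10]
counts = _compute_hist(sdf, bins)      # pandas Series with one count per bin
print(counts.tolist())                 # should be roughly [1, 2, 0, 1, 1]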
def create_buckets(percentage_of_missing_ctus_per_partyid):
    """
    Divide party ids by percentage of missing ctus into a list of 5 buckets
    > 0   < 0.25
    > 0.25 < 0.5
    > 0.5 < 0.75
    > 0.75 < 0.99
    > 0.99
    Output:
    +--------+-----------------------+-------+
    |party_id|percentage_missing_ctus|buckets|
    +--------+-----------------------+-------+
    |       1|                    0.2|    0.0|
    |       2|                   0.33|    1.0|
    |       3|                    1.0|    4.0|
    |       4|                   0.75|    3.0|
    |       5|                    0.6|    2.0|
    |       6|                    0.6|    2.0|
    +--------+-----------------------+-------+
    """

    bucketizer = Bucketizer(splits=[0, 0.25, 0.5, 0.75, 0.99, float('Inf')],
                            inputCol="percentage_missing_ctus", outputCol="buckets")
    df_of_buckets_ratio_between_imputed_distinct_ctus = (
        bucketizer.setHandleInvalid("keep")
        .transform(percentage_of_missing_ctus_per_partyid)
    )
    return df_of_buckets_ratio_between_imputed_distinct_ctus
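A small usage sketch for create_buckets, assuming an active SparkSession named spark; the rows mirror the docstring above.

df = spark.createDataFrame(
    [(1, 0.2), (2, 0.33), (3, 1.0), (4, 0.75), (5, 0.6), (6, 0.6)],
    ["party_id", "percentage_missing_ctus"],
)
create_buckets(df).show()  # the buckets column should match the table in the docstring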
    def get_binned_stat(self, df, colname, col_stat, n_split=10):

        splits = CommonUtils.frange(col_stat["min"],
                                    col_stat["max"],
                                    num_steps=n_split)
        splits = sorted(splits)
        splits_range = [(splits[idx], splits[idx + 1])
                        for idx in range(len(splits) - 1)]

        splits_data = {"splits": splits, "splits_range": splits_range}
        splits = splits_data["splits"]
        double_df = df.withColumn(colname, df[colname].cast(DoubleType()))
        bucketizer = Bucketizer(inputCol=colname, outputCol="BINNED_INDEX")
        bucketizer.setSplits(splits)
        binned_df = bucketizer.transform(double_df)
        histogram_df = binned_df.groupBy("BINNED_INDEX").count().toPandas()
        str_splits_range = [
            " to ".join([str(x[0]), str(x[1])]) for x in splits_range
        ]
        bin_name_dict = dict(zip(range(len(splits_range)), str_splits_range))
        bin_name_dict[n_split] = "null"
        histogram_df["orderIndex"] = histogram_df["BINNED_INDEX"].apply(
            lambda x: n_split if pd.isnull(x) else x)
        histogram_df["bins"] = histogram_df["orderIndex"].apply(
            lambda x: bin_name_dict[int(x)])
        relevant_df = histogram_df[["bins", "count", "orderIndex"]]
        histogram_dict = relevant_df.T.to_dict().values()
        histogram_dict = sorted(histogram_dict, key=lambda x: x["orderIndex"])
        output = []
        for val in histogram_dict:
            output.append({"name": val["bins"], "value": val["count"]})
        return output
Example #5
    def test_bucketizer(self):
        values = [(0.1, ), (0.4, ), (1.2, ), (1.5, )]
        data = self.spark.createDataFrame(values, ["features"])
        model = Bucketizer(splits=[-float("inf"), 0.5, 1.4,
                                   float("inf")],
                           inputCol="features",
                           outputCol="buckets")

        feature_count = len(data.select('features').first())
        model_onnx = convert_sparkml(
            model, 'Sparkml Bucketizer',
            [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.setHandleInvalid("error").transform(data)
        expected = predicted.select("buckets").toPandas().values.astype(
            numpy.float32)
        data_np = [data.toPandas().values.astype(numpy.float32)]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBucketizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['buckets'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #6
File: plot.py  Project: mercileesb/koalas
    def calc_histogram(self, bins):
        bucket_name = '__{}_bucket'.format(self.colname)
        # creates a Bucketizer to get corresponding bin of each value
        bucketizer = Bucketizer(splits=bins,
                                inputCol=self.colname,
                                outputCol=bucket_name,
                                handleInvalid="skip")
        # after bucketing values, groups and counts them
        result = (bucketizer.transform(
            self.data._kdf._sdf).select(bucket_name).groupby(bucket_name).agg(
                F.count('*').alias('count')).toPandas().sort_values(
                    by=bucket_name))

        # generates a pandas DF with one row for each bin
        # we need this as some of the bins may be empty
        indexes = pd.DataFrame({
            bucket_name: np.arange(0,
                                   len(bins) - 1),
            'bucket': bins[:-1]
        })
        # merges the bins with counts on it and fills remaining ones with zeros
        data = indexes.merge(result, how='left',
                             on=[bucket_name]).fillna(0)[['count']]
        data.columns = [bucket_name]

        return data
    def get_column_hist(self, column, bins):
        """return a list of counts corresponding to bins"""
        bins = list(copy.deepcopy(bins))  # take a copy since we are inserting and popping
        if bins[0] == -np.inf or bins[0] == -float("inf"):
            added_min = False
            bins[0] = -float("inf")
        else:
            added_min = True
            bins.insert(0, -float("inf"))

        if bins[-1] == np.inf or bins[-1] == float("inf"):
            added_max = False
            bins[-1] = float("inf")
        else:
            added_max = True
            bins.append(float("inf"))

        temp_column = self.spark_df.select(column).where(col(column).isNotNull())
        bucketizer = Bucketizer(
            splits=bins, inputCol=column, outputCol="buckets")
        bucketed = bucketizer.setHandleInvalid("skip").transform(temp_column)

        # This is painful to do, but: bucketizer cannot handle values outside of a range
        # (hence adding -/+ infinity above)

        # Further, it *always* follows the numpy convention of lower_bound <= bin < upper_bound
        # for all but the last bin

        # But, since the last bin in our case will often be +infinity, we need to
        # find the number of values exactly equal to the upper bound to add those

        # We'll try for an optimization by asking for it at the same time
        if added_max:
            upper_bound_count = temp_column.select(column).filter(col(column) == bins[-2]).count()
        else:
            upper_bound_count = 0

        hist_rows = bucketed.groupBy("buckets").count().collect()
        # Spark only returns buckets that have nonzero counts.
        hist = [0] * (len(bins) - 1)
        for row in hist_rows:
            hist[int(row["buckets"])] = row["count"]

        hist[-2] += upper_bound_count

        if added_min:
            below_bins = hist.pop(0)
            bins.pop(0)
            if below_bins > 0:
                logger.warning("Discarding histogram values below lowest bin.")

        if added_max:
            above_bins = hist.pop(-1)
            bins.pop(-1)
            if above_bins > 0:
                logger.warning("Discarding histogram values above highest bin.")

        return hist
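The comments above lean on Bucketizer's bucket convention; a minimal standalone sketch of it (assuming an active SparkSession named spark): every bucket is half-open [lower, upper) except the last, which also includes its upper bound.

from pyspark.ml.feature import Bucketizer

edge_df = spark.createDataFrame([(1.0,), (2.0,), (3.0,)], ["x"])
edge_b = Bucketizer(splits=[1.0, 2.0, 3.0], inputCol="x", outputCol="bucket")
edge_b.transform(edge_df).show()
# 1.0 -> 0.0 and 2.0 -> 1.0 as usual, but 3.0 -> 1.0 too, because the last
# bucket [2.0, 3.0] is closed on the right.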
Example #8
def model_train(zipcode, complaint, day):
    print("Loading Data ...")
    data311 = spark.read.format("csv").option("header",
                                              "true").load("Data_Final/*.csv")
    infer_schema = "true"
    first_row_is_header = "true"
    delimiter = ","
    data311.registerTempTable("data311")
    data311 = data311.withColumn("ResTimeH",
                                 data311.Resolution_Time_Hours.cast('int'))
    data311 = data311.withColumn('day_of_week',
                                 dayofweek(data311['Created Date']))
    data311 = data311.withColumn("Zip", data311["Incident Zip"].cast('int'))
    data311 = data311.filter(data311.ResTimeH > 0)
    data311 = data311.filter(data311.ResTimeH < 99)
    bucketizer = Bucketizer(splits=[0, 2, 6, float('Inf')],
                            inputCol="ResTimeH",
                            outputCol="categories")
    data311 = bucketizer.setHandleInvalid("keep").transform(data311)
    X = data311['Zip', 'Complaint_Type_Groups', 'day_of_week', 'categories']
    X = X.filter(X["Zip"].isNotNull())
    X = X.filter(X["Complaint_Type_Groups"].isNotNull())
    X = X.filter(X["day_of_week"].isNotNull())

    stage_1 = StringIndexer(inputCol="Complaint_Type_Groups",
                            outputCol="categoryIndex")
    stage_2 = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                     outputCols=["categoryVec"])
    stage_3 = VectorAssembler(inputCols=['Zip', 'day_of_week', 'categoryVec'],
                              outputCol="features")
    stage_4 = StandardScaler().setInputCol("features").setOutputCol(
        "Scaled_ip_features")
    stage_5 = LogisticRegression(labelCol="categories",
                                 featuresCol="Scaled_ip_features")
    # setup the pipeline
    pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4, stage_5])
    # fit the pipeline model and transform the data as defined
    pipeline_model = pipeline.fit(X)

    zipcode = int(zipcode)
    day = int(day)
    input_variables = pd.DataFrame(
        [[zipcode, complaint, day]],
        columns=['Zip', 'Complaint_Type_Groups', 'day_of_week'])
    input_variables = spark.createDataFrame(input_variables)

    transformed = pipeline_model.transform(input_variables)
    ans = transformed.select(collect_list('prediction')).first()[0]

    if (ans[0] == 0.0):
        prediction = "Your complaint will be resolved within 2 hours."
    elif (ans[0] == 1.0):
        prediction = "Your complaint will be resolved within 2-6 hours."
    else:
        prediction = "Your complaint will be resolved after 6 hours"
    return prediction
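The labelling step above (resolution hours mapped to three delay classes) can be reproduced on its own; a hedged sketch assuming an active SparkSession named spark, with made-up resolution times.

from pyspark.ml.feature import Bucketizer

res_df = spark.createDataFrame([(1,), (4,), (12,)], ["ResTimeH"])
labeler = Bucketizer(splits=[0, 2, 6, float('Inf')],
                     inputCol="ResTimeH", outputCol="categories")
labeler.setHandleInvalid("keep").transform(res_df).show()
# 1 -> 0.0 (under 2 hours), 4 -> 1.0 (2-6 hours), 12 -> 2.0 (over 6 hours)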
Example #9
def transform_spark(data, columns, args, transformed_column_name):
    from pyspark.ml.feature import Bucketizer
    import pyspark.sql.functions as F

    new_b = Bucketizer(
        splits=args["bucket_boundaries"], inputCol=columns["num"], outputCol=transformed_column_name
    )
    return new_b.transform(data).withColumn(
        transformed_column_name, F.col(transformed_column_name).cast("int")
    )
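A hedged usage sketch for transform_spark, assuming an active SparkSession named spark; the "num" and "bucket_boundaries" keys come from the function above, while the column name and boundaries are invented for illustration.

df = spark.createDataFrame([(3.0,), (12.0,), (25.0,)], ["hour"])
out = transform_spark(
    data=df,
    columns={"num": "hour"},
    args={"bucket_boundaries": [0.0, 6.0, 18.0, 24.0, float("inf")]},
    transformed_column_name="hour_bucket",
)
out.show()  # hour_bucket is cast to int: 3.0 -> 0, 12.0 -> 1, 25.0 -> 3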
Example #10
def get_binned_dataframe(df, bin_name, variable_name, edges):
    '''
    Produces a dataframe with a new column `bin_name` corresponding
    to the variable `variable_name` binned with the given `edges`.
    '''
    splits = [-float('inf')]+list(edges)+[float('inf')]
    bucketizer = Bucketizer(
        splits=splits, inputCol=variable_name, outputCol=bin_name)
    binnedDF = bucketizer.transform(df)
    return binnedDF
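A minimal usage sketch for get_binned_dataframe, assuming an active SparkSession named spark and a made-up column.

df = spark.createDataFrame([(0.2,), (1.7,), (3.4,)], ["pt"])
binned = get_binned_dataframe(df, bin_name="pt_bin", variable_name="pt",
                              edges=[1.0, 2.0, 3.0])
binned.show()
# 0.2 -> 0.0, 1.7 -> 1.0, 3.4 -> 3.0; the helper pads the edges with -inf/+inf,
# so finite out-of-range values never trigger Bucketizer's bounds error.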
Example #11
 def buckert(self, df, column):
     """
     Bucketize values at the given boundaries (Bucketizer).
     """
     splits = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
     # Discretize by bucketing at the given boundaries
     bucketizer = Bucketizer(splits=splits,
                             inputCol=column,
                             outputCol=column + '_bucketed')  # splits defines the bucket boundaries
     bucketedData = bucketizer.transform(df)
     print('Bucketizer output with %d buckets' %
           (len(bucketizer.getSplits()) - 1))
     return bucketedData
Example #12
 def test_save_and_load_on_nested_list_params(self):
     temp_path = tempfile.mkdtemp()
     splitsArray = [
         [-float("inf"), 0.5, 1.4, float("inf")],
         [-float("inf"), 0.1, 1.2, float("inf")],
     ]
     bucketizer = Bucketizer(splitsArray=splitsArray,
                             inputCols=["values", "values"],
                             outputCols=["b1", "b2"])
     savePath = temp_path + "/bk"
     bucketizer.write().overwrite().save(savePath)
     loadedBucketizer = Bucketizer.load(savePath)
     assert loadedBucketizer.getSplitsArray() == splitsArray
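Beyond the save/load round trip, the same multi-column configuration can be applied in one pass; a hedged sketch assuming an active SparkSession named spark and two made-up input columns.

from pyspark.ml.feature import Bucketizer

splitsArray = [
    [-float("inf"), 0.5, 1.4, float("inf")],
    [-float("inf"), 0.1, 1.2, float("inf")],
]
df = spark.createDataFrame([(0.3, 0.3), (1.3, 1.3)], ["v1", "v2"])
multi = Bucketizer(splitsArray=splitsArray,
                   inputCols=["v1", "v2"],
                   outputCols=["b1", "b2"])
multi.transform(df).show()  # b1: 0.0, 1.0   b2: 1.0, 2.0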
Example #13
def bucketizer_splits(dataFrame,
                      inputCol,
                      splits=[-float('inf'), -0.5, 0.0, 0.5,
                              float('inf')]):
    # Discretize by bucketing at the given boundaries
    bucketizer = Bucketizer(splits=splits,
                            inputCol=inputCol,
                            outputCol='%s_bucketizer' %
                            (inputCol))  # splits defines the bucket boundaries
    bucketedData = bucketizer.transform(dataFrame)
    print('Bucketizer output with %d buckets' %
          (len(bucketizer.getSplits()) - 1))
    return bucketedData
def pre_processing(dataFrame):

    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    bucketizer = Bucketizer(splits=splits,
                            inputCol="features",
                            outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)

    print("Bucketizer output with %d buckets" %
          (len(bucketizer.getSplits()) - 1))
    bucketedData.show()
Example #15
def add_age_id(spark, df, logger):
    """Calculate the age_id by splitting the visitor age into buckets"""
    agebucketizer = Bucketizer(splits=[ float('-Inf'), 0, 2, 11, 16, 21,
                                        26, 36, 46, 56, 66, float('Inf') ],
                                inputCol="i94bir",
                                outputCol="agebuckets")
    agebuck_df = agebucketizer.setHandleInvalid("keep").transform(df)
    age_id_df = agebuck_df.withColumn("age_id", when(col("i94bir") == -1, 999)\
                                                .otherwise(col("agebuckets")
                                                .cast(IntegerType()))
                                    )
    logger.info("Added age_id")
    age_id_df.persist()
    return age_id_df
Example #16
def main_emm_recode_demos(emm_raw_sdf):

    recode_demo_pipeline = Pipeline(stages=[
        Bucketizer(splits=[0, 2, 6, 12, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age',
                   outputCol="age1"),
        Bucketizer(splits=[0, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age',
                   outputCol="age7"),
        Bucketizer(splits=[0, 12, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age',
                   outputCol="age8"),
        #        Bucketizer(splits=[-25, 0, 25., 50., 75., 100., float('Inf')], inputCol='income_amt', outputCol="income1"),
        #        Bucketizer(splits=[-25, 0, 25., 35., 50., 75., 100., float('Inf')], inputCol='income_amt', outputCol="income9"),
        IfElseTransformer(
            vals=[83], inputCol='hispanicid', outputCol='hispanic'),
        IfElseTransformer(
            vals=['M'], inputCol='gender_char', outputCol='gender'),
        IfElseTransformer(vals=[86], inputCol='raceid', outputCol='race_back'),
        IfElseTransformer(vals=[88], inputCol='raceid',
                          outputCol='race_asian'),
        YesNoTransformer(inputCol='dvr_flag', outputCol='dvr'),
        YesNoTransformer(inputCol='cable_plus_flag', outputCol='cableplus'),
        YesNoTransformer(inputCol='video_game_owner_flag',
                         outputCol='video_game'),
        YesNoTransformer(inputCol='internet_access_flag',
                         outputCol='internet'),
        YesNoTransformer(inputCol='pay_cable_flag', outputCol='paycable'),
        YesNoTransformer(
            inputCol='television_high_definition_display_capability_flag',
            outputCol='hdtv'),
        YesNoTransformer(inputCol='alternative_delivery_flag',
                         outputCol='satellite'),
        IsInTransformer(isin_bins=[[0, 1], [2], [3, 4, 5, 6, 7], [8]],
                        inputCol='nielsen_occupation_code',
                        outputCol='occupation1'),
        IsInTransformer(isin_bins=[[0, 8, 9, 10, 11, 12], [13, 14, 15], [16],
                                   [18, 19, 20]],
                        inputCol='education_level_number',
                        outputCol='education7'),
        IsInTransformer(isin_bins=[[16, 18, 19, 20],
                                   [0, 8, 9, 10, 11, 12, 13, 14, 15]],
                        inputCol='education_level_number',
                        outputCol='education2'),
        IsInTransformer(isin_bins=[['A'], ['B'], ['C'], ['D']],
                        inputCol='county_size_code',
                        outputCol='county_size')
    ])

    return None
Example #17
 def _bucketize_age_column(
         self, dataframe: DataFrame, input_col: str,
         output_col: str) -> Tuple[DataFrame, int, List[str]]:
     bucketizer = Bucketizer(splits=self.age_groups,
                             inputCol=input_col,
                             outputCol=output_col)
     output = bucketizer.setHandleInvalid("keep").transform(dataframe)
     splits = [s for s in bucketizer.getSplits()]
     mapping = [
         "[{}, {})".format(splits[i], splits[i + 1])
         for i in range(len(splits) - 1)
     ]
     n_age_groups = len(mapping)
     return output, n_age_groups, mapping
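The label-mapping idea above can be reproduced standalone; a hedged sketch with made-up age groups, assuming an active SparkSession named spark.

from pyspark.ml.feature import Bucketizer

age_groups = [0.0, 18.0, 65.0, float("inf")]
people = spark.createDataFrame([(12.0,), (40.0,), (80.0,)], ["age"])
b = Bucketizer(splits=age_groups, inputCol="age", outputCol="age_bucket")
b.setHandleInvalid("keep").transform(people).show()
labels = ["[{}, {})".format(age_groups[i], age_groups[i + 1])
          for i in range(len(age_groups) - 1)]
print(labels)  # ['[0.0, 18.0)', '[18.0, 65.0)', '[65.0, inf)']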
Example #18
def add_duration_id(spark, df, logger):
    """Calculate the visitduration_id by splitting the visit duration into buckets"""
    durdays_df = df.withColumn("duration_days", datediff("depdate", "arrdate"))
    ddbucketizer = Bucketizer(splits=[float('-Inf'), 0, 4, 8, 11, 15, 22,
                                      29, float('Inf')],
                              inputCol="duration_days",
                              outputCol="ddbuckets")
    ddbuck_df = ddbucketizer.setHandleInvalid("keep").transform(durdays_df)
    dur_id_df = ddbuck_df.withColumn("visitduration_id",
                                   when(isnull(col("arrdate")) |
                                        isnull(col("depdate")), 999)\
                                   .otherwise(col("ddbuckets").cast(IntegerType()))
                                 )
    logger.info("Added duration_id")
    return dur_id_df
Example #19
    def _transform_data(self, data):
        data_handling = self.data_settings.get('data_handling', {})

        # interactions
        if data_handling.get('interactions', False):
            columns_list = list(data.columns)
            columns_list.remove(self.model_settings['variable_to_predict'])
            for col1 in columns_list:
                for col2 in columns_list:
                    if col1 != col2:
                        name = str(col1) + '_' + str(col2)
                        reverse_name = str(col2) + '_' + str(col1)
                        if reverse_name not in list(data.columns):
                            data = data.withColumn(name, (F.col(col1) + 1) *
                                                   (F.col(col2) + 1))

        # binning
        for feature_to_bin in data_handling.get("features_to_bin", []):
            min_val = data.agg({feature_to_bin['name']: "min"}).collect()[0][0]
            max_val = data.agg({feature_to_bin['name']: "max"}).collect()[0][0]
            full_bins = [min_val - 1] + feature_to_bin['bins'] + [max_val + 1]

            bucketizer = Bucketizer(splits=full_bins,
                                    inputCol=feature_to_bin['name'],
                                    outputCol=feature_to_bin['name'] +
                                    '_binned')

            data = bucketizer.transform(data)

        # transformation
        for col in data_handling.get("features_handling", {}).keys():
            transformation_array = data_handling["features_handling"][col].get(
                "transformation", [])
            # applying transformations
            for feature_transformation_method in transformation_array:
                data = data.withColumn(
                    col + '_' + feature_transformation_method,
                    eval('F.' + feature_transformation_method)(col))

        # dropping features
        features_to_remove = data_handling.get('features_to_remove', [])
        if len(features_to_remove) > 0:
            data = data.drop(*[
                feature for feature in features_to_remove
                if feature in data.columns
            ])
        return data
 def bucketize(self, df, field):
     df = df.withColumn(field, df[field].cast("double"))
     max = df.agg({field: "max"}).collect()[0][0]
     min = df.agg({field: "min"}).collect()[0][0]
     stddev = df.agg({field: "stddev"}).collect()[0][0]
     number_of_buckets = 1
     if stddev != 0:
         number_of_buckets = ((max - min) // (stddev))
      buckets = np.arange(number_of_buckets, dtype=float).tolist()
     buckets = [-float('inf')] + buckets + [float('inf')]
     bucketizer = Bucketizer(splits=buckets,
                             inputCol=field,
                             outputCol=field + '_bucketized')
     print("Bucketizing column: ", field)
     bucketized_features = bucketizer.transform(df)
     return bucketized_features
 def generateGroupedMeasureDataDict(self, measure_column):
     splits_data = self.get_measure_column_splits(self._data_frame,
                                                  measure_column, 4)
     splits = splits_data["splits"]
     double_df = self._data_frame.withColumn(
         measure_column,
         self._data_frame[measure_column].cast(DoubleType()))
     bucketizer = Bucketizer(inputCol=measure_column,
                             outputCol="BINNED_INDEX")
     bucketizer.setSplits(splits)
     binned_df = bucketizer.transform(double_df)
     unique_bins = binned_df.select("BINNED_INDEX").distinct().collect()
     unique_bins = [int(x[0]) for x in unique_bins]
     binned_index_dict = dict(zip(unique_bins, splits_data["splits_range"]))
     output = {"bins": binned_index_dict, "data": binned_df}
     return output
    def OneHotEncoder(self):
        """
        Converts string-type categories to indexes, splits continuous data interval to indexes,
        encodes the categorical data using One-Hot encoding.

        """
        splits = [-float("inf"), 500, 1200, 1700, float("inf")]
        self.bucketizer = Bucketizer(
            splitsArray=[splits, splits, splits],
            inputCols=["CRSDepTime", "CRSArrTime", "DepTime"],
            outputCols=["CatCRSDepTime", "CatCRSArrTime", "CatDepTime"])

        self.varIdxer = StringIndexer(
            inputCol="OrigDest",
            outputCol="IndOrigDest").setHandleInvalid("skip")

        self.oneHot = OneHotEncoder(inputCols=[
            'Month', 'DayOfWeek', 'CatCRSDepTime', 'CatCRSArrTime',
            'IndOrigDest', 'CatDepTime'
        ],
                                    outputCols=[
                                        'HotMonth', 'HotDayOfWeek',
                                        'HotCRSCatDepTime', 'HotCRSCatArrTime',
                                        'HotIndOrigDest', 'HotDepTime'
                                    ]).setHandleInvalid("keep")
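The shared splits above appear to carve HHMM-style clock times into rough time-of-day bands; a standalone, hedged sketch of just that bucketing step, assuming an active SparkSession named spark and invented sample times.

from pyspark.ml.feature import Bucketizer

times = spark.createDataFrame([(45.0, 130.0, 30.0), (1315.0, 1450.0, 1320.0)],
                              ["CRSDepTime", "CRSArrTime", "DepTime"])
splits = [-float("inf"), 500, 1200, 1700, float("inf")]
time_bucketizer = Bucketizer(splitsArray=[splits, splits, splits],
                             inputCols=["CRSDepTime", "CRSArrTime", "DepTime"],
                             outputCols=["CatCRSDepTime", "CatCRSArrTime", "CatDepTime"])
time_bucketizer.transform(times).show()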
def transform_data(content_items):
    content_items = content_items.withColumn('receive_date',
                                             F.to_date(
                                                 F.col('time'))).drop('time')
    bucketizer = Bucketizer(splits=DAYS_FROM_EULA_BINS,
                            inputCol='days_from_eula',
                            outputCol='days_from_eula_bin',
                            handleInvalid='skip')
    content_items = bucketizer.transform(content_items) \
        .drop('days_from_eula') \
        .withColumn(
            'days_from_eula_bin',
            convert_to_char(F.col('days_from_eula_bin').astype('int') + INT_TO_CHAR_BASELINE)
        )

    print('content item data transformed')
    return content_items
Example #24
    def test_measures(self, targetDimension, testMeasure):
        chisquare_result = ChiSquareResult()
        df = self._data_frame.withColumn(
            testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
        measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
        if float(measureSummaryDict["count"]) > 10:
            maxval = float(measureSummaryDict["max"])
            minval = float(measureSummaryDict["min"])
            step = (maxval - minval) / 5.0
            splits = [
                math.floor(minval), minval + step, minval + (step * 2),
                minval + (step * 3), minval + (step * 4),
                math.ceil(maxval)
            ]
            bucketizer = Bucketizer(splits=splits,
                                    inputCol=testMeasure,
                                    outputCol="bucketedColumn")
            # bucketedData = bucketizer.transform(df)
            bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
            pivot_table = bucketedData.stat.crosstab(
                "{}".format(targetDimension), 'bucketedColumn')
        else:
            pivot_table = df.stat.crosstab("{}".format(targetDimension),
                                           testMeasure)

        rdd = list(
            chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        result = Statistics.chiSqTest(data_matrix)
        chisquare_result.set_params(result)
        freq_table = self._get_contingency_table_of_freq(pivot_table)
        freq_table.update_col2_names(splits)
        freq_table.set_tables()
        chisquare_result.set_table_result(freq_table)
        # Cramers V Calculation
        stat_value = result.statistic
        n = freq_table.get_total()
        t = min(len(freq_table.column_one_values),
                len(freq_table.column_two_values))

        v_value = math.sqrt(float(stat_value) / (n * float(t)))
        chisquare_result.set_v_value(v_value)
        chisquare_result.set_split_values([float(x) for x in splits])
        # chisquare_result.set_buckeddata(bucketedData)
        return chisquare_result
Example #25
 def bucketize(self, splits, target_col):
     self._bucket_name = 'bucket_' + target_col
     bucketizer = Bucketizer(inputCol=target_col,
                             outputCol=self._bucket_name)
     splits.sort()
     bucketizer.setSplits(splits)
     column_data_types = {
         field.name: field.dataType
         for field in self._data_frame.schema.fields
     }
      if not isinstance(column_data_types[target_col], DoubleType):
         self._data_frame = self._data_frame.select(*[
             col(target_col).cast('double').alias(target_col) if column ==
             target_col else column for column in self._data_frame.columns
         ])
     self._data_frame = bucketizer.transform(self._data_frame)
     return self._bucket_name
Example #26
    def discretize(self, test=False):
        """
        Discretize a continous feature into a discrete one
        """

        for col in list(self.config_dict.keys()):
            # check if the discretizer transformation needs to be applied
            if self.config_dict[col]["discretize"]["apply"]:
                splits = self.config_dict[col]["discretize"]["value"]
                splits = [-math.inf] + splits
                splits = splits + [math.inf]
                bucketizer = Bucketizer(splits=splits,
                                        inputCol=col,
                                        outputCol="{}_discretized".format(col))
                if test:
                    self.test_data = bucketizer.transform(self.test_data)
                else:
                    self.train_data = bucketizer.transform(self.train_data)
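A hedged sketch of the config shape the method above consumes and of the splits it builds; the column name "trip_distance" is invented for illustration.

import math

config_dict = {"trip_distance": {"discretize": {"apply": True, "value": [1.0, 5.0, 10.0]}}}
splits = [-math.inf] + config_dict["trip_distance"]["discretize"]["value"] + [math.inf]
print(splits)  # [-inf, 1.0, 5.0, 10.0, inf] -> four buckets, written to trip_distance_discretized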
Example #27
 def bin_columns(self, colsToBin):
     for bincol in colsToBin:
         if self._pandas_flag:
             try:
                 minval, maxval = float(min(
                     self._data_frame[bincol])), float(
                         max(self._data_frame[bincol]))
                 n_split = 10
                 splitsData = CommonUtils.get_splits(
                     minval, maxval, n_split)
                 self._data_frame[bincol] = pd.cut(
                     self._data_frame[bincol],
                     bins=splitsData["splits"],
                     labels=list(splitsData['bin_mapping'].values()),
                     right=True,
                     include_lowest=True)
             except Exception as e:
                 print("Binning failed for : ", bincol)
         else:
             try:
                 minval, maxval = self._data_frame.select([
                     FN.max(bincol).alias("max"),
                     FN.min(bincol).alias("min")
                 ]).collect()[0]
                 n_split = 10
                 splitsData = CommonUtils.get_splits(
                     minval, maxval, n_split)
                 splits = splitsData["splits"]
                 self._data_frame = self._data_frame.withColumn(
                     bincol, self._data_frame[bincol].cast(DoubleType()))
                 bucketizer = Bucketizer(inputCol=bincol,
                                         outputCol="BINNED_INDEX")
                 bucketizer.setSplits(splits)
                 self._data_frame = bucketizer.transform(self._data_frame)
                 mapping_expr = create_map([
                     lit(x) for x in chain(
                         *list(splitsData["bin_mapping"].items()))
                 ])
                 # self._data_frame = self._data_frame.withColumnRenamed("bincol",bincol+"JJJLLLLKJJ")
                 self._data_frame = self._data_frame.withColumn(
                     bincol, mapping_expr.getItem(col("BINNED_INDEX")))
                 self._data_frame = self._data_frame.select(self.columns)
             except Exception as e:
                 print("Binning failed for : ", bincol)
Example #28
def strat_histogram(sdf, colname, bins=10, categorical=False):
    if categorical:
        result = sdf.cols[colname]._value_counts(dropna=False, raw=True)

        if hasattr(result.index, 'levels'):
            indexes = pd.MultiIndex.from_product(
                result.index.levels[:-1] +
                [result.reset_index()[colname].unique().tolist()],
                names=result.index.names)
            result = (pd.DataFrame(index=indexes).join(
                result.to_frame(),
                how='left').fillna(0)[result.name].astype(result.dtype))

        start_values = result.index.tolist()
    else:
        bucket_name = '__{}_bucket'.format(colname)
        strata = sdf._handy.strata_colnames
        colnames = strata + ensure_list(bucket_name)

        start_values = np.linspace(
            *sdf.agg(F.min(colname),
                     F.max(colname)).rdd.map(tuple).collect()[0], bins + 1)
        bucketizer = Bucketizer(splits=start_values,
                                inputCol=colname,
                                outputCol=bucket_name,
                                handleInvalid="skip")
        result = (
            bucketizer.transform(sdf).select(colnames).groupby(colnames).agg(
                F.count('*').alias('count')).toPandas().sort_values(
                    by=colnames))

        indexes = pd.DataFrame({
            bucket_name: np.arange(0, bins),
            'bucket': start_values[:-1]
        })
        if len(strata):
            indexes = (indexes.assign(key=1).merge(
                result[strata].drop_duplicates().assign(key=1),
                on='key').drop(columns=['key']))
        result = indexes.merge(result, how='left', on=strata +
                               [bucket_name]).fillna(0)[strata +
                                                        [bucket_name, 'count']]

    return start_values, result
Example #29
 def test_list_list_float(self):
     b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]])
     self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]])
     self.assertTrue(all([type(v) == list for v in b.getSplitsArray()]))
     self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[0]]))
     self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[1]]))
     self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0]))
     self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]]))
Example #30
def calc(df, column: str, bins=50, bin_width=None):
    """
    Calculate the buckets and weights for a histogram

    Returns
    -------
        (buckets, weights): tuple of two lists
    """
    if bins is None and bin_width is None:
        raise ValueError("Must indicate bins or bin_width")
    elif bins is not None and bin_width is not None:
        raise ValueError("bins and bin_width arguments are mutually exclusive")

    # Calculate buckets
    data = df[[column]]

    int_types = (IntegerType, LongType, FloatType, DoubleType, DecimalType)
    col_type = data.schema.fields[0].dataType
    if not isinstance(col_type, int_types):
        raise ValueError(
            "hist method requires numerical or datetime columns, nothing to plot."
        )

    # Calculate buckets
    buckets = utils.spark_buckets(data, column, bins=bins, bin_width=bin_width)

    # Calculate counts based on the buckets
    bucketizer = Bucketizer(splits=buckets, inputCol=column, outputCol="bucket")
    buckets_df = bucketizer.transform(data)

    histogram = buckets_df.groupby("bucket").agg(F.count(column).alias("count"))
    histogram = histogram.orderBy("bucket", ascending=True)

    # Create weights (locally)
    hist_pd = histogram.toPandas()

    # Create a new DF with complete buckets and empty counts if needed
    full_buckets = pd.DataFrame(columns=["bucket"])
    full_buckets["bucket"] = np.arange(len(buckets))
    full_buckets = full_buckets.merge(hist_pd, on="bucket", how="left")
    weights = full_buckets["count"]

    return buckets, weights
# Add a category column via pyspark.sql.DataFrame.withColumn
manual_bucketized_features = features_with_route.withColumn(
  "ArrDelayBucket",
  dummy_function_udf(features['ArrDelay'])
)
manual_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

#
# Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay
#
from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
bucketizer = Bucketizer(
  splits=splits,
  inputCol="ArrDelay",
  outputCol="ArrDelayBucket"
)
ml_bucketized_features = bucketizer.transform(features_with_route)

# Check the buckets out
ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

#
# Extract features tools in with pyspark.ml.feature
#
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Turn category fields into categoric feature vectors, then drop intermediate fields
for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
               "Origin", "Dest", "Route"]:
def main(base_path):

  APP_NAME = "make_predictions_streaming.py"

  # Process data every 10 seconds
  PERIOD = 10
  BROKERS = 'localhost:9092'
  PREDICTION_TOPIC = 'flight_delay_classification_request'
  
  try:
    sc and ssc
  except NameError as e:
    import findspark

    # Add the streaming package and initialize
    findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
    findspark.init()
    
    import pyspark
    import pyspark.sql
    import pyspark.streaming
  
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf)
    ssc = StreamingContext(sc, PERIOD)
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load all models to be used in making predictions
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string field vectorizer pipelines into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model

  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)

  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
    base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Process Prediction Requests in Streaming
  #
  stream = KafkaUtils.createDirectStream(
    ssc,
    [PREDICTION_TOPIC],
    {
      "metadata.broker.list": BROKERS,
      "group.id": "0",
    }
  )

  object_stream = stream.map(lambda x: json.loads(x[1]))
  object_stream.pprint()
  
  row_stream = object_stream.map(
    lambda x: Row(
      FlightDate=iso8601.parse_date(x['FlightDate']),
      Origin=x['Origin'],
      Distance=x['Distance'],
      DayOfMonth=x['DayOfMonth'],
      DayOfYear=x['DayOfYear'],
      UUID=x['UUID'],
      DepDelay=x['DepDelay'],
      DayOfWeek=x['DayOfWeek'],
      FlightNum=x['FlightNum'],
      Dest=x['Dest'],
      Timestamp=iso8601.parse_date(x['Timestamp']),
      Carrier=x['Carrier']
    )
  )
  row_stream.pprint()

  #
  # Create a dataframe from the RDD-based object stream
  #

  def classify_prediction_requests(rdd):
  
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
  
    prediction_request_schema = StructType([
      StructField("Carrier", StringType(), True),
      StructField("DayOfMonth", IntegerType(), True),
      StructField("DayOfWeek", IntegerType(), True),
      StructField("DayOfYear", IntegerType(), True),
      StructField("DepDelay", DoubleType(), True),
      StructField("Dest", StringType(), True),
      StructField("Distance", DoubleType(), True),
      StructField("FlightDate", DateType(), True),
      StructField("FlightNum", StringType(), True),
      StructField("Origin", StringType(), True),
      StructField("Timestamp", TimestampType(), True),
      StructField("UUID", StringType(), True),
    ])
    
    prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema)
    prediction_requests_df.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests_df.withColumn(
      'Route',
      concat(
        prediction_requests_df.Origin,
        lit('-'),
        prediction_requests_df.Dest
      )
    )
    prediction_requests_with_route.show(6)
  
    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
      string_indexer_model = string_indexer_models[column]
      prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
  
    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
    
    # Inspect the vectors
    final_vectorized_features.show()
  
    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
      final_vectorized_features = final_vectorized_features.drop(column)
  
    # Inspect the finalized features
    final_vectorized_features.show()
  
    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)
  
    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
    # Inspect the output
    final_predictions.show()
  
    # Store to Mongo
    if final_predictions.count() > 0:
      final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
        "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
      )
  
  # Do the classification and store to Mongo
  row_stream.foreachRDD(classify_prediction_requests)
  
  ssc.start()
  ssc.awaitTermination()
def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Extract features tools in with pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate classifier: loop 5 times for 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the score to a score log that exists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute the existing score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append the existing average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append the existing average deltas to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler().setInputCols(["int1", "int2", "int3"])
va.transform(fakeIntDF).show()


# COMMAND ----------

contDF = spark.range(20).selectExpr("cast(id as double)")


# COMMAND ----------

from pyspark.ml.feature import Bucketizer
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer().setSplits(bucketBorders).setInputCol("id")
bucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import QuantileDiscretizer
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
Example #35
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="BucketizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
    dataFrame = sqlContext.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    bucketedData.show()
    # $example off$

    sc.stop()
Example #36
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BucketizerExample")\
        .getOrCreate()

    # $example on$
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
    dataFrame = spark.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)

    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
    bucketedData.show()
    # $example off$

    spark.stop()
Example #37
 def test_list_float(self):
     b = Bucketizer(splits=[1, 4])
     self.assertEqual(b.getSplits(), [1.0, 4.0])
     self.assertTrue(all([type(v) == float for v in b.getSplits()]))
     self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0]))
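A closing note on handleInvalid, which several examples above set explicitly: it only governs NaN values ('error' raises, 'skip' drops the row, 'keep' routes it to an extra bucket), while finite values outside the splits always raise, which is why many of the snippets pad their splits with -inf/+inf. A short sketch, assuming an active SparkSession named spark.

from pyspark.ml.feature import Bucketizer

df = spark.createDataFrame([(0.5,), (float("nan"),)], ["x"])
b = Bucketizer(splits=[0.0, 1.0, 2.0], inputCol="x", outputCol="bucket")
b.setHandleInvalid("skip").transform(df).show()   # the NaN row is filtered out
b.setHandleInvalid("keep").transform(df).show()   # NaN lands in an extra bucket, index 2.0
# b.setHandleInvalid("error").transform(df).show()  # would raise on the NaN row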