예제 #1
0
class AggByAmount(BaseEstimator, TransformerMixin):
    """Derive per-bin mean features keyed on a discretized 'amount' column.

    Fits a KBinsDiscretizer on 'amount', computes the mean of the selected
    columns within each amount bin, and at transform time attaches those
    per-bin means to each row (dropping all original columns).
    """
    # Inputs: bins, encode, strategy ('uniform', 'quantile', 'kmeans'), number of top features, mean/max/min
    # Top features order: ['v1', 'v4', 'v10', 'v7', 'v18', 'v11', 'v20', 'amount', 'v3', 'v16', 'v13', 'v14', 'v8', 'v9', 'v19', 'v2', 'v5', 'v12', 'v26', 'v24', 'v25', 'v27', 'v17', 'v22', 'v23', 'v6', 'v15', 'v21']
    def __init__(self, n_bins=2, strategy='quantile', columns_to_agg=None):
        self.n_bins = n_bins
        self.strategy = strategy
        # None (not a mutable list literal) as the default avoids the
        # shared-mutable-default-argument pitfall; ['v1'] is the effective default.
        self.columns_to_agg = ['v1'] if columns_to_agg is None else columns_to_agg
        self.kbins = None
        self.initial_columns = None
        self.agg_values = None

    def fit(self, X, y=None):
        """Learn bin edges for 'amount' and the per-bin means of the
        selected columns. Returns self."""
        self.kbins = KBinsDiscretizer(n_bins=self.n_bins,
                                      encode='ordinal',
                                      strategy=self.strategy)
        self.kbins.fit(X[['amount']].values)
        self.initial_columns = list(X.columns)
        # Work on a copy so fit() no longer mutates the caller's frame by
        # adding an 'amount_discretized' column to it.
        X = X.copy()
        X['amount_discretized'] = self.kbins.transform(X[['amount']].values)
        self.agg_values = X.groupby(by=['amount_discretized']).mean()
        self.agg_values = self.agg_values[self.columns_to_agg]
        self.agg_values.columns = [
            x + "_mean_given_amount" for x in self.agg_values.columns
        ]
        return self

    def transform(self, X, y=None):
        """Return only the '*_mean_given_amount' features for each row."""
        # Copy first: the previous implementation wrote a helper column
        # into the caller's DataFrame.
        X = X.copy()
        X['amount_discretized'] = self.kbins.transform(X[['amount']].values)
        X = X.merge(self.agg_values, how='left', on=['amount_discretized'])
        X.drop(self.initial_columns + ['amount_discretized'],
               axis=1,
               inplace=True)
        return X
예제 #2
0
def test_redundant_bins(strategy, expected_bin_edges):
    # Only two distinct values exist, so requesting three bins must
    # produce redundant edges and a warning.
    samples = [[0], [0], [0], [0], [3], [3]]
    discretizer = KBinsDiscretizer(n_bins=3, strategy=strategy)
    with pytest.warns(UserWarning,
                      match="Consider decreasing the number of bins."):
        discretizer.fit(samples)
    assert_array_almost_equal(discretizer.bin_edges_[0], expected_bin_edges)
예제 #3
0
class DiscretizeTransformer(object):
    """Discretize continuous columns into several bins.

    Non-continuous columns pass through unchanged; the transformation
    result is an int array.
    """
    def __init__(self, meta, n_bins):
        self.meta = meta
        # Positions of the continuous columns. Renamed the loop variable:
        # the original shadowed the builtin `id`.
        self.c_index = [
            idx for idx, info in enumerate(meta) if info['type'] == CONTINUOUS
        ]
        self.kbin_discretizer = KBinsDiscretizer(n_bins=n_bins,
                                                 encode='ordinal',
                                                 strategy='uniform')

    def fit(self, data):
        """Learn bin edges from the continuous columns (no-op if none)."""
        if not self.c_index:
            return
        self.kbin_discretizer.fit(data[:, self.c_index])

    def transform(self, data):
        """Return a copy of `data` with continuous columns replaced by
        their ordinal bin ids, cast to int."""
        if not self.c_index:
            return data.astype('int')

        data_t = data.copy()
        data_t[:, self.c_index] = self.kbin_discretizer.transform(
            data[:, self.c_index])
        return data_t.astype('int')

    def inverse_transform(self, data):
        """Map bin ids in continuous columns back to representative values."""
        if not self.c_index:
            return data

        data_t = data.copy().astype('float32')
        data_t[:, self.c_index] = self.kbin_discretizer.inverse_transform(
            data[:, self.c_index])
        return data_t
예제 #4
0
    def preprocess(self, X, method):
        """Preprocess count data by one of three schemes.

        Parameters
        ----------
        X : array-like
            Raw count matrix.
        method : str
            "bucket": quantile-bin into 10 ordinal bins, then scale to 0-1.
            "clip": cap values at 1000 and add the square root of the excess.
            "log": natural log, with log(0)/invalid entries replaced by 0.

        Returns
        -------
        The transformed matrix.

        Raises
        ------
        ValueError
            If `method` is not one of the supported names. (ValueError is
            more precise than the bare Exception raised previously and is
            still caught by any caller handling Exception.)
        """
        if method == "bucket":  # scales into 0-1 range with bins
            print("using the bucket prep method")
            from sklearn.preprocessing import KBinsDiscretizer
            est = KBinsDiscretizer(n_bins=10,
                                   encode="ordinal",
                                   strategy="quantile")
            est.fit(X)
            X_processed = est.transform(X)
            X_processed /= 10  # transform from ordinal bin ids to 0-1
            return X_processed
        elif method == "clip":  # clips the raw counts into a certain range
            print("using the clip prep method")
            cutoff = 1000
            # Values above the cutoff are damped with a square root
            # instead of being discarded outright.
            X_processed = np.minimum(X, cutoff) + np.sqrt(
                np.maximum(X - cutoff, 0))
            return X_processed
        elif method == "log":  # takes the log of the count
            print("using the log prep method")
            import numpy.ma as ma
            mask = ma.log(X)
            # mask logged data to replace invalid entries (log 0) with 0
            X_processed = ma.fix_invalid(mask, fill_value=0).data
            return X_processed
        else:
            raise ValueError("Incorrect preprocess method name passed!")
예제 #5
0
class KBinsDiscretizer(Transformer):
    """In-place transformer that bins numerical fields into discrete ones
    using sklearn's KBinsDiscretizer."""

    def __init__(self, n_bins=3, strategy='uniform'):
        super().__init__("discretizer", 24)
        self.input_type = NUMERICAL
        self.output_type = DISCRETE
        self.compound_mode = 'in_place'
        self.n_bins = n_bins
        self.strategy = strategy

    @ease_trans
    def operate(self, input_datanode: DataNode, target_fields=None):
        from sklearn.preprocessing import KBinsDiscretizer

        X, y = input_datanode.data
        # Default to all numerical fields when none are given explicitly.
        if target_fields is None:
            target_fields = collect_fields(input_datanode.feature_types,
                                           self.input_type)
        selected = X[:, target_fields]

        # Fit lazily on first use; subsequent calls reuse the fitted model.
        if not self.model:
            self.model = KBinsDiscretizer(n_bins=self.n_bins,
                                          encode='ordinal',
                                          strategy=self.strategy)
            self.model.fit(selected)
        return self.model.transform(selected)

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        cs.add_hyperparameters([
            UniformIntegerHyperparameter('n_bins', 2, 20, default_value=5)
        ])
        return cs
예제 #6
0
def test_KBinsDiscretizer_ordinal():
    # Every numeric column should be reported with the numeric mapping.
    expected = pd.DataFrame({"name": numeric, "feature": numeric})

    discretizer = KBinsDiscretizer(n_bins=5, encode="ordinal")
    discretizer.fit(X[numeric])

    assert feat(discretizer, numeric).equals(expected)
예제 #7
0
def q2():
    # Ten ordinal bins over population density; count rows landing in the
    # top (10th) bin, i.e. above the 90th percentile.
    est = KBinsDiscretizer(n_bins=10, encode='ordinal')
    est.fit(countries[['Pop_density']])
    binned = est.transform(countries[['Pop_density']])
    return int(np.sum(binned >= 9))
def feature_eng(X):
    """Engineer Titanic-style features on *X* and return it.

    Adds Title, AgeBins, FamilySize, GroupSize, WithFamily, FareBins and
    Deck; imputes Age/Fare/Embarked; drops PassengerId, Ticket and Name.
    NOTE(review): mutates the caller's DataFrame in place.
    """
    # Extract the honorific from the name; missing ones default to 'Miss'.
    X['Title'] = X['Name'].apply(get_title)
    X['Title'] = X['Title'].fillna('Miss')
    X['Title'] = X['Title'].apply(replace_titles)
    # Impute Age with the median of the (Sex, Pclass, Title) group.
    X.loc[X.Age.isnull(), 'Age'] = X.groupby(['Sex','Pclass','Title']).Age.transform('median')
    X['Pclass'] = X['Pclass'].apply(lambda x: 'first' if x==1 else 'second' if x==2 else 'third')
    # Bin Age with KBinsDiscretizer library defaults (no n_bins/strategy given).
    binner = KBinsDiscretizer(encode='ordinal')
    binner.fit(X[['Age']])
    X['AgeBins'] = binner.transform(X[['Age']])
    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
    family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Small', 
                  5: 'Large', 6: 'Large', 7: 'Large', 8: 'Large', 11: 'Large'}
    X['GroupSize'] = X['FamilySize'].map(family_map)
    X['WithFamily'] = (X['FamilySize']>1)
    # True == 1, so the boolean maps cleanly to 'yes'/'no'.
    X['WithFamily'] = X['WithFamily'].apply(lambda x: 'yes' if x==1 else 'no')
    # Missing or zero fares are replaced by the median fare before binning.
    X.loc[(X.Fare.isnull()), 'Fare'] = X.Fare.median()
    X.loc[(X.Fare==0), 'Fare'] = X.Fare.median()
    # The same binner object is re-fit on Fare (overwriting the Age fit).
    binner.fit(X[['Fare']])
    X['FareBins'] = binner.transform(X[['Fare']])
    # Deck is the first cabin letter; unknown cabins become 'N'.
    X["Deck"] = X["Cabin"].str.slice(0,1)
    X["Deck"] = X["Deck"].fillna("N")
    # 'T' deck entries are folded into deck 'A'.
    idx = X[X['Deck'] == 'T'].index
    X.loc[idx, 'Deck'] = 'A'
    X['Embarked'].fillna(X['Embarked'].mode()[0], inplace=True)
    X.drop('PassengerId', axis=1, inplace=True)
    X.drop('Ticket', axis=1, inplace=True)
    X.drop('Name', axis=1, inplace=True)
    return X
예제 #9
0
def main(dataset_name: str):
    """Fit quantile discretizers over all transaction amounts of a dataset
    (train + valid splits) and pickle them under presets/<dataset>/discretizers/.
    """
    train = load_jsonlines('../data/' + dataset_name + '/lm/train.jsonl')
    valid = load_jsonlines('../data/' + dataset_name + '/lm/valid.jsonl')
    data = train + valid

    # Pool every amount from both splits into a single column vector.
    amounts = []
    for d in data:
        amounts.extend(d['amounts'])
    amounts = np.array(amounts).reshape(-1, 1)

    # One discretizer per bin count; the duplicated fit/save code was
    # collapsed into a loop.
    for n_bins in (100, 50):
        dis = KBinsDiscretizer(n_bins=n_bins,
                               encode='ordinal',
                               strategy='quantile')
        dis.fit(amounts)

        path = ('presets/' + dataset_name + '/discretizers/' +
                str(n_bins) + '_quantile')
        with open(path, 'wb') as f:
            pickle.dump(dis, f)
예제 #10
0
def discretizer(df_list, names, config):
    """
    Concat the df_list as one pd.DataFrame, then discretize the named
    continuous columns with sklearn's KBinsDiscretizer.

    Parameters
    ----------
    df_list: object
        a collection of one or more pd.DataFrame. They must have one column named 'id' for indexing.
    names : list
        a list of the continuous variable columns' names. If it's None, detect which
        columns are continuous and discretize those.

    config : object
        the object of parameters

    Returns
    ----------
    df_list_t : object
        the df_list after transformation, still a collection of pd.DataFrame
    """

    data = concat_df_list(df_list, config.index_col)

    if names is None:
        names, _ = get_continue_feature(data)

    for name in names:
        kbdis = KBinsDiscretizer(n_bins=config.n_bins,
                                 encode="ordinal",
                                 strategy=config.method)
        # KBinsDiscretizer requires 2-D input; data[name] is a 1-D Series
        # and would raise, so select a one-column DataFrame instead.
        kbdis.fit(data[[name]])
        data.loc[:, name + "_discred"] = kbdis.transform(data[[name]])

    df_list_t = retrun_df_list(df_list, data, config)
    return df_list_t
예제 #11
0
def test_invalid_encode_option():
    # An unknown encode value must be rejected at fit time.
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                                   encode='invalid-encode')
    expected = (r"Valid options for 'encode' are "
                r"\('onehot', 'onehot-dense', 'ordinal'\). "
                r"Got encode='invalid-encode' instead.")
    with pytest.raises(ValueError, match=expected):
        discretizer.fit(X)
예제 #12
0
def q2():
    """Question 2: number of rows in the top decile of Pop_density.

    The unreachable `pass` after the return statement was removed.
    """
    kbins_discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal',
                                         strategy='quantile')
    kbins_discretizer.fit(countries[['Pop_density']])
    bins_score = kbins_discretizer.transform(countries[['Pop_density']])
    return int((bins_score >= 9).sum())
예제 #13
0
def test_invalid_strategy_option():
    # An unknown strategy value must be rejected at fit time.
    discretizer = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                                   strategy='invalid-strategy')
    expected = (r"Valid options for 'strategy' are "
                r"\('uniform', 'quantile', 'kmeans'\). "
                r"Got strategy='invalid-strategy' instead.")
    with pytest.raises(ValueError, match=expected):
        discretizer.fit(X)
def discrete_state(_, __, pole_angle, pole_vel) -> Tuple[int, ...]:
    """Convert continues state intro a discrete state"""
    # A uniform binner is re-fit on the (global) state bounds each call,
    # then used to bin the observed angle/velocity pair.
    binner = KBinsDiscretizer(n_bins=num_bins,
                              encode='ordinal',
                              strategy='uniform')
    binner.fit([l_bounds, u_bounds])
    binned = binner.transform([[pole_angle, pole_vel]])[0]
    return tuple(int(value) for value in binned)
예제 #15
0
def q2():
    # Bin population density into deciles, then report how many rows lie
    # at or beyond the (truncated) 90th-percentile index.
    est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    est.fit(df[['Pop_density']])
    binned = est.transform(df[["Pop_density"]])
    p90 = math.trunc((90 / 100) * len(binned))
    return len(binned) - p90
def test_kbinsdiscretizer_subsample_warn():
    """Fitting more than 2e5 samples must warn about the future subsample
    default."""
    # np.random.rand(200001, 1) is already 2-D; the trailing
    # .reshape(-1, 1) was redundant and has been dropped.
    X = np.random.rand(200001, 1)
    kbd = KBinsDiscretizer(n_bins=100, encode="ordinal", strategy="quantile")

    msg = "In version 1.3 onwards, subsample=2e5 will be used by default."
    with pytest.warns(FutureWarning, match=msg):
        kbd.fit(X)
예제 #17
0
def preprocess_data(dataset, prune_experiments=False, bins=None):
    """Build (X, Y) matrices and fitted scalers from a raw experiment dataset.

    Row layout assumption (TODO confirm against the data loader):
    col 0 = experiment id, cols 2:4 = position, cols 4:8 = quaternion,
    col 8 = speed, col 9 = steering.

    Args:
        dataset: 2-D array of raw rows.
        prune_experiments: drop leading rows of each experiment until the
            ball starts moving (|speed| or |steering| >= 5).
        bins: if not None, discretize Y into this many uniform bins
            (classification targets); otherwise min-max normalize Y.

    Returns:
        X, Y, scalerX, scalerY
    """
    # Prune rows at beginning of experiments where speed and steering are too small.
    if prune_experiments:
        pruned_dataset = []
        current_exp = dataset[0][0]
        current_exp_has_started = False
        for row in dataset:
            # Check if we just changed experiment.
            exp = row[0]
            if (exp != current_exp):
                current_exp = exp
                current_exp_has_started = False

            # Check if the ball has started to move.
            if abs(row[8]) >= 5 or abs(row[9]) >= 5:
                current_exp_has_started = True
            # If ball has started to move then add row
            if current_exp_has_started:
                pruned_dataset.append(row)

        # Set dataset to pruned version.
        dataset = np.array(pruned_dataset)

    # Create input matrix X: position, quaternion and derived Euler angles.
    pos_X = np.array(dataset[:, 2:4], dtype='float64')
    quat_X = np.array(dataset[:, 4:8], dtype='float64')
    euler_X = np.matrix(
        [quaternion_to_euler(q[0], q[1], q[2], q[3]) for q in quat_X])

    # Join blocks.
    X = np.concatenate((pos_X, quat_X, euler_X), axis=1)

    # Normalize X.
    scalerX = MinMaxScaler()
    scalerX.fit(X)
    X = scalerX.transform(X)

    # Create target matrix Y.
    speed_y = np.array(dataset[:, 8], dtype='float64')
    steering_y = np.array(dataset[:, 9], dtype='float64')

    # Join blocks.
    Y = np.column_stack((speed_y, steering_y))

    # Classification branch (marked UNTESTED upstream).
    # `is not None` replaces the non-idiomatic `bins != None` comparison.
    if bins is not None:
        scalerY = KBinsDiscretizer(n_bins=bins,
                                   encode='ordinal',
                                   strategy='uniform')
        scalerY.fit(Y)
        Y = scalerY.transform(Y)
    else:
        # Normalize Y.
        scalerY = MinMaxScaler()
        scalerY.fit(Y)
        Y = scalerY.transform(Y)

    return X, Y, scalerX, scalerY
예제 #18
0
    def bins(self, data, labels):
        """Compute bin-edge arrays for each feature in ``self.to_discretize``.

        For each feature: pick a cluster count via the elbow method
        (KElbowVisualizer over KMeans), fit a k-means KBinsDiscretizer with
        that many bins, and collect its edges. NOTE(review): ``labels`` is
        accepted but never used here.
        """
        bins = []
        for feature in self.to_discretize:
            # sklearn/yellowbrick expect a 2-D column vector.
            x = np.reshape(data[:, feature], (-1, 1))
            model = KMeans()
            visualizer = KElbowVisualizer(model, k=(3,12))
            visualizer.fit(x)
            # elbow_value_ may be None if no elbow is found -- TODO confirm
            # how KBinsDiscretizer behaves in that case.
            n_bins = visualizer.elbow_value_
            kmeans = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='kmeans')
            # Optionally persist the elbow plot next to the other figures.
            if self.filename != "":
                plt.show(block=False)
                plt.savefig(self.filename + "/" + str(self.feature_names[feature]) + "_elbow_visualisation.png")
                plt.close('all')
            kmeans.fit(x)
            qts = []
            # bin_edges_ holds one edge array per fitted column (a single
            # column here).
            for biner in kmeans.bin_edges_:
                qts.append(biner)
            qts = np.array(qts)

            # Fall back to the feature median when no edges were produced.
            if qts.shape[0] == 0:
                qts = np.array([np.median(data[:, feature])])
            else:
                qts = np.sort(qts)

            bins.append(qts)
        return bins
예제 #19
0
def q2():
    from sklearn.preprocessing import KBinsDiscretizer

    # Decile-bin the population density and count rows in the top bin.
    discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal',
                                   strategy='quantile')
    discretizer.fit(countries[['Pop_density']])
    binned = discretizer.transform(countries[['Pop_density']])

    return int(sum(binned[:, 0] >= 9))
예제 #20
0
def KBinsDiscretizer_continuos(dt, attributes=None, bins=3):
    """Discretize continuous columns of `dt` into labeled interval strings.

    Columns with more than 10 distinct values are quantile-binned and each
    value replaced by a label like "<=a", "(a-b]" or ">b"; other columns
    are simply cast to object dtype.

    Args:
        dt: input DataFrame.
        attributes: columns to consider (default: all columns).
        bins: number of quantile bins.

    Returns:
        A new DataFrame with discretized columns.
    """
    attributes = dt.columns if attributes is None else attributes
    # `np.object` was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin `object` is the documented, behavior-identical replacement.
    continuous_attributes = [
        a for a in attributes if dt.dtypes[a] != object
    ]
    X_discretize = dt[attributes].copy()

    for col in continuous_attributes:
        if len(dt[col].value_counts()) > 10:
            from sklearn.preprocessing import KBinsDiscretizer
            est = KBinsDiscretizer(n_bins=bins,
                                   encode='ordinal',
                                   strategy='quantile')
            est.fit(dt[[col]])
            # Inner bin edges rounded to ints (outermost edges dropped).
            edges = [i.round() for i in est.bin_edges_][0]
            edges = [int(i) for i in edges][1:-1]
            # Drop consecutive duplicate edges produced by skewed data.
            if len(set(edges)) != len(edges):
                edges = [
                    edges[i] for i in range(0, len(edges))
                    if len(edges) - 1 == i or edges[i] != edges[i + 1]
                ]
            for i in range(0, len(edges)):
                if i == 0:
                    data_idx = dt.loc[dt[col] <= edges[i]].index
                    X_discretize.loc[data_idx, col] = f"<={edges[i]}"
                if i == len(edges) - 1:
                    data_idx = dt.loc[dt[col] > edges[i]].index
                    X_discretize.loc[data_idx, col] = f">{edges[i]}"

                data_idx = dt.loc[(dt[col] > edges[i - 1])
                                  & (dt[col] <= edges[i])].index
                X_discretize.loc[data_idx, col] = f"({edges[i-1]}-{edges[i]}]"
        else:
            X_discretize[col] = X_discretize[col].astype('object')
    return X_discretize
class KBinsDiscreteFeatureExtractor(BaseTransformer):
    """Bin the given columns with a shared KBinsDiscretizer.

    strategy is one of 'uniform', 'quantile', 'kmeans'.
    """
    def __init__(self, cols, n_bins=10, encode="ordinal", strategy="uniform"):
        self.cols = cols
        # Normalize n_bins to one entry per column; isinstance() replaces
        # the non-idiomatic type() comparison.
        self.n_bins = ([n_bins] * len(cols)
                       if isinstance(n_bins, int) else n_bins)
        self.kbin = KBinsDiscretizer(n_bins=self.n_bins,
                                     encode=encode,
                                     strategy=strategy)
        self.encode = encode
        self.strategy = strategy

    def _get_column_names(self):
        # Output name pattern: k<bins>bins<strategy>category_<col>.
        return [
            f"k{bins}bins{self.strategy}category_{col}"
            for col, bins in zip(self.cols, self.n_bins)
        ]

    def fit(self, X):
        """Fit the discretizer; NaNs are treated as 0. Returns self
        (sklearn convention) so fit().transform() chains work."""
        self.kbin.fit(X[self.cols].fillna(0))
        return self

    def transform(self, X):
        """Return a new DataFrame with one binned column per input column."""
        value = self.kbin.transform(X[self.cols].fillna(0))
        df = pd.DataFrame()
        # Reuse the naming helper so column names cannot drift from
        # _get_column_names().
        for i, name in enumerate(self._get_column_names()):
            df[name] = value[:, i]
        return df
예제 #22
0
def popularity_classification(wgt_pop,
                              n_bins=5,
                              encode='ordinal',
                              strategy='quantile'):
    """Discretize continuous popularity probability into intervals (Feature Binarization).

    (The docstring previously opened with a malformed quadruple quote and
    documented a non-existent `nbins` parameter.)

    Args:
    wgt_pop:    df. Output from calc_popularity()
    n_bins:     int. Number of classes/bins
    encode:     'onehot', 'onehot-dense', 'ordinal' where the latter returns the bin identifier encoded as an integer value.
    strategy:   'uniform', 'kmeans', 'quantile' where the latter distributes all points equally between the bins (all bins have an equal amount of points).

    Returns:
    df with popularity and popularity_class
    """
    # extract popularity-values as a 2-D column vector for sklearn
    X = np.array(wgt_pop).reshape(-1, 1)

    # Configure and fit Binarizer
    enc = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy)
    enc.fit(X)
    x_encoded = pd.DataFrame(enc.transform(X))

    # Append the bin identifier as an additional column; +1 so classes
    # start at 1 and not 0.
    wgt_pop_df = pd.DataFrame(wgt_pop.copy())
    wgt_pop_df['popularity_class'] = [int(x) + 1 for x in x_encoded.values]

    return wgt_pop_df
예제 #23
0
def _get_bins(num_bins: int, values: List[Number]):
    """
    Perform equal frequency binning (i.e. quantile binning) to bin the values
    into `num_bins` bins.

    Parameters
    ----------
    num_bins
    values

    Returns
    -------
    bin indices
    """
    # sklearn requires a 2-D float array: one sample per row.
    column = np.array(values).reshape(-1, 1)

    # Quantile strategy gives (approximately) equal-frequency bins.
    quantile_binner = KBinsDiscretizer(n_bins=num_bins,
                                       encode="ordinal",
                                       strategy="quantile")
    quantile_binner.fit(column)

    # Flatten back to one bin index per input value.
    bin_idx = quantile_binner.transform(column).reshape(-1)

    # Debug
    _log.debug(f"Split heuristic values into {num_bins} bins")
    _log.debug(f"Bin Edges: {quantile_binner.bin_edges_}")
    _log.debug(f"Bin Frequencies: {Counter(bin_idx.tolist())}")
    _log.debug(f"Heuristic Frequencies: {Counter(values)}")
    return bin_idx
예제 #24
0
class DiscretizeTransformer(Transformer):
    """Discretize continuous columns into several bins.

    Attributes:
        meta
        column_index
        discretizer(sklearn.preprocessing.KBinsDiscretizer)

    Transformation result is a int array.

    """
    def __init__(self, n_bins):
        self.n_bins = n_bins
        self.meta = None          # column metadata, filled in by fit()
        self.column_index = None  # positions of the continuous columns
        self.discretizer = None

    def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
        """Learn bin edges for the continuous columns of `data`."""
        self.meta = self.get_metadata(data, categorical_columns,
                                      ordinal_columns)
        self.column_index = [
            index for index, info in enumerate(self.meta)
            if info['type'] == CONTINUOUS
        ]

        self.discretizer = KBinsDiscretizer(n_bins=self.n_bins,
                                            encode='ordinal',
                                            strategy='uniform')

        if not self.column_index:
            return

        self.discretizer.fit(data[:, self.column_index])

    def transform(self, data):
        """Transform data discretizing continous values.

        Args:
            data(numpy.ndarray)

        Returns:
            numpy.ndarray

        """
        if self.column_index == []:
            return data.astype('int')

        # Work on a copy: the previous implementation wrote the discretized
        # values back into the caller's array as a side effect.
        data_t = data.copy()
        data_t[:, self.column_index] = self.discretizer.transform(
            data[:, self.column_index])
        return data_t.astype('int')

    def inverse_transform(self, data):
        """Map bin identifiers back to representative continuous values."""
        if self.column_index == []:
            return data

        # astype() returns a copy, so the caller's array stays untouched.
        data_t = data.astype('float32')
        data_t[:, self.column_index] = self.discretizer.inverse_transform(
            data_t[:, self.column_index])
        return data_t
예제 #25
0
File: affect.py  Project: svrijenhoek/dart
class Affect:
    """
    Class that calculates the average Affect score based on absolute sentiment polarity values.
    This approach is an initial approximation of the concept, and should be refined in the future.
    Should also implement polarity analysis at index time.
    """
    def __init__(self, config):
        # Fixed number of uniform-width bins over the pool's affect range.
        # NOTE(review): `config` is accepted but never used here.
        n_bins = 5
        self.bins_discretizer = KBinsDiscretizer(encode='ordinal',
                                                 n_bins=n_bins,
                                                 strategy='uniform')
        # Silence sklearn UserWarnings (e.g. about redundant bin edges).
        warnings.filterwarnings("ignore", category=UserWarning)

    def compute_distr(self, arr, bins_discretizer, adjusted=False):
        """Return the distribution of `arr` over the discretizer's bins.

        Args:
            arr: 2-D column vector of affect values.
            bins_discretizer: a fitted KBinsDiscretizer.
            adjusted: when True, each item is weighted by the inverse of its
                rank (normalized by the harmonic number), so earlier items
                count more; otherwise plain relative frequencies (rounded
                to 3 decimals) are used.

        Returns:
            dict mapping bin index -> probability mass.
        """
        n = len(arr)
        sum_one_over_ranks = harmonic_number(n)
        arr_binned = bins_discretizer.transform(arr)
        distr = {}
        if adjusted:
            # Rank-discounted mass: item at rank r contributes
            # (1/r) / H(n) to its bin.
            for bin in list(range(bins_discretizer.n_bins)):
                for indx, ele in enumerate(arr_binned[:, 0]):
                    if ele == bin:
                        rank = indx + 1
                        bin_freq = distr.get(bin, 0.)
                        distr[
                            bin] = bin_freq + 1 * 1 / rank / sum_one_over_ranks

        else:
            # Plain relative frequency per bin.
            for bin in list(range(bins_discretizer.n_bins)):
                distr[bin] = round(
                    np.count_nonzero(arr_binned == bin) / arr_binned.shape[0],
                    3)
        return distr

    def calculate(self, pool, recommendation):
        """Return [rank-discounted divergence, plain divergence] between the
        pool's and the recommendation's affect distributions."""
        # Affect is the absolute sentiment polarity, shaped as a column
        # vector for the discretizer.
        pool_affect = np.array(pool.sentiment.apply(lambda x: abs(x))).reshape(
            -1, 1)
        recommendation_affect = np.array(
            recommendation.sentiment.apply(lambda x: abs(x))).reshape(-1, 1)

        # Bin edges are learned from the pool only, then applied to both.
        self.bins_discretizer.fit(pool_affect)
        distr_pool = self.compute_distr(pool_affect, self.bins_discretizer,
                                        False)
        distr_recommendation = self.compute_distr(recommendation_affect,
                                                  self.bins_discretizer, True)
        divergence_with_discount = compute_kl_divergence(
            distr_pool, distr_recommendation)

        distr_recommendation = self.compute_distr(recommendation_affect,
                                                  self.bins_discretizer, False)
        divergence_without_discount = compute_kl_divergence(
            distr_pool, distr_recommendation)
        return [divergence_with_discount, divergence_without_discount]
예제 #26
0
class KBinsDiscretizerComponent(AutoSklearnPreprocessingAlgorithm):
    """Auto-sklearn preprocessing wrapper around sklearn's KBinsDiscretizer."""

    def __init__(self,
                 n_bins: int = 5,
                 encode: str = "onehot",
                 strategy: str = "quantile",
                 random_state=None):
        super().__init__()
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.random_state = random_state

    def fit(self, X, Y=None):
        from sklearn.preprocessing import KBinsDiscretizer

        # Hyperparameter values may arrive as strings; coerce explicitly.
        self.preprocessor = KBinsDiscretizer(n_bins=int(self.n_bins),
                                             encode=self.encode,
                                             strategy=self.strategy)
        self.preprocessor.fit(X, Y)
        return self

    def transform(self, X):
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'KBinsDiscretizer',
            'name': 'K-Bins Discretizer',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            'is_deterministic': True,
            'input': (DENSE, UNSIGNED_DATA),
            'output': (INPUT, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        cs.add_hyperparameters([
            UniformIntegerHyperparameter("n_bins", 2, 100, default_value=5),
            CategoricalHyperparameter("encode",
                                      ["onehot", "onehot-dense", "ordinal"],
                                      default_value="onehot"),
            CategoricalHyperparameter("strategy",
                                      ["uniform", "quantile", "kmeans"],
                                      default_value="quantile"),
        ])
        return cs
예제 #27
0
def generate_discretizer(pageviews):
    """Fit a k-means discretizer over the sorted pageview counts and
    return it together with the fitted column vector."""
    view_numbers = np.array(sorted(pageviews.values())).reshape(-1, 1)
    discretizer = KBinsDiscretizer(encode='ordinal',
                                   n_bins=number_bins(),
                                   strategy='kmeans')
    discretizer.fit(view_numbers)
    return [discretizer, view_numbers]
예제 #28
0
def test_transform_1d_behavior():
    # fit() must reject 1-D input...
    samples = np.arange(4)
    est = KBinsDiscretizer(n_bins=2)
    assert_raises(ValueError, est.fit, samples)

    # ...and transform() must reject 1-D input even after a valid 2-D fit.
    est = KBinsDiscretizer(n_bins=2)
    est.fit(samples.reshape(-1, 1))
    assert_raises(ValueError, est.transform, samples)
예제 #29
0
def discretizer(_, __, angle, pole_velocity) -> Tuple[int, ...]:
    '''
        Convert continuous states into a discrete state for Q-Learning
    '''
    # Fit a uniform binner on the (global) state bounds, then bin the
    # observed angle/velocity pair.
    binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal',
                              strategy='uniform')
    binner.fit([lower_bounds, upper_bounds])
    discrete = binner.transform([[angle, pole_velocity]])[0]
    return tuple(int(value) for value in discrete)
예제 #30
0
def test_transform_1d_behavior():
    X = np.arange(4)

    # A 1-D array is not a valid fit input.
    unfitted = KBinsDiscretizer(n_bins=2)
    assert_raises(ValueError, unfitted.fit, X)

    # Even a properly fitted estimator rejects 1-D transform input.
    fitted = KBinsDiscretizer(n_bins=2)
    fitted.fit(X.reshape(-1, 1))
    assert_raises(ValueError, fitted.transform, X)
예제 #31
0
def q2():
    # Decile-bin the population density and count rows that fall exactly
    # in the top bin (bin index 9).
    binner = KBinsDiscretizer(n_bins=10,
                              encode='ordinal',
                              strategy='quantile')
    binner.fit(countries[['Pop_density']])
    binned = binner.transform(countries[['Pop_density']])
    return int(sum(binned[:, 0] == 9))
예제 #32
0
def test_transform_outside_fit_range(strategy):
    X = np.array([0, 1, 2, 3])[:, None]
    kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode='ordinal')
    kbd.fit(X)

    # Values below/above the fitted range must clip to the first/last bin.
    outside = np.array([-2, 5])[:, None]
    transformed = kbd.transform(outside)
    assert_array_equal(transformed.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(transformed.min(axis=0), [0])
예제 #33
0
def test_fit_transform(strategy, expected):
    est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    est.fit(X)
    assert_array_equal(expected, est.transform(X))
    xx, yy = np.meshgrid(
        np.linspace(X[:, 0].min(), X[:, 0].max(), 300),
        np.linspace(X[:, 1].min(), X[:, 1].max(), 300))
    grid = np.c_[xx.ravel(), yy.ravel()]

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())

    i += 1
    # transform the dataset with KBinsDiscretizer
    for strategy in strategies:
        enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy=strategy)
        enc.fit(X)
        grid_encoded = enc.transform(grid)

        ax = plt.subplot(len(X_list), len(strategies) + 1, i)

        # horizontal stripes
        horizontal = grid_encoded[:, 0].reshape(xx.shape)
        ax.contourf(xx, yy, horizontal, alpha=.5)
        # vertical stripes
        vertical = grid_encoded[:, 1].reshape(xx.shape)
        ax.contourf(xx, yy, vertical, alpha=.5)

        ax.scatter(X[:, 0], X[:, 1], edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())