Example #1
def bayesian_rule_list(X_train, y_train, X_test, y_test):
    import pysbrl
    from mdlp.discretization import MDLP
    from sklearn import preprocessing

    # First one hot encode
    X_train, X_test = apply_one_hot_encoding(X_train, X_test)

    # Then need to convert classes to integers
    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)

    # Then discretize features
    transformer = MDLP()
    X_train = transformer.fit_transform(X_train, y_train)
    X_test = transformer.transform(X_test)

    brl = pysbrl.BayesianRuleList()
    brl.fit(X_train, y_train)

    print(brl)

    # The complexity is the number of split points plus the number of extra conditions
    # (e.g. "IF x1 > 0 AND x2 = 1 THEN ..." counts as 2, not 1); for this reason we do not use brl.n_rules
    brl_str = str(brl)
    brl_complexity = brl_str.count("IF") + brl_str.count("AND")

    training_recreations = brl.predict(X_train)
    brl_training_recreating_pct = scorer(training_recreations, y_train) * 100
    testing_recreations = brl.predict(X_test)
    brl_testing_recreating_pct = scorer(testing_recreations, y_test) * 100

    return brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity
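The function above relies on two helpers defined elsewhere, apply_one_hot_encoding and scorer. A minimal sketch of plausible stand-ins (assumptions, not the originals):

import pandas as pd
from sklearn.metrics import accuracy_score

def apply_one_hot_encoding(X_train, X_test):
    # Hypothetical stand-in: encode train and test together so both splits
    # share the same dummy columns, then split them back apart.
    combined = pd.get_dummies(pd.concat([X_train, X_test]))
    return combined.iloc[:len(X_train)], combined.iloc[len(X_train):]

def scorer(predictions, truth):
    # Hypothetical stand-in: plain accuracy in [0, 1].
    return accuracy_score(truth, predictions)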
Example #2
class MDLPDiscretizer(BaseDiscretizer):
    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        if labels is None:
            raise ValueError('Labels must not be None when using '
                             'MDLPDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        self.transformer = MDLP()
        discretized_data = self.transformer.fit_transform(data, labels)
        bins = []
        for i in range(len(set(labels))):
            intervals = set(self.transformer.cat2intervals(discretized_data, i))
            # Collect the interval endpoints, dropping the infinite outer
            # bounds so only real cut points remain.
            feature_interval = []
            for interval in intervals:
                feature_interval.append(interval[0])
                feature_interval.append(interval[1])
            feature_interval = set(feature_interval)
            feature_interval.discard(float('inf'))
            feature_interval.discard(float('-inf'))
            bins.append(np.array(sorted(feature_interval)))
        return bins
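For context, cat2intervals maps each discretized value back to its (low, high) interval, one tuple per sample; a small sketch with toy data:

import numpy as np
from mdlp.discretization import MDLP

X = np.arange(9, dtype=float).reshape(-1, 1)
y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
mdlp = MDLP()
X_disc = mdlp.fit_transform(X, y)
# One (low, high) tuple per sample; the outermost bins extend to +/- inf,
# which is why bins() above discards the infinities.
print(mdlp.cat2intervals(X_disc, 0))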
Example #3
def all_entropies(df1, total):
    mdlp = MDLP()
    mdlp.fit(df1[df1.columns[:-1]].values,
             df1[df1.columns[-1]].values)
    entropies = []

    def add_entropy(subset):
        # Class counts within one interval; pad with a zero when only one
        # class is present so cal_entropy always receives two counts.
        temp = subset['class'].value_counts().values.tolist()
        if len(temp) < 2:
            temp.append(0)
        entropies.append(cal_entropy(temp, total))

    for x, cuts in enumerate(mdlp.cut_points_):
        if len(cuts) == 0:
            # No cut points: the whole feature range is a single interval.
            add_entropy(df1)
            continue
        # Leftmost interval, every inner interval, then the rightmost one.
        add_entropy(df1[df1[x] <= cuts[0]])
        for j in range(len(cuts) - 1):
            add_entropy(df1[(df1[x] > cuts[j]) & (df1[x] <= cuts[j + 1])])
        add_entropy(df1[df1[x] > cuts[-1]])

    return sorted(entropies)
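all_entropies assumes a helper cal_entropy(counts, total) defined elsewhere; a plausible sketch of such a helper (an assumption about its intent, not the original):

import math

def cal_entropy(counts, total):
    # Hypothetical helper: Shannon entropy of the class counts in one
    # interval, weighted by that interval's share of all samples.
    n = sum(counts)
    entropy = 0.0
    for c in counts:
        if c > 0:
            p = c / n
            entropy -= p * math.log2(p)
    return (n / total) * entropy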
Example #4
def discMdlp(_df):
    featureVals = [x for x in _df if x != 'Class']
    transformer = MDLP()

    discret = transformer.fit_transform(_df[featureVals], _df['Class'])
    nFrame = pd.DataFrame(data=discret, columns=featureVals)
    # Assign by .values to avoid index misalignment with the new RangeIndex.
    nFrame['Class'] = _df['Class'].values

    return nFrame
Example #5
def test_fit_transform_translate():
    expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)

    X = np.arange(9, dtype=float).reshape(-1, 1)
    y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
    transformed = MDLP(shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, transformed)

    # translating data does not affect discretization result
    translated = MDLP(shuffle=False).fit_transform(X - 5, y)
    assert_array_equal(expected, translated)
Example #6
def classifier(args):
    dataset_info = datasets_info[args.data_type]

    df = pd.read_csv(dataset_info['path'])
    for drop_col in dataset_info['drop_columns']:
        df = df.drop(columns=df.columns[drop_col])
    y = df[df.columns[dataset_info['class_column']]]
    X = df.drop(columns=df.columns[dataset_info['class_column']])

    if args.plot:
        sns.pairplot(df, hue=df.columns[dataset_info['class_column']])
        plt.show()

    # Discretize values before training
    if args.discretization_bins > 0:
        if args.discretization_mode == DISC_MDLP:
            transformer = MDLP()
            X = transformer.fit_transform(X, y)
        else:
            for column in X:
                bins = discretization(args.discretization_mode, X[column],
                                      args.discretization_bins)
                X[column] = bins

    # Splitting the dataset into the Training set and Test set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=42)

    # Create a new figure and set the figsize argument so we get square-ish plots of the 4 features.
    if args.plot:
        plt.figure(figsize=(10, 3))

    # Iterate over the features, creating a subplot with a histogram for each one.
    if args.plot:
        for feature in range(X_train.shape[1]):
            plt.subplot(1, len(X_train.columns), feature + 1)
            # distplot is deprecated in recent seaborn; histplot is the replacement
            sns.histplot(X_train.values[:, feature], kde=True)
        plt.show()

    # Fitting Naive Bayes Classification to the Training set
    # classifier = GaussianNB()
    classifier = MultinomialNB(alpha=1.0)
    classifier.fit(X_train, y_train)

    cross_validation(classifier, X, y)

    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    print(y_pred)

    evaluation(y_test, y_pred, args)
Example #7
def get_discretizer(x,
                    y,
                    continuous_features=None,
                    seed=None,
                    min_depth=0) -> MDLP:
    discretizer = MDLP(random_state=seed, min_depth=min_depth)
    if continuous_features is not None:
        # np.bool was removed from NumPy; compare against the builtin bool.
        if continuous_features.dtype == bool:
            # Convert a boolean column mask into integer feature indices.
            continuous_features = np.flatnonzero(continuous_features)
    discretizer.fit(x, y, continuous_features)
    return discretizer
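A quick illustration of the boolean-mask branch above, with toy values:

import numpy as np

mask = np.array([True, False, True, False])
# A boolean mask over columns becomes the integer indices of the
# continuous features: [0 2]
print(np.flatnonzero(mask))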
Example #8
def test_coerce_list():
    expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)

    X = [[i] for i in range(9)]
    y = [0, 0, 0, 0, 1, 0, 1, 1, 1]
    transformed = MDLP(shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, transformed)

    np_X = np.arange(9).reshape(-1, 1)
    np_y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
    np_transformed = MDLP(shuffle=False).fit_transform(np_X, np_y)
    assert_array_equal(expected, np_transformed)
Example #9
def test_sparse_input():
    expected = [
        [0, 0],
        [0, 0],
        [1, 0],
        [2, 0],
    ]

    dense_X = np.array([[0.1, 0.1], [0.2, 0.4], [0.3, 0.2], [0.4, 0.3]])
    X = scipy.sparse.csr_matrix(dense_X)
    y = np.array([0, 0, 1, 2])
    disc = MDLP(shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc.toarray())
Example #10
def test_drop_collapsed_features_sparse():
    expected = [
        [0, 0],
        [0, 0],
        [1, 1],
        [2, 2],
    ]

    dense_X = np.array([[0.1, 0.1, 0.1, 0.1, 0.1], [0.4, 0.2, 0.4, 0.2, 0.4],
                        [0.2, 0.3, 0.2, 0.3, 0.2], [0.3, 0.4, 0.3, 0.4, 0.3]])
    X = scipy.sparse.csr_matrix(dense_X)
    y = np.array([0, 0, 1, 2])
    disc = MDLP(drop_collapsed_features=True,
                shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc.toarray())
Example #11
def test_BayesianRuleList2():
    dataset = load_breast_cancer()
    x, y = dataset['data'], dataset['target']
    feature_names = dataset['feature_names']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=42)
    discretizer = MDLP(random_state=42).fit(x_train, y_train)
    x_train_cat = discretizer.transform(x_train)
    category_names = compute_intervals(discretizer)
    rule_list = BayesianRuleList(seed=1, feature_names=feature_names, category_names=category_names, verbose=2)
    rule_list.fit(x_train_cat, y_train)
    print(rule_list)
    x_test_cat = discretizer.transform(x_test)

    print('acc: %.4f' % rule_list.score(x_test_cat, y_test))
Example #12
    def __init__(
        self,
        mdlp_args: Dict[str, Any] = None,
    ):
        """
        This method of discretisation applies MDLP to discretise the data

        Args:
            min_depth: The minimum depth of the interval splitting.
            min_split: The minmum size to split a bin
            dtype: The type of the array returned by the `transform()` method
            **dlp_args: keyword arguments, which are parameters used for `mdlp.discretization.MDLP`
        Raises:
            ImportError: if mdlp-discretization is not installed successfully
        """
        super().__init__()
        if MDLP is None:
            raise ImportError(
                "mdlp-discretization was not installed and imported successfully"
            )
        mdlp_args = mdlp_args or {"min_depth": 0, "min_split": 1e-3, "dtype": int}
        self.mdlp_args = mdlp_args
        self.feat_names = None
        self.map_feat_transformer = {}
        self.mdlp = MDLP(**mdlp_args)
Example #13
def discretizer2json(discretizer: MDLP, data=None) -> List[dict]:
    cut_points = discretizer.cut_points_  # type: list
    category_intervals = [None] * len(cut_points)
    # Convert each per-feature array of cut points to a plain list so the
    # result is JSON-serializable; keep None for collapsed features.
    cut_points = [
        None if cut_point is None else list(cut_point) for cut_point in cut_points
    ]
    maxs = discretizer.maxs_
    mins = discretizer.mins_
    # print(cut_points)
    for i, _cut_points in enumerate(cut_points):
        if _cut_points is None:
            continue
        cats = np.arange(len(_cut_points) + 1)
        intervals = [[
            None if low == -inf else low, None if high == inf else high
        ] for low, high in discretizer.cat2intervals(cats, i)]
        category_intervals[i] = intervals

    return [
        {
            'cutPoints': cut_points[i],
            'intervals': category_intervals[i],
            'max': maxs[i],
            'min': mins[i],
            # 'ratios': category_ratios[i]
        } for i in range(len(cut_points))
    ]
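A hedged usage sketch for discretizer2json (it assumes the fitted MDLP variant exposes cut_points_, mins_ and maxs_, as the function itself does):

from sklearn.datasets import load_iris

iris = load_iris()
discretizer = MDLP().fit(iris.data, iris.target)
info = discretizer2json(discretizer)
# One dict per feature: cut points, readable intervals, feature min and max.
print(info[0]['cutPoints'], info[0]['min'], info[0]['max'])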
Example #14
    def bins(self, data, labels):
        self.transformer = MDLP()
        discretized_data = self.transformer.fit_transform(data, labels)
        bins = []
        for i in range(len(set(labels))):
            intervals = set(self.transformer.cat2intervals(discretized_data, i))
            feature_interval = []
            for interval in intervals:
                feature_interval.append(interval[0])
                feature_interval.append(interval[1])
            feature_interval = set(feature_interval)
            feature_interval.discard(float('inf'))
            feature_interval.discard(float('-inf'))
            bins.append(np.array(sorted(feature_interval)))
        return bins

Example #15
class SupervisedDiscretizationStrategy(object):
    """
        A class used for supervised data discretization.
    """
    def __init__(self):
        self.transformer = MDLP()

    def discretize(self, data_set, validation_size, nb_bins=None):
        """ Discretize continuous attributes using the MDLP method.

        Args:
            data_set: The data set containing continuous data.
            validation_size: The validation size of the newly created discretized data set.
            nb_bins: Unused here; MDLP determines the number of bins itself.

        Returns:
            discretized_dataset: A DataSet object containing discretized data.
        """

        # Create strategy object to further create the discretized data set.
        galaxy_dataset_feature_strategy = GalaxyDataSetFeatureStrategy()

        # Get data from training set.
        X_train = data_set.train.get_features
        y_train = data_set.train.get_labels

        # Supervised discretization of the training data set using MDLP.
        X_train_discretized = self.transformer.fit_transform(X=X_train,
                                                             y=y_train)

        # Get data from validation set.
        X_valid = data_set.valid.get_features
        y_valid = data_set.valid.get_labels

        # Apply the fitted discretizer to the validation data (no labels needed).
        X_valid_discretized = self.transformer.transform(X=X_valid)

        # Merge both training and validation data.
        X = np.append(X_train_discretized, X_valid_discretized, axis=0)
        y = np.append(y_train, y_valid, axis=0)

        # Create a new data set.
        discretized_dataset = galaxy_dataset_feature_strategy.create_datasets(
            X, y, validation_size)

        return discretized_dataset
Example #16
def num2cate_fit(df, min_depth=2):
    '''
    Args
        df (pandas DataFrame): the last column must be the class, int 0 or 1
        min_depth (int): the minimum depth of the interval splitting. Overrides
            the MDLP stopping criterion: if the entropy at a given interval
            is found to be zero before `min_depth`, the algorithm will stop.
    Return
        mdlp (MDLP instance): a fitted transformer that can discretize samples
    '''
    Y = df.iloc[:, -1].values
    continuous_features = df.iloc[:, :-1].select_dtypes(include=['int64', 'float64']).columns.tolist()
    continuous_features.sort()  # ensure the feature order matches between fit and transform
    X = df[continuous_features].values
    mdlp = MDLP(min_depth=min_depth)
    mdlp.fit(X, Y)  # X and Y should be numpy arrays

    return mdlp
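A hypothetical usage sketch for num2cate_fit on a made-up toy frame:

import pandas as pd

toy = pd.DataFrame({'f1': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                    'class': [0, 0, 0, 1, 1, 1]})
mdlp = num2cate_fit(toy)
# Transform new samples using the same (sorted) feature order as in fit.
print(mdlp.transform(toy[['f1']].values))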
Example #17
def test_fit_transform_scale():
    expected = [
        [0, 0],
        [0, 0],
        [1, 0],
        [2, 0],
    ]

    X = np.array([[0.1, 0.1], [0.2, 0.4], [0.3, 0.2], [0.4, 0.3]])
    y = np.array([0, 0, 1, 2])
    for i in range(10):
        scaled_disc = MDLP(shuffle=False).fit_transform(X / 10**i, y)
        assert_array_equal(expected, scaled_disc)
Example #18
def test_drop_collapsed_features_dense():
    expected = [
        [0, 0],
        [0, 0],
        [1, 1],
        [2, 2],
    ]

    X = np.array([[0.1, 0.1, 0.1, 0.1, 0.1], [0.4, 0.2, 0.4, 0.2, 0.4],
                  [0.2, 0.3, 0.2, 0.3, 0.2], [0.3, 0.4, 0.3, 0.4, 0.3]])
    y = np.array([0, 0, 1, 2])
    disc = MDLP(drop_collapsed_features=True,
                shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc)
Example #19
def test_multiprocessing():
    """Only tests that the functionality is not affected, not that parallel
       processing actually takes place.
    """
    expected = [
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 2, 0, 2, 0],
    ]

    X = np.array([[0.1, 0.1, 0.1, 0.1, 0.1], [0.4, 0.2, 0.4, 0.2, 0.4],
                  [0.2, 0.3, 0.2, 0.3, 0.2], [0.3, 0.4, 0.3, 0.4, 0.3]])
    y = np.array([0, 0, 1, 2])
    disc = MDLP(n_jobs=3, shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc)
Example #20
def get_category_ratios(
        data,
        discretizer: MDLP,
        categories: List[List[str]] = None) -> List[List[float]]:
    continuous = set(discretizer.continuous_features)
    ratios = []
    for idx in range(data.shape[1]):
        # print(idx)
        col = data[:, idx]
        if idx in continuous:
            cats = discretizer.cts2cat(col, idx)
            unique_cats, _counts = np.unique(cats, return_counts=True)
            n_cats = len(discretizer.cut_points_[idx]) + 1
        else:
            # np.int was removed from NumPy; use the builtin int instead.
            unique_cats, _counts = np.unique(col.astype(int),
                                             return_counts=True)
            n_cats = len(categories[idx]) if categories is not None else (
                max(unique_cats) + 1)
        counts = np.zeros(shape=(n_cats, ))
        counts[unique_cats] = _counts
        ratios.append(counts / len(col))
    return ratios
Example #21
    def grow(self, data, t_id, level, cur_performance):
        """
        :param data: current data for future tree growth
        :param t_id: tree id
        :param level: level id
        :return: None
        """
        if level >= self.max_depth:
            return
        if len(data) == 0:
            print "?????????????????????? Early Ends ???????????????????????"
            return
        self.tree_depths[t_id] = level
        decision = self.structures[t_id][level]
        structure = tuple(self.structures[t_id][:level + 1])
        cur_selected = self.computed_cache.get(structure, None)
        # as_matrix() was removed from pandas; to_numpy() is the replacement.
        Y = data[[self.target]].to_numpy()
        if not cur_selected:
            for cue in list(data):
                if cue in self.ignore or cue == self.target:
                    continue
                if self.split_method == "MDLP":
                    mdlp = MDLP()
                    X = data[[cue]].to_numpy()
                    X_disc = mdlp.fit_transform(X, Y)
                    X_interval = np.asarray(mdlp.cat2intervals(X_disc, 0))
                    bins = np.unique(X_disc, axis=0)
                    # MDLP returned the whole range as one bin; use the median instead.
                    if len(bins) <= 1:
                        threshold = data[cue].median()
                        for direction in "><":
                            cur_selected = self.eval_point_split(
                                level, cur_selected, cur_performance, data,
                                cue, direction, threshold, decision)
                        continue
                    # print ", ".join([cue, str(bins)+" bins"])
                    for bin_value in bins:  # avoid shadowing the builtin `bin`
                        indexes = np.where(X_disc == bin_value)[0]
                        interval = X_interval[indexes]
                        try:
                            if len(np.unique(interval, axis=0)) != 1:
                                print("???????????????????????????????????????????????????")
                        except Exception:
                            print('ha')
                        interval = interval[0]
                        if interval[0] == float('-inf'):
                            threshold = interval[1]
                            for direction in "><":
                                cur_selected = self.eval_point_split(
                                    level, cur_selected, cur_performance, data,
                                    cue, direction, threshold, decision)
                        elif interval[1] == float('inf'):
                            threshold = interval[0]
                            for direction in "><":
                                cur_selected = self.eval_point_split(
                                    level, cur_selected, cur_performance, data,
                                    cue, direction, threshold, decision)
                        else:
                            cur_selected = self.eval_range_split(
                                level, cur_selected, cur_performance, data,
                                cue, indexes, interval, decision)
                    continue
                elif self.split_method == "percentile":
                    thresholds = set(data[cue].quantile(
                        [x / 20.0 for x in range(1, 20)],
                        interpolation='midpoint'))
                else:
                    thresholds = [data[cue].median()]
                # point split, e.g. median or x% percentiles.
                for threshold in thresholds:
                    for direction in "><":
                        cur_selected = self.eval_point_split(
                            level, cur_selected, cur_performance, data, cue,
                            direction, threshold, decision)

            self.computed_cache[structure] = cur_selected
        self.selected[t_id][level] = cur_selected['rule']
        self.performance_on_train[t_id][level] = cur_selected[
            'metrics'] + get_performance(cur_selected['metrics'])
        self.grow(cur_selected['undecided'], t_id, level + 1,
                  cur_selected['metrics'])
Example #22
def get_raw_bins(column, target):
    transformer = MDLP()
    transformer = transformer.fit(column, target)
    return list(transformer.cut_points_[0])
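A hedged usage sketch for get_raw_bins with toy data (MDLP expects a 2-D feature column):

import numpy as np

col = np.arange(9, dtype=float).reshape(-1, 1)
target = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
# Returns the learned cut points for the single feature, e.g. a threshold
# separating the mostly-0 region from the mostly-1 region.
print(get_raw_bins(col, target))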
Example #23
continuous_features_list = []
for each in file_pmeta:
    # skip ':' header lines and blank lines in the file
    if (each[0] == ':') or (each == '\n'):
        continue
    else:
        attr_count += 1
        attr_temp_name, attr_temp_val = each.split(':', 1)  # hair:0,1,2
        each_attrVal_array.append(attr_temp_val.strip())
        # record the continuous features
        if attr_temp_val.strip() in ("numeric", "real"):
            continuous_features_list.append(attr_count)
        attrname_list.append(attr_temp_name.strip())
continuous_features = np.array(continuous_features_list)
discretizationer = MDLP(continuous_features)  # create a discretizer

attrnum_list_temp = []
temp = []
for each in each_attrVal_array:
    temp.append(list(map(str, each.strip().split(','))))

each_attrVal_array = temp
print(each_attrVal_array)

# After processing, each_attrVal_array looks like:
# [['vhigh', 'high', 'med', 'low'], ['vhigh', 'high', 'med', 'low'], ['2', '3', '4', '5more'],
# ['2', '4', 'more'], ['small', 'med', 'big'], ['low', 'med', 'high'], ['unacc', 'acc', 'good', 'vgood']]
## transform all the instances in filename.pdata from 'str' to 'int'

f_pdata = loaddata.openfile(filePath + Global_V.TESTFILE + '.pdata')
Example #24
import numpy as np
import pandas as pd
from mdlp.discretization import MDLP

train_raw = pd.read_csv("input/train.csv")
test_raw = pd.read_csv("input/test.csv")

# drop NaNs, use only the Age feature itself to estimate bins
train_sur_age = train_raw[['Survived', 'Age']].dropna(axis=0)
survived = train_sur_age['Survived'].values
age = (train_sur_age['Age'].values).reshape(-1, 1)

n_bins = []
age_lim = []
n = 1000
for i in range(n):
    transformer = MDLP(random_state=i, continuous_features=None)
    age_dis = transformer.fit_transform(age, survived)
    age_bins = transformer.cat2intervals(age_dis, 0)
    n_bins.append(len(set(age_bins)))
    if len(set(age_bins)) == 2:
        age_lim.append(age_bins[0])
    elif len(set(age_bins)) > 2:
        print('\t ! more than two bins, n=', len(set(age_bins)))

print('* estimated N bins:', set(n_bins))
print('\t mean', np.mean(1. * np.array(n_bins)))
print('* Age thresholds, frequencies')
lim_val = np.array(age_lim)[:, 0]

sum_not_inf = 0
for val_i in set(lim_val):
    print('\t', val_i, (1. * sum(lim_val == val_i)) / n)
plt.figure()
plot_confusion_matrix(cm_galaxy_test_k3u,
                      classes=['Smooth', 'Spiral'],
                      normalize=True,
                      title='Confusion matrix, with normalization with k:3')
plt.show()
# In[33]:
print("       Bayes Naif models with hold-out set ")

# Scale data for train, validation (hold out), and test
# first method of discretization using MDLP
from mdlp.discretization import MDLP
mdlp = MDLP()
Xtrain_galaxy_MDLP = mdlp.fit_transform(X_train_galaxy, Y_train_galaxy)
# transform() only needs the features; labels are not used after fitting.
Xtest_galaxy_MDLP = mdlp.transform(X_test_galaxy)
Xvalid_galaxy_MDLP = mdlp.transform(X_valid_galaxy)

# In[33]:
# Second preprocessing method: MinMaxScaler (scaling rather than discretization)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
Xtrain_galaxy_unsupervised = scaler.fit_transform(X_train_galaxy)
Xtest_galaxy_unsupervised = scaler.transform(X_test_galaxy)
Xvalid_galaxy_unsupervised = scaler.transform(X_valid_galaxy)
# In[33]:

# Gaussian naive Bayes with 2 different parameters, i.e.
# 1. priors = probability of each class
X_train_galaxy, X_valid_galaxy, Y_train_galaxy, Y_valid_galaxy = train_test_split(
    X_data_galaxy,
    Y_data_galaxy,
    test_size=0.4,
    random_state=0,
    shuffle=True,
    stratify=Y_data_galaxy)
X_test_galaxy, X_valid_galaxy, Y_test_galaxy, Y_valid_galaxy = train_test_split(
    X_valid_galaxy,
    Y_valid_galaxy,
    test_size=0.5,
    random_state=0,
    shuffle=True,
    stratify=Y_valid_galaxy)
# In[30]:

from sklearn.metrics import accuracy_score

# =============================================================================
# from sklearn.preprocessing import StandardScaler
# Xtrain_galaxy_s=X_train_galaxy
# Xtest_galaxy_s=X_test_galaxy
# Xvalid_galaxy_s=X_valid_galaxy
# =============================================================================
from mdlp.discretization import MDLP
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
mdlp = MDLP()
conv_X = mdlp.fit_transform(X, y)

def get_discretizer(method='mdlp', *args, **kwargs):
    if method == 'mdlp':
        return MDLP(*args, **kwargs)
    else:
        raise ValueError("Not supporting method %s" % method)
Example #29
# coding=utf-8
import numpy as np
from mdlp.discretization import MDLP
from sklearn.datasets import load_iris

column = np.array([1, 2])
transformer = MDLP(column)  # discretize only columns 1 and 2
iris = load_iris()
X, y = iris.data, iris.target
print(y)
print(type(X), type(y))

conv_X = transformer.fit_transform(X, y)
print(conv_X)

di = transformer.cut_points_
print(di)

for each in di:
    print(len(di[each]))
Example #30
df['Age'] = df['Age'].fillna(age_mean)
df['Embarked'] = df['Embarked'].fillna(embark_mode)
df['Cabin'] = df['Cabin'].fillna("U")

df['Title'] = df['Name'].map(lambda x: substring_exist(x, TITLE_LIST))

df['Title'] = df.apply(replace_titles, axis=1)
df['Embarked'] = df.apply(replace_embark, axis=1)
df['Deck'] = df['Cabin'].map(lambda x: substring_exist(x, CABIN_LIST))
df['Deck'] = df.apply(replace_deck, axis=1)

df['Family_Size'] = df['SibSp'] + df['Parch']

df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)

transformer = MDLP()
X_age = df["Age"].to_numpy().reshape(-1, 1)
# Keep the target 1-D; a column-vector target only triggers shape warnings.
y_age = df["Survived"].to_numpy()

disc = transformer.fit_transform(X_age, y_age)
df["Age_disc"] = disc.ravel()

df['Sex'] = df.apply(replace_sex, axis=1)

df["Pclass"] = df["Pclass"].map(lambda x: 1 / x)

df.to_csv("./data/train_neat.csv")

print(df.columns)