def bayesian_rule_list(X_train, y_train, X_test, y_test):
    import pysbrl
    from mdlp.discretization import MDLP
    from sklearn import preprocessing

    # First, one-hot encode the categorical features
    X_train, X_test = apply_one_hot_encoding(X_train, X_test)

    # Then convert the classes to integers
    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)

    # Then discretize the features
    transformer = MDLP()
    X_train = transformer.fit_transform(X_train, y_train)
    X_test = transformer.transform(X_test)

    brl = pysbrl.BayesianRuleList()
    brl.fit(X_train, y_train)
    print(brl)

    # The complexity is the number of split points + the number of extra conditions
    # (i.e. "if x1 > 0 and x2 = 1 then ..." counts as 2, not 1); for this reason we
    # do not use brl.n_rules.
    brl_str = str(brl)
    brl_complexity = brl_str.count("IF") + brl_str.count("AND")

    training_recreations = brl.predict(X_train)
    brl_training_recreating_pct = scorer(training_recreations, y_train) * 100
    testing_recreations = brl.predict(X_test)
    brl_testing_recreating_pct = scorer(testing_recreations, y_test) * 100

    return brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity
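# Hedged usage sketch for bayesian_rule_list(): apply_one_hot_encoding() and
# scorer() are helpers assumed to exist elsewhere in this codebase, pysbrl must
# be installed, and the breast-cancer data is illustrative only.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target,
                                          test_size=0.2, random_state=0)
train_pct, test_pct, complexity = bayesian_rule_list(X_tr, y_tr, X_te, y_te)
print('train %.1f%%, test %.1f%%, complexity %d'
      % (train_pct, test_pct, complexity))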
class MDLPDiscretizer(BaseDiscretizer):
    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        if labels is None:
            raise ValueError('Labels must not be None when using '
                             'MDLPDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        self.transformer = MDLP()
        discretized_data = self.transformer.fit_transform(data, labels)
        bins = []
        for i in range(len(set(labels))):
            intervals = set(self.transformer.cat2intervals(discretized_data, i))
            feature_interval = []
            # Pop every interval and collect both of its bounds
            for _ in range(len(intervals)):
                interval = intervals.pop()
                feature_interval.append(interval[0])
                feature_interval.append(interval[1])
            feature_interval = set(feature_interval)
            feature_interval.discard(float('inf'))
            feature_interval.discard(float('-inf'))
            bins.append(np.array(list(feature_interval)))
        return bins
def all_entropies(df1, total):
    """Compute the class entropy of every interval produced by MDLP."""
    def add_entropy(counts):
        # cal_entropy expects at least two class counts; pad with a zero count
        if len(counts) <= 1:
            counts.append(0)
        entropies.append(cal_entropy(counts, total))

    mdlp = MDLP()
    # Fit only for the cut points; the transformed output is not needed
    mdlp.fit_transform(df1[df1.columns[:-1]].values, df1[df1.columns[-1]].values)
    entropies = []
    for x, y in enumerate(mdlp.cut_points_):
        if len(y) > 1:
            for j in range(len(y)):
                if j == 0:
                    add_entropy(df1[df1[x] <= y[j]]['class']
                                .value_counts().values.tolist())
                if j == len(y) - 1:
                    add_entropy(df1[df1[x] > y[j]]['class']
                                .value_counts().values.tolist())
                if j != len(y) - 1:
                    add_entropy(df1[(df1[x] > y[j]) & (df1[x] <= y[j + 1])]['class']
                                .value_counts().values.tolist())
        if len(y) == 1:
            add_entropy(df1[df1[x] <= y[0]]['class'].value_counts().values.tolist())
            add_entropy(df1[df1[x] > y[0]]['class'].value_counts().values.tolist())
        if len(y) == 0:
            add_entropy(df1['class'].value_counts().values.tolist())
    return sorted(entropies)
def discMdlp(_df):
    featureVals = [x for x in _df if x != 'Class']
    transformer = MDLP()
    discret = transformer.fit_transform(_df[featureVals], _df['Class'])
    nFrame = pd.DataFrame(data=discret, columns=featureVals)
    nFrame.loc[:, 'Class'] = pd.Series(_df['Class'])
    return nFrame
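# Quick check of discMdlp() on a toy frame; the column names and values are
# illustrative only, and pandas plus the MDLP import above are assumed.
import pandas as pd

toy = pd.DataFrame({'f1': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                    'f2': [1.0, 2.0, 1.5, 2.5, 3.0, 3.5],
                    'Class': [0, 0, 0, 1, 1, 1]})
print(discMdlp(toy))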
def test_fit_transform_translate():
    expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)
    X = np.arange(9, dtype=float).reshape(-1, 1)
    y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
    transformed = MDLP(shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, transformed)

    # Translating the data does not affect the discretization result
    translated = MDLP(shuffle=False).fit_transform(X - 5, y)
    assert_array_equal(expected, translated)
def classifier(args):
    dataset_info = datasets_info[args.data_type]
    df = pd.read_csv(dataset_info['path'])
    for drop_col in dataset_info['drop_columns']:
        df = df.drop(columns=df.columns[drop_col])
    y = df[df.columns[dataset_info['class_column']]]
    X = df.drop(columns=df.columns[dataset_info['class_column']])

    if args.plot:
        sns.pairplot(df, hue=df.columns[dataset_info['class_column']])
        plt.show()

    # Discretize values before training
    if args.discretization_bins > 0:
        if args.discretization_mode == DISC_MDLP:
            # NOTE: MDLP returns a numpy array, so the per-feature plots below
            # assume the binned (DataFrame) branch
            transformer = MDLP()
            X = transformer.fit_transform(X, y)
        else:
            for column in X:
                bins = discretization(args.discretization_mode, X[column],
                                      args.discretization_bins)
                X[column] = bins

    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42)

    # Square-ish histogram plots, one per feature of the training set
    if args.plot:
        plt.figure(figsize=(10, 3))
        for feature in range(X_train.shape[1]):
            plt.subplot(1, len(X_train.columns), feature + 1)
            sns.distplot(X_train.values[:, feature])
        plt.show()

    # Fit a naive Bayes classifier to the training set
    # classifier = GaussianNB()
    classifier = MultinomialNB(alpha=1.0)
    classifier.fit(X_train, y_train)
    cross_validation(classifier, X, y)

    # Predict the test set results
    y_pred = classifier.predict(X_test)
    print(y_pred)
    evaluation(y_test, y_pred, args)
def get_discretizer(x, y, continuous_features=None, seed=None, min_depth=0) -> MDLP:
    discretizer = MDLP(random_state=seed, min_depth=min_depth)
    if continuous_features is not None:
        # Convert a boolean mask into an array of column indices
        # (np.bool is a deprecated alias of the builtin bool)
        if continuous_features.dtype == bool:
            continuous_features = np.arange(
                len(continuous_features))[continuous_features]
    discretizer.fit(x, y, continuous_features)
    return discretizer
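# Hedged usage sketch for get_discretizer(): a boolean mask marks the
# continuous columns. It assumes the MDLP variant above, whose fit() accepts
# the continuous feature indices; the toy data is illustrative only.
import numpy as np

x = np.array([[0.1, 0], [0.2, 1], [0.3, 0], [0.4, 1]])
y = np.array([0, 0, 1, 1])
mask = np.array([True, False])  # only column 0 is continuous
discretizer = get_discretizer(x, y, continuous_features=mask, seed=42)
x_cat = discretizer.transform(x)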
def test_coerce_list():
    expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)
    X = [[i] for i in range(9)]
    y = [0, 0, 0, 0, 1, 0, 1, 1, 1]
    transformed = MDLP(shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, transformed)

    np_X = np.arange(9).reshape(-1, 1)
    np_y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
    np_transformed = MDLP(shuffle=False).fit_transform(np_X, np_y)
    assert_array_equal(expected, np_transformed)
def test_sparse_input():
    expected = [
        [0, 0],
        [0, 0],
        [1, 0],
        [2, 0],
    ]
    dense_X = np.array([[0.1, 0.1],
                        [0.2, 0.4],
                        [0.3, 0.2],
                        [0.4, 0.3]])
    X = scipy.sparse.csr_matrix(dense_X)
    y = np.array([0, 0, 1, 2])
    disc = MDLP(shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc.toarray())
def test_drop_collapsed_features_sparse():
    expected = [
        [0, 0],
        [0, 0],
        [1, 1],
        [2, 2],
    ]
    dense_X = np.array([[0.1, 0.1, 0.1, 0.1, 0.1],
                        [0.4, 0.2, 0.4, 0.2, 0.4],
                        [0.2, 0.3, 0.2, 0.3, 0.2],
                        [0.3, 0.4, 0.3, 0.4, 0.3]])
    X = scipy.sparse.csr_matrix(dense_X)
    y = np.array([0, 0, 1, 2])
    disc = MDLP(drop_collapsed_features=True, shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc.toarray())
def test_BayesianRuleList2():
    dataset = load_breast_cancer()
    x, y = dataset['data'], dataset['target']
    feature_names = dataset['feature_names']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=42)
    discretizer = MDLP(random_state=42).fit(x_train, y_train)
    x_train_cat = discretizer.transform(x_train)
    category_names = compute_intervals(discretizer)
    rule_list = BayesianRuleList(seed=1, feature_names=feature_names,
                                 category_names=category_names, verbose=2)
    rule_list.fit(x_train_cat, y_train)
    print(rule_list)
    x_test_cat = discretizer.transform(x_test)
    print('acc: %.4f' % rule_list.score(x_test_cat, y_test))
def __init__(
    self,
    mdlp_args: Dict[str, Any] = None,
):
    """
    This method of discretisation applies MDLP to discretise the data.

    Args:
        mdlp_args: keyword arguments passed through to
            `mdlp.discretization.MDLP`, e.g.
            min_depth: the minimum depth of the interval splitting,
            min_split: the minimum size to split a bin,
            dtype: the type of the array returned by the `transform()` method.

    Raises:
        ImportError: if mdlp-discretization is not installed successfully
    """
    super().__init__()
    mdlp_args = mdlp_args or {"min_depth": 0, "min_split": 1e-3, "dtype": int}
    self.mdlp_args = mdlp_args
    self.feat_names = None
    self.map_feat_transformer = {}
    if MDLP is None:
        raise ImportError(
            "mdlp-discretisation was not installed and imported successfully"
        )
    self.mdlp = MDLP(**mdlp_args)
def discretizer2json(discretizer: MDLP, data=None) -> List[dict]:
    # `inf` is assumed imported at module level (e.g. from math import inf)
    cut_points = discretizer.cut_points_  # type: list
    category_intervals = [None] * len(cut_points)
    cut_points = [None if cut_point is None else cut_point
                  for cut_point in cut_points]
    maxs = discretizer.maxs_
    mins = discretizer.mins_
    for i, _cut_points in enumerate(cut_points):
        if _cut_points is None:
            continue
        cats = np.arange(len(_cut_points) + 1)
        intervals = [
            [None if low == -inf else low, None if high == inf else high]
            for low, high in discretizer.cat2intervals(cats, i)
        ]
        category_intervals[i] = intervals
    return [
        {
            'cutPoints': cut_points[i],
            'intervals': category_intervals[i],
            'max': maxs[i],
            'min': mins[i],
        }
        for i in range(len(cut_points))
    ]
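# Hedged usage sketch for discretizer2json(): it assumes the same MDLP variant
# as above (cut_points_ stored as a list, plus mins_/maxs_ attributes); the
# iris data and the json serialization are illustrative only.
import json
import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
disc = MDLP().fit(iris.data, iris.target)
summary = discretizer2json(disc)
# each entry holds the cut points and interval bounds of one feature
print(json.dumps(summary, default=lambda o: np.asarray(o).tolist(), indent=2))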
class SupervisedDiscretizationStrategy(object):
    """
    A class used for supervised data discretization.
    """

    def __init__(self):
        self.transformer = MDLP()

    def discretize(self, data_set, validation_size, nb_bins=None):
        """
        Discretize continuous attributes using the MDLP method.

        Args:
            data_set: The data set containing continuous data.
            validation_size: The validation size of the newly created
                discretized data set.
            nb_bins: Unused; MDLP chooses the number of bins itself.

        Returns:
            discretized_dataset: A DataSet object containing discretized data.
        """
        # Create strategy object to further create the discretized data set.
        galaxy_dataset_feature_strategy = GalaxyDataSetFeatureStrategy()

        # Get data from the training set.
        X_train = data_set.train.get_features
        y_train = data_set.train.get_labels

        # Supervised discretization of the training set using MDLP.
        X_train_discretized = self.transformer.fit_transform(X=X_train, y=y_train)

        # Get data from the validation set.
        X_valid = data_set.valid.get_features
        y_valid = data_set.valid.get_labels

        # Apply the cut points learned on the training set to the validation set.
        X_valid_discretized = self.transformer.transform(X=X_valid)

        # Merge the training and validation data.
        X = np.append(X_train_discretized, X_valid_discretized, axis=0)
        y = np.append(y_train, y_valid, axis=0)

        # Create a new data set.
        discretized_dataset = galaxy_dataset_feature_strategy.create_datasets(
            X, y, validation_size)
        return discretized_dataset
def num2cate_fit(df, min_depth=2):
    '''
    Args:
        df (pandas DataFrame): the last column must be the class, int 0 or 1
        min_depth (int): the minimum depth of the interval splitting. Overrides
            the MDLP stopping criterion. If the entropy at a given interval is
            found to be zero before `min_depth`, the algorithm will stop.

    Returns:
        mdlp (MDLP instance): a fitted transformer that can be used to
            transform samples
    '''
    Y = df.iloc[:, -1].values
    continuous_features = df.iloc[:, :-1].select_dtypes(
        include=['int64', 'float64']).columns.tolist()
    continuous_features.sort()  # ensure the feature order matches between fit and transform
    X = df[continuous_features].values
    mdlp = MDLP(min_depth=min_depth)
    mdlp.fit(X, Y)  # X and Y should be numpy arrays
    return mdlp
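# Hypothetical companion to num2cate_fit(): a sketch of the transform step,
# assuming the incoming frame holds only the feature columns (no class column).
def num2cate_transform(df, mdlp):
    continuous_features = df.select_dtypes(
        include=['int64', 'float64']).columns.tolist()
    continuous_features.sort()  # same column order as in num2cate_fit
    return mdlp.transform(df[continuous_features].values)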
def test_fit_transform_scale():
    expected = [
        [0, 0],
        [0, 0],
        [1, 0],
        [2, 0],
    ]
    X = np.array([[0.1, 0.1],
                  [0.2, 0.4],
                  [0.3, 0.2],
                  [0.4, 0.3]])
    y = np.array([0, 0, 1, 2])
    for i in range(10):
        scaled_disc = MDLP(shuffle=False).fit_transform(X / 10**i, y)
        assert_array_equal(expected, scaled_disc)
def test_drop_collapsed_features_dense():
    expected = [
        [0, 0],
        [0, 0],
        [1, 1],
        [2, 2],
    ]
    X = np.array([[0.1, 0.1, 0.1, 0.1, 0.1],
                  [0.4, 0.2, 0.4, 0.2, 0.4],
                  [0.2, 0.3, 0.2, 0.3, 0.2],
                  [0.3, 0.4, 0.3, 0.4, 0.3]])
    y = np.array([0, 0, 1, 2])
    disc = MDLP(drop_collapsed_features=True, shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc)
def test_multiprocessing():
    """Only tests that the functionality is not affected, not that parallel
    processing actually takes place.
    """
    expected = [
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 2, 0, 2, 0],
    ]
    X = np.array([[0.1, 0.1, 0.1, 0.1, 0.1],
                  [0.4, 0.2, 0.4, 0.2, 0.4],
                  [0.2, 0.3, 0.2, 0.3, 0.2],
                  [0.3, 0.4, 0.3, 0.4, 0.3]])
    y = np.array([0, 0, 1, 2])
    disc = MDLP(n_jobs=3, shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc)
def get_category_ratios(
        data, discretizer: MDLP,
        categories: List[List[str]] = None) -> List[List[float]]:
    continuous = set(discretizer.continuous_features)
    ratios = []
    for idx in range(data.shape[1]):
        col = data[:, idx]
        if idx in continuous:
            cats = discretizer.cts2cat(col, idx)
            unique_cats, _counts = np.unique(cats, return_counts=True)
            n_cats = len(discretizer.cut_points_[idx]) + 1
        else:
            # np.int is a deprecated alias of the builtin int
            unique_cats, _counts = np.unique(col.astype(int), return_counts=True)
            n_cats = len(categories[idx]) if categories is not None else (
                max(unique_cats) + 1)
        counts = np.zeros(shape=(n_cats,))
        counts[unique_cats] = _counts
        ratios.append(counts / len(col))
    return ratios
def grow(self, data, t_id, level, cur_performance):
    """
    :param data: current data for future tree growth
    :param t_id: tree id
    :param level: level id
    :return: None
    """
    if level >= self.max_depth:
        return
    if len(data) == 0:
        print("?????????????????????? Early Ends ???????????????????????")
        return
    self.tree_depths[t_id] = level
    decision = self.structures[t_id][level]
    structure = tuple(self.structures[t_id][:level + 1])
    cur_selected = self.computed_cache.get(structure, None)
    Y = data.as_matrix(columns=[self.target])
    if not cur_selected:
        for cue in list(data):
            if cue in self.ignore or cue == self.target:
                continue
            if self.split_method == "MDLP":
                mdlp = MDLP()
                X = data.as_matrix(columns=[cue])
                X_disc = mdlp.fit_transform(X, Y)
                X_interval = np.asarray(mdlp.cat2intervals(X_disc, 0))
                bins = np.unique(X_disc, axis=0)
                if len(bins) <= 1:
                    # MDLP returned the whole range as one bin; use the median instead.
                    threshold = data[cue].median()
                    for direction in "><":
                        cur_selected = self.eval_point_split(
                            level, cur_selected, cur_performance, data, cue,
                            direction, threshold, decision)
                    continue
                # print(", ".join([cue, str(bins) + " bins"]))
                for bin in bins:
                    indexes = np.where(X_disc == bin)[0]
                    interval = X_interval[indexes]
                    try:
                        if len(np.unique(interval, axis=0)) != 1:
                            print("???????????????????????????????????????????????????")
                    except Exception:
                        print('ha')
                    interval = interval[0]
                    if interval[0] == float('-inf'):
                        threshold = interval[1]
                        for direction in "><":
                            cur_selected = self.eval_point_split(
                                level, cur_selected, cur_performance, data, cue,
                                direction, threshold, decision)
                    elif interval[1] == float('inf'):
                        threshold = interval[0]
                        for direction in "><":
                            cur_selected = self.eval_point_split(
                                level, cur_selected, cur_performance, data, cue,
                                direction, threshold, decision)
                    else:
                        cur_selected = self.eval_range_split(
                            level, cur_selected, cur_performance, data, cue,
                            indexes, interval, decision)
                continue
            elif self.split_method == "percentile":
                thresholds = set(data[cue].quantile(
                    [x / 20.0 for x in range(1, 20)], interpolation='midpoint'))
            else:
                thresholds = [data[cue].median()]
            # Point split, e.g. the median or the x% percentiles.
            for threshold in thresholds:
                for direction in "><":
                    cur_selected = self.eval_point_split(
                        level, cur_selected, cur_performance, data, cue,
                        direction, threshold, decision)
        self.computed_cache[structure] = cur_selected
    self.selected[t_id][level] = cur_selected['rule']
    self.performance_on_train[t_id][level] = cur_selected['metrics'] + \
        get_performance(cur_selected['metrics'])
    self.grow(cur_selected['undecided'], t_id, level + 1, cur_selected['metrics'])
def get_raw_bins(column, target):
    transformer = MDLP()
    transformer = transformer.fit(column, target)
    return list(transformer.cut_points_[0])
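# Hypothetical follow-up for get_raw_bins(): the returned cut points can be
# turned into closed bins for pandas.cut by padding with -inf/+inf. The helper
# name is illustrative, not part of the original code.
import numpy as np
import pandas as pd

def apply_raw_bins(column_values, cut_points):
    edges = [-np.inf] + list(cut_points) + [np.inf]
    return pd.cut(np.ravel(column_values), bins=edges, labels=False)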
continuous_features_list = []
for each in file_pmeta:
    # deal with the ':' and '\n' lines in the file
    if (each[0] == ':') or (each == '\n'):  # and (each[1] == ':'):
        continue
    else:
        attr_count += 1
        attr_temp_name, attr_temp_val = each.split(':', 1)  # e.g. hair:0,1,2
        each_attrVal_array.append(attr_temp_val.strip())
        if attr_temp_val.strip() == "numeric" or attr_temp_val.strip() == "real":
            # record the continuous features
            continuous_features_list.append(attr_count)
        attrname_list.append(attr_temp_name.strip())

continuous_features = np.array(continuous_features_list)
discretizationer = MDLP(continuous_features)  # create a discretizer

attrnum_list_temp = []
temp = []
for each in each_attrVal_array:
    temp.append(list(map(str, each.strip().split(','))))
each_attrVal_array = temp
print(each_attrVal_array)
# After processing, each_attrVal_array looks like:
# [['vhigh', 'high', 'med', 'low'], ['vhigh', 'high', 'med', 'low'], ['2', '3', '4', '5more'],
#  ['2', '4', 'more'], ['small', 'med', 'big'], ['low', 'med', 'high'], ['unacc', 'acc', 'good', 'vgood']]

# transform all the instances in filename.pdata from 'str' to 'int'
f_pdata = loaddata.openfile(filePath + Global_V.TESTFILE + '.pdata')
import numpy as np
import pandas as pd
from mdlp.discretization import MDLP

train_raw = pd.read_csv("input/train.csv")
test_raw = pd.read_csv("input/test.csv")

# Drop NaNs; use only the Age feature itself to estimate bins
train_sur_age = train_raw[['Survived', 'Age']].dropna(axis=0)
survived = train_sur_age['Survived'].values
age = train_sur_age['Age'].values.reshape(-1, 1)

n_bins = []
age_lim = []
n = 1000
for i in range(n):
    transformer = MDLP(random_state=i, continuous_features=None)
    age_dis = transformer.fit_transform(age, survived)
    age_bins = transformer.cat2intervals(age_dis, 0)
    n_bins.append(len(set(age_bins)))
    if len(set(age_bins)) == 2:
        age_lim.append(age_bins[0])
    elif len(set(age_bins)) > 2:
        print('\t ! more than two bins, n=', len(set(age_bins)))

print('* estimated N bins:', set(n_bins))
print('\t mean', np.mean(1. * np.array(n_bins)))
print('* Age thresholds, frequencies')
lim_val = np.array(age_lim)[:, 0]
sum_not_inf = 0
for val_i in set(lim_val):
    print('\t', val_i, (1. * sum(lim_val == val_i)) / n)
                      classes=['Smooth', 'Spiral'],
                      title='Confusion matrix, without normalization with k:3')
plt.figure()
plot_confusion_matrix(cm_galaxy_test_k3u,
                      classes=['Smooth', 'Spiral'],
                      normalize=True,
                      title='Confusion matrix, with normalization with k:3')
plt.show()

# In[33]:

print(" Bayes Naif models with hold-out set ")
# Discretize data for train, validation (hold-out), and test

# First method of discretization: supervised, using MDLP
from mdlp.discretization import MDLP
mdlp = MDLP()
Xtrain_galaxy_MDLP = mdlp.fit_transform(X_train_galaxy, Y_train_galaxy)
# transform() reuses the cut points learned on the training data; it takes no labels
Xtest_galaxy_MDLP = mdlp.transform(X_test_galaxy)
Xvalid_galaxy_MDLP = mdlp.transform(X_valid_galaxy)

# In[33]:

# Second method of discretization: unsupervised, using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
Xtrain_galaxy_unsupervised = scaler.fit_transform(X_train_galaxy)
Xtest_galaxy_unsupervised = scaler.transform(X_test_galaxy)
Xvalid_galaxy_unsupervised = scaler.transform(X_valid_galaxy)

# In[33]:

# Gaussian naive Bayes with 2 different parameters, i.e.
# 1. priors = probability of each class
X_train_galaxy, X_valid_galaxy, Y_train_galaxy, Y_valid_galaxy = train_test_split(
    X_data_galaxy, Y_data_galaxy, test_size=0.4, random_state=0, shuffle=True,
    stratify=Y_data_galaxy)
X_test_galaxy, X_valid_galaxy, Y_test_galaxy, Y_valid_galaxy = train_test_split(
    X_valid_galaxy, Y_valid_galaxy, test_size=0.5, random_state=0, shuffle=True,
    stratify=Y_valid_galaxy)

# In[30]:

from sklearn.metrics import accuracy_score
# =============================================================================
# from sklearn.preprocessing import StandardScaler
# Xtrain_galaxy_s = X_train_galaxy
# Xtest_galaxy_s = X_test_galaxy
# Xvalid_galaxy_s = X_valid_galaxy
# =============================================================================
from mdlp.discretization import MDLP
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target
mdlp = MDLP()
conv_X = mdlp.fit_transform(X, y)
def get_discretizer(method='mdlp', *args, **kwargs):
    if method == 'mdlp':
        return MDLP(*args, **kwargs)
    else:
        raise ValueError("Unsupported method %s" % method)
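# Minimal usage of the factory above; the iris data is illustrative only.
from sklearn.datasets import load_iris

iris = load_iris()
disc = get_discretizer('mdlp')
X_cat = disc.fit_transform(iris.data, iris.target)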
# coding=utf-8
import numpy as np
from mdlp.discretization import MDLP
from sklearn.datasets import load_iris

column = np.array([1, 2])
transformer = MDLP(column)  # only discretize columns 1 and 2
iris = load_iris()
X, y = iris.data, iris.target
print(y)
print(type(X), type(y))
conv_X = transformer.fit_transform(X, y)
print(conv_X)
di = transformer.cut_points_
print(transformer.cut_points_)
for each in di:
    print(len(di[each]))
df['Age'] = df['Age'].fillna(age_mean)
df['Embarked'] = df['Embarked'].fillna(embark_mode)
df['Cabin'] = df['Cabin'].fillna("U")
df['Title'] = df['Name'].map(lambda x: substring_exist(x, TITLE_LIST))
df['Title'] = df.apply(replace_titles, axis=1)
df['Embarked'] = df.apply(replace_embark, axis=1)
df['Deck'] = df['Cabin'].map(lambda x: substring_exist(x, CABIN_LIST))
df['Deck'] = df.apply(replace_deck, axis=1)
df['Family_Size'] = df['SibSp'] + df['Parch']
df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)

# Discretize Age with MDLP, supervised by Survived
transformer = MDLP()
X_age = df["Age"].to_numpy().reshape((-1, 1))
y_age = df["Survived"].to_numpy()  # MDLP expects a 1-D label array
disc = transformer.fit_transform(X_age, y_age)
df["Age_disc"] = disc

df['Sex'] = df.apply(replace_sex, axis=1)
df["Pclass"] = df["Pclass"].map(lambda x: 1 / x)
df.to_csv("./data/train_neat.csv")
print(df.columns)
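# Optional follow-up sketch: recover the human-readable Age intervals for the
# discretized column via cat2intervals (feature index 0, the only column that
# was discretized above).
age_intervals = transformer.cat2intervals(disc, 0)
print(set(age_intervals))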