def test_numerical_default_fit_transform():
    """fit_transform with default settings returns mean-WoE-encoded values."""
    optb = MulticlassOptimalBinning()
    transformed = optb.fit_transform(x, y, metric="mean_woe")

    expected_head = [-0.00074357, 0.48973998, 0.02189459,
                     -0.00074357, 0.02189459]
    assert transformed[:5] == approx(expected_head, rel=1e-5)
def test_numerical_default_splits():
    """Default binning is optimal and finds the expected split points.

    Renamed from ``test_numerical_default``: three tests in this module
    shared that name, so pytest only collected the last definition and
    this one was silently never run.
    """
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
    assert optb.splits == approx(
        [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001], rel=1e-6)
def test_numerical_default_analysis():
    """Default binning yields expected splits, JS divergence and quality score.

    Renamed from ``test_numerical_default``: three tests in this module
    shared that name, so pytest only collected the last definition and
    this one was silently never run.
    """
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
    assert optb.splits == approx(
        [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001], rel=1e-6)

    # analysis() requires the table to be built first
    optb.binning_table.build()
    optb.binning_table.analysis()

    assert optb.binning_table.js == approx(0.10989515, rel=1e-6)
    assert optb.binning_table.quality_score == approx(0.05279822, rel=1e-6)
def test_numerical_default():
    """End-to-end default run: fit, analyse the binning table, save plots."""
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"

    expected_splits = [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001]
    assert optb.splits == approx(expected_splits, rel=1e-6)

    # analysis() requires the table to be built first
    optb.binning_table.build()
    optb.binning_table.analysis()

    assert optb.binning_table.js == approx(0.10989515, rel=1e-6)
    assert optb.binning_table.quality_score == approx(0.05279822, rel=1e-6)

    # exercise plotting with and without the special/missing bins
    optb.binning_table.plot(savefig="test_multiclass_binning.png")
    optb.binning_table.plot(
        add_special=False,
        savefig="test_multiclass_binning_no_special.png")
    optb.binning_table.plot(
        add_missing=False,
        savefig="test_multiclass_binning_no_missing.png")
def test_default_transform_multiclass():
    """BinningProcess output matches a standalone MulticlassOptimalBinning."""
    data = load_wine()
    variable_names = data.feature_names
    X, y = data.data, data.target

    process = BinningProcess(variable_names)
    process.fit(X, y)
    X_transform = process.transform(X)

    # multiclass target must produce MulticlassOptimalBinning per variable
    binned = process.get_binned_variable(variable_names[0])
    assert isinstance(binned, MulticlassOptimalBinning)

    # transforming one column independently must agree with the process
    x = X[:, 5]
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)
    assert optb.transform(x) == approx(X_transform[:, 5], rel=1e-6)
def test_numerical_default_solvers():
    """MIP (BOP backend) and CP solvers agree on the optimal splits."""
    expected_splits = [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001]

    solvers = [
        MulticlassOptimalBinning(solver="mip", mip_solver="bop"),
        MulticlassOptimalBinning(solver="cp"),
    ]
    for optb in solvers:
        optb.fit(x, y)
        assert optb.status == "OPTIMAL"
        assert optb.splits == approx(expected_splits, rel=1e-6)
def test_numerical_default_transform():
    """transform() requires a fitted estimator and encodes with mean WoE."""
    optb = MulticlassOptimalBinning()

    # calling transform before fit must raise
    with raises(NotFittedError):
        optb.transform(x)

    optb.fit(x, y)
    encoded = optb.transform([0.3, 2.1, 2.5, 3], metric="mean_woe")
    assert encoded == approx(
        [0.48973998, 0.48973998, -0.00074357, 0.02189459], rel=1e-5)
def splitData(self, data, availableAttributes, numericAttrBinning,
              repeatAttributes, minNumRecordsLeafNode):
    '''Given a list of available attributes chooses a split that has the
    largest information gain (classification trees) or standard-deviation
    reduction (regression trees).

    Parameters:
        data: dataframe with the target variable in the last column.
        availableAttributes: attribute (column) names to consider.
        numericAttrBinning: if True, numeric attributes are split into
            multiple bins via optimal binning; otherwise a binary
            threshold split is used.
        repeatAttributes: if True, previously used numeric ranges are
            stitched together with the parent node's range.
        minNumRecordsLeafNode: minimum number of records any resulting
            subset must contain for the split to be considered.

    Returns the chosen attribute, the subsets of the dataframe resulting
    from the split, the best split threshold (binary numeric splits only)
    and the ranges for each subset (binned numeric splits only), plus the
    best gain.

    Bug fix: the "too small subsets" guard previously read
    ``any(len(s) for s in subsets if len(s) < minNumRecordsLeafNode)``,
    which is falsy when an undersized subset is *empty* (len 0), so
    splits producing empty leaves were not skipped. The condition is now
    tested directly.
    '''
    bestGain = -np.inf
    bestSubsets = None
    splitAttrib = None
    bestSplitThreshold = None
    bestRanges = None
    for attr in availableAttributes:
        # if attr is a discrete (categorical) attribute
        if str(data[attr].dtype) == 'object' or str(
                data[attr].dtype) == 'category':
            if len(set(data[attr])) == 1:
                continue  # skip if only one category
            grouped = data.groupby(attr)
            # get values for binning; target is assumed to be the last column
            x = data[attr].values
            y = data.iloc[:, -1].values
            # type of binning is determined by tree type
            if self.treeType == 'classification':
                optb = OptimalBinning(dtype='categorical', min_n_bins=2,
                                      max_n_bins=4)
            else:
                optb = ContinuousOptimalBinning(dtype='categorical',
                                                min_n_bins=2, max_n_bins=4,
                                                min_prebin_size=0.001)
            optb.fit(x, y)
            binningResultDt = optb.binning_table.build()
            # drop the trailing Special/Missing/Totals rows of the table
            bins = binningResultDt['Bin'].head(-3)
            # create a subset for each bin if the target is binary and
            # there are multiple bins
            if (len(self.classes) == 2 and len(bins) > 1):
                subsets = [
                    pd.concat([grouped.get_group(cat) for cat in binCats])
                    for binCats in bins
                ]
            else:
                # otherwise create a subset per category of the attribute
                subsets = [
                    grouped.get_group(cat) for cat in data[attr].unique()
                ]
            # FIX: test the size condition directly so empty subsets are
            # also rejected
            if any(len(subset) < minNumRecordsLeafNode
                   for subset in subsets):
                continue  # skip if there are too small subsets
            if self.treeType == 'classification':
                infoGain = self.calculateInformationGain(data, subsets)
            else:
                infoGain = self.calculateStandardDeviationReduction(
                    data, subsets)
            if infoGain >= bestGain:
                bestGain = infoGain
                bestSubsets = subsets
                splitAttrib = attr
                bestSplitThreshold = None
                bestRanges = None
        else:  # if attr has numeric values
            onlyOneBin = False
            # get values for binning; target is assumed to be the last column
            x = data[attr].values
            y = data.iloc[:, -1].values
            # type of binning is determined by tree type
            if self.treeType == 'classification':
                optb = MulticlassOptimalBinning(min_n_bins=2, max_n_bins=4)
            else:
                if x.min() == x.max():
                    continue  # constant column cannot be split
                optb = ContinuousOptimalBinning(min_n_bins=2, max_n_bins=4,
                                                min_prebin_size=0.001)
            optb.fit(x, y)
            binningResultDt = optb.binning_table.build()
            bins = binningResultDt['Bin'].head(-3)
            if len(bins) == 1:
                onlyOneBin = True
            # if user enabled numeric attribute binning and there are
            # multiple bins
            if numericAttrBinning is True and onlyOneBin is False:
                # modify range string representation so it can be parsed
                bins.iloc[0] = bins.iloc[0].replace('-inf', "'-inf'")
                bins.iloc[-1] = bins.iloc[-1].replace('inf', "'inf'")
                # create list of tuples for every range
                ranges = [literal_eval(s.replace('[', '(')) for s in bins]
                # replace 'inf' strings with np.inf
                ranges = [(-np.inf, r[1]) if r[0] == '-inf' else
                          ((r[0], np.inf) if r[1] == 'inf' else (r[0], r[1]))
                          for r in ranges]
                # create subsets according to the half-open ranges [lo, hi)
                subsets = [
                    data.loc[(data[attr] >= r[0]) & (data[attr] < r[1])]
                    for r in ranges
                ]
                # FIX: test the size condition directly so empty subsets
                # are also rejected
                if any(len(subset) < minNumRecordsLeafNode
                       for subset in subsets):
                    continue  # skip if there are too small subsets
                if self.treeType == 'classification':
                    infoGain = self.calculateInformationGain(data, subsets)
                else:
                    infoGain = self.calculateStandardDeviationReduction(
                        data, subsets)
                if infoGain >= bestGain:
                    bestGain = infoGain
                    bestSubsets = subsets
                    splitAttrib = attr
                    bestSplitThreshold = None
                    bestRanges = ranges
            else:  # binary split using threshold
                sortedData = data.sort_values(attr)  # sort data by attr
                for i in range(len(sortedData[attr]) - 1):
                    # for each entry (without the last one):
                    # if current and next value of attr are equal - skip
                    if sortedData[attr].iloc[i] == sortedData[attr].iloc[
                            i + 1]:
                        continue
                    # calculate threshold and use it to create two subsets
                    currentThreshold = (sortedData[attr].iloc[i] +
                                        sortedData[attr].iloc[i + 1]) / 2
                    lowerSubset = sortedData[
                        sortedData[attr] <= currentThreshold]
                    higherSubset = sortedData[
                        sortedData[attr] > currentThreshold]
                    if len(lowerSubset) < minNumRecordsLeafNode or len(
                            higherSubset) < minNumRecordsLeafNode:
                        continue  # skip if there are too small subsets
                    if self.treeType == 'classification':
                        infoGain = self.calculateInformationGain(
                            data, [lowerSubset, higherSubset])
                    else:
                        infoGain = self.calculateStandardDeviationReduction(
                            data, [lowerSubset, higherSubset])
                    if infoGain > bestGain:
                        bestGain = infoGain
                        bestSubsets = [lowerSubset, higherSubset]
                        splitAttrib = attr
                        bestSplitThreshold = currentThreshold
                        bestRanges = None
    # fix ranges if repeating attributes: extend the outermost bins to the
    # parent node's range so consecutive splits stay contiguous
    if bestRanges and repeatAttributes:
        parentRanges = self.numericAttrRanges[splitAttrib][0][
            self.numericAttrRanges[splitAttrib][1]]
        checkValue = data[splitAttrib].iloc[0]
        parentRange = next(rng for rng in parentRanges
                           if checkValue >= rng[0] and checkValue < rng[1])
        bestRanges[0] = (parentRange[0], bestRanges[0][1])
        bestRanges[-1] = (bestRanges[-1][0], parentRange[-1])
        self.numericAttrRanges[splitAttrib][1] += 1
        if self.numericAttrRanges[splitAttrib][1] in range(
                0, len(self.numericAttrRanges[splitAttrib][0])):
            self.numericAttrRanges[splitAttrib][0][
                self.numericAttrRanges[splitAttrib][1]] = bestRanges
        else:
            self.numericAttrRanges[splitAttrib][0].append(bestRanges)
    return (splitAttrib, bestSubsets, bestSplitThreshold, bestRanges,
            bestGain)
def test_params():
    """Every invalid constructor argument must raise on construction/fit."""
    invalid_cases = [
        (TypeError, {"name": 1}),
        (ValueError, {"prebinning_method": "new_method"}),
        (ValueError, {"solver": "new_solver"}),
        (ValueError, {"max_n_prebins": -2}),
        (ValueError, {"min_prebin_size": 0.6}),
        (ValueError, {"min_n_bins": -2}),
        (ValueError, {"max_n_bins": -2.2}),
        (ValueError, {"min_n_bins": 3, "max_n_bins": 2}),
        (ValueError, {"min_bin_size": 0.6}),
        (ValueError, {"max_bin_size": -0.6}),
        (ValueError, {"min_bin_size": 0.5, "max_bin_size": 0.3}),
        (ValueError, {"monotonic_trend": ["new_trend", "auto"]}),
        (ValueError, {"monotonic_trend": "new_trend"}),
        (ValueError, {"max_pvalue": 1.1}),
        (ValueError, {"max_pvalue_policy": "new_policy"}),
        (TypeError, {"user_splits": {"a": [1, 2]}}),
        (TypeError, {"special_codes": {1, 2, 3}}),
        (ValueError, {"split_digits": 9}),
        (ValueError, {"mip_solver": "new_solver"}),
        (ValueError, {"time_limit": -2}),
        (TypeError, {"verbose": 1}),
    ]
    for expected_exception, params in invalid_cases:
        with raises(expected_exception):
            optb = MulticlassOptimalBinning(**params)
            optb.fit(x, y)
def test_verbose():
    """Fitting with verbose logging enabled still reaches optimality."""
    optb = MulticlassOptimalBinning(verbose=True)
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
def test_classes():
    """The fitted estimator exposes the target classes it has seen."""
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)

    assert optb.classes == approx([0, 1, 2])
def test_numerical_user_splits_fixed():
    """Validation and behaviour of the ``user_splits_fixed`` option."""
    user_splits = [2.1, 2.2, 2.3, 2.6, 2.9]

    # user_splits_fixed without user_splits is rejected
    with raises(ValueError):
        optb = MulticlassOptimalBinning(
            user_splits_fixed=[False, False, False, True, False])
        optb.fit(x, y)

    # must be a list, not a tuple
    with raises(TypeError):
        optb = MulticlassOptimalBinning(
            user_splits=user_splits,
            user_splits_fixed=(False, False, False, True, False))
        optb.fit(x, y)

    # entries must be booleans, not ints
    with raises(ValueError):
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=[0, 0, 0, 1, 0])
        optb.fit(x, y)

    # length must match user_splits
    with raises(ValueError):
        optb = MulticlassOptimalBinning(
            user_splits=user_splits,
            user_splits_fixed=[False, False, False, False])
        optb.fit(x, y)

    user_splits_fixed = [False, False, False, True, True]
    with raises(ValueError):  # pure pre-bins
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # with a valid fixed split, the fixed point must survive optimization
    user_splits = [2.1, 2.2, 2.3, 2.6, 2.7]
    optb = MulticlassOptimalBinning(user_splits=user_splits,
                                    user_splits_fixed=user_splits_fixed)
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
    assert 2.7 in optb.splits
def test_numerical_user_splits_non_unique():
    """Duplicate values in user_splits are rejected at fit time."""
    optb = MulticlassOptimalBinning(user_splits=[2.1, 2.2, 2.2, 2.6, 2.9])

    with raises(ValueError):
        optb.fit(x, y)