def test_outlier():
    """Outlier detectors: invalid names/params raise; valid ones keep splits."""
    with raises(ValueError):
        optb = OptimalBinning(outlier_detector="new_outlier")
        optb.fit(x, y)

    with raises(TypeError):
        optb = OptimalBinning(outlier_detector="range", outlier_params=[])
        optb.fit(x, y)

    expected_splits = [11.42500019, 12.32999992, 13.09499979,
                       13.70499992, 15.04500008, 16.92500019]

    optb = OptimalBinning(outlier_detector="zscore", verbose=True)
    optb.fit(x, y)
    assert optb.splits == approx(expected_splits, rel=1e-6)

    # Range detector with both credible-interval methods yields identical splits.
    range_binnings = [
        OptimalBinning(outlier_detector="range",
                       outlier_params={"interval_length": 0.9, "method": method})
        for method in ("ETI", "HDI")
    ]
    for optb in range_binnings:
        optb.fit(x, y)
        assert optb.splits == approx(expected_splits, rel=1e-6)
def test_categorical_user_splits():
    """Categorical binning with user-provided split groups, fixed and unfixed."""
    np.random.seed(0)
    n = 100000

    categories = [-1, 2, 3, 4, 7, 8, 9, 10]
    x = []
    for category in categories:
        x.extend([category] * n)

    # Per-category targets; the RNG call order must match the category order.
    y = list(np.random.binomial(1, 0.011665, n))
    y += list(np.zeros(n))
    y += list(np.random.binomial(1, 0.0133333, n))
    y += list(np.random.binomial(1, 0.166667, n))
    y += list(np.zeros(n))
    y += list(np.random.binomial(1, 0.0246041, n))
    y += list(np.zeros(n))
    y += list(np.random.binomial(1, 0.025641, n))

    user_splits = [[2., 7., 9., 3., 10., 4.], [8], [-1]]
    user_splits_fixed = [True, True, True]

    optb1 = OptimalBinning(dtype="categorical", user_splits=user_splits)
    optb2 = OptimalBinning(dtype="categorical", user_splits=user_splits,
                           user_splits_fixed=user_splits_fixed)

    # Fixing every group should not change the resulting information value.
    for optb in (optb1, optb2):
        optb.fit(x, y)
        optb.binning_table.build()
        assert optb.binning_table.iv == approx(0.09345086993827473, rel=1e-6)
def test_numerical_min_max_n_bins():
    """min_n_bins/max_n_bins must constrain the bin count for both solvers."""
    optb_mip = OptimalBinning(solver="mip", min_n_bins=2, max_n_bins=5)
    optb_cp = OptimalBinning(solver="cp", min_n_bins=2, max_n_bins=5)

    for optb in [optb_mip, optb_cp]:
        optb.fit(x, y)
        assert optb.status == "OPTIMAL"
        # Number of bins = number of splits + 1. The original asserted
        # len(optb.splits + 1), which adds 1 elementwise to the split array
        # and therefore checked the raw split count instead of the bin count.
        assert 2 <= len(optb.splits) + 1 <= 5
def test_numerical_regularization():
    """A large gamma regularization should reduce the number of splits."""
    for solver in ("mip", "cp"):
        optb = OptimalBinning(solver=solver, gamma=4)
        optb.fit(x, y)
        # Unregularized solution has 6 splits; gamma=4 must prune some.
        assert len(optb.splits) < 6
def test_categorical_default_user_splits():
    """Categorical binning with cat_cutoff, then with explicit user splits."""
    # Occupation-type fixture (100 samples) with a sparse binary target.
    x = np.array([
        'Working', 'State servant', 'Working', 'Working', 'Working',
        'State servant', 'Commercial associate', 'State servant', 'Pensioner',
        'Working', 'Working', 'Pensioner', 'Working', 'Working', 'Working',
        'Working', 'Working', 'Working', 'Working', 'State servant', 'Working',
        'Commercial associate', 'Working', 'Pensioner', 'Working', 'Working',
        'Working', 'Working', 'State servant', 'Working',
        'Commercial associate', 'Working', 'Working', 'Commercial associate',
        'State servant', 'Working', 'Commercial associate', 'Working',
        'Pensioner', 'Working', 'Commercial associate', 'Working', 'Working',
        'Pensioner', 'Working', 'Working', 'Pensioner', 'Working',
        'State servant', 'Working', 'State servant', 'Commercial associate',
        'Working', 'Commercial associate', 'Pensioner', 'Working', 'Pensioner',
        'Working', 'Working', 'Working', 'Commercial associate', 'Working',
        'Pensioner', 'Working', 'Commercial associate',
        'Commercial associate', 'State servant', 'Working',
        'Commercial associate', 'Commercial associate',
        'Commercial associate', 'Working', 'Working', 'Working',
        'Commercial associate', 'Working', 'Commercial associate', 'Working',
        'Working', 'Pensioner', 'Working', 'Pensioner', 'Working', 'Working',
        'Pensioner', 'Working', 'State servant', 'Working', 'Working',
        'Working', 'Working', 'Working', 'Commercial associate',
        'Commercial associate', 'Commercial associate', 'Working',
        'Commercial associate', 'Working', 'Working', 'Pensioner'
    ], dtype=object)
    y = np.array([
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0
    ])

    # Default categorical grouping with a 10% cutoff for rare categories.
    optb = OptimalBinning(dtype="categorical", solver="mip", cat_cutoff=0.1,
                          verbose=True)
    optb.fit(x, y)
    assert optb.status == "OPTIMAL"

    # Same data with an explicit category grouping supplied by the user.
    user_splits = [['Pensioner', 'Working'], ['Commercial associate'],
                   ['State servant']]
    optb = OptimalBinning(dtype="categorical", solver="mip", cat_cutoff=0.1,
                          user_splits=user_splits, verbose=True)
    optb.fit(x, y)
    assert optb.status == "OPTIMAL"
def test_numerical_default_solvers():
    """MIP (bop/cbc) and CP back-ends agree on the optimal splits."""
    expected = [11.42500019, 12.32999992, 13.09499979,
                13.70499992, 15.04500008, 16.92500019]
    candidates = [
        OptimalBinning(solver="mip", mip_solver="bop"),
        OptimalBinning(solver="mip", mip_solver="cbc"),
        OptimalBinning(solver="cp"),
    ]
    for optb in candidates:
        optb.fit(x, y)
        assert optb.status == "OPTIMAL"
        assert optb.splits == approx(expected, rel=1e-6)
def test_auto_modes():
    """All automatic monotonic-trend modes find the same optimal splits."""
    expected = [11.42500019, 12.32999992, 13.09499979,
                13.70499992, 15.04500008, 16.92500019]
    for trend in ("auto", "auto_heuristic", "auto_asc_desc", "descending"):
        # Only the explicit descending run was verbose in the original setup.
        optb = OptimalBinning(monotonic_trend=trend,
                              verbose=(trend == "descending"))
        optb.fit(x, y)
        assert optb.status == "OPTIMAL"
        assert optb.splits == approx(expected, rel=1e-6)
def test_numerical_user_splits():
    """User-provided splits combined with a p-value constraint."""
    user_splits = [11, 12, 13, 14, 15, 17]
    expected_iv = 4.819661314733627

    optb = OptimalBinning(user_splits=user_splits, max_pvalue=0.05)
    optb.fit(x, y)
    assert optb.status == "OPTIMAL"
    # The p-value constraint merges some candidate splits.
    assert optb.splits == approx([13, 15, 17], rel=1e-6)
    optb.binning_table.build()
    assert optb.binning_table.iv == expected_iv

    # "all" pairwise p-value policy yields the same information value here.
    optb = OptimalBinning(user_splits=user_splits, max_pvalue=0.05,
                          max_pvalue_policy="all")
    optb.fit(x, y)
    optb.binning_table.build()
    assert optb.binning_table.iv == expected_iv
def test_numerical_default():
    """Default numerical binning: splits, quality metrics, and plotting."""
    optb = OptimalBinning()
    optb.fit(x, y)
    assert optb.status == "OPTIMAL"

    expected_splits = [11.42500019, 12.32999992, 13.09499979,
                       13.70499992, 15.04500008, 16.92500019]
    assert optb.splits == approx(expected_splits, rel=1e-6)

    optb.binning_table.build()
    assert optb.binning_table.iv == approx(5.04392547, rel=1e-6)

    optb.binning_table.analysis()
    assert optb.binning_table.gini == approx(0.87541620, rel=1e-6)
    assert optb.binning_table.js == approx(0.39378376, rel=1e-6)
    assert optb.binning_table.quality_score == approx(0.0, rel=1e-6)

    # Unknown metric names must be rejected by plot().
    with raises(ValueError):
        optb.binning_table.plot(metric="new_metric")

    # Smoke-test the plotting entry points with/without special and missing bins.
    optb.binning_table.plot(metric="woe", savefig="test_binning.png")
    optb.binning_table.plot(metric="woe", add_special=False,
                            savefig="test_binning_no_special.png")
    optb.binning_table.plot(metric="woe", add_missing=False,
                            savefig="test_binning_no_missing.png")
def test_numerical_default_fit_transform():
    """fit_transform returns WoE-encoded values for the training data."""
    optb = OptimalBinning()
    woe_values = optb.fit_transform(x, y, metric="woe")
    assert woe_values[:5] == approx(
        [5.28332344, 5.28332344, 5.28332344, -3.12517033, 5.28332344],
        rel=1e-6)
def test_numerical_prebinning_kwargs():
    """Extra keyword arguments are forwarded to the MDLP pre-binning backend."""
    optb = OptimalBinning(solver="mip", prebinning_method="mdlp",
                          max_candidates=64)
    optb.fit(x, y)
    optb.binning_table.build()
    assert optb.binning_table.iv == approx(4.37337682, rel=1e-6)
def test_numerical_user_splits_fixed():
    """Validation and effect of the user_splits_fixed parameter."""
    user_splits = [11, 12, 13, 14, 15, 16, 17]
    fixed_flags = [False, False, False, False, False, True, False]

    # Fixed flags without the corresponding user splits.
    with raises(ValueError):
        optb = OptimalBinning(user_splits_fixed=fixed_flags)
        optb.fit(x, y)

    # Wrong container type: a tuple instead of a list/array.
    with raises(TypeError):
        optb = OptimalBinning(user_splits=user_splits,
                              user_splits_fixed=tuple(fixed_flags))
        optb.fit(x, y)

    # Entries must be booleans, not ints.
    with raises(ValueError):
        optb = OptimalBinning(user_splits=user_splits,
                              user_splits_fixed=[0, 0, 0, 0, 0, 1, 0])
        optb.fit(x, y)

    # Length must match user_splits.
    with raises(ValueError):
        optb = OptimalBinning(user_splits=user_splits,
                              user_splits_fixed=[False, False, False, False])
        optb.fit(x, y)

    # Forcing split 16 makes the problem infeasible.
    optb = OptimalBinning(user_splits=user_splits,
                          user_splits_fixed=fixed_flags)
    optb.fit(x, y)
    assert optb.status == "INFEASIBLE"

    # A feasible fixed split (12) must appear in both solvers' solutions.
    user_splits = [11, 12, 13, 14, 15, 17]
    fixed_flags = [False, True, False, False, False, False]
    optb_mip = OptimalBinning(user_splits=user_splits,
                              user_splits_fixed=fixed_flags, solver="mip")
    optb_cp = OptimalBinning(user_splits=user_splits,
                             user_splits_fixed=fixed_flags, solver="cp")
    for optb in (optb_mip, optb_cp):
        optb.fit(x, y)
        assert optb.status == "OPTIMAL"
        assert 12 in optb.splits

    # A constrained solution can never beat the unconstrained IV.
    reference = OptimalBinning()
    reference.fit(x, y)
    optb.binning_table.build()
    reference.binning_table.build()
    assert optb.binning_table.iv <= reference.binning_table.iv
def test_information():
    """information() requires a fitted estimator and a valid print level."""
    optb = OptimalBinning(solver="cp")
    with raises(NotFittedError):
        optb.information()

    optb.fit(x, y)
    with raises(ValueError):
        optb.information(print_level=-1)

    for level in (0, 1, 2):
        optb.information(print_level=level)

    # Also exercise the MIP solver's information report.
    optb = OptimalBinning(solver="mip")
    optb.fit(x, y)
    optb.information(print_level=2)
def test_numerical_default_transform():
    """transform() raises before fit and maps raw values to WoE afterwards."""
    optb = OptimalBinning()
    with raises(NotFittedError):
        optb.transform(x)

    optb.fit(x, y)
    woe_values = optb.transform([12, 14, 15, 21], metric="woe")
    assert woe_values == approx(
        [-2.71097154, -0.15397917, -0.15397917, 5.28332344], rel=1e-6)
def test_numerical_default():
    """Default settings produce the expected optimal split points."""
    optb = OptimalBinning()
    optb.fit(x, y)
    assert optb.status == "OPTIMAL"
    split_points = [11.42500019, 12.32999992, 13.09499979,
                    13.70499992, 15.04500008, 16.92500019]
    assert optb.splits == approx(split_points, rel=1e-6)
def test_default_fit_transform():
    """BinningProcess.fit_transform matches a per-variable OptimalBinning."""
    process = BinningProcess(variable_names)
    X_transform = process.fit_transform(X, y, metric="event_rate")

    optb = OptimalBinning()
    column = X[:, 5]
    optb.fit(column, y)
    assert optb.transform(column, metric="event_rate") == approx(
        X_transform[:, 5], rel=1e-6)
def test_default_transform():
    """BinningProcess.transform requires fit and matches OptimalBinning."""
    process = BinningProcess(variable_names)
    with raises(NotFittedError):
        process.transform(X)

    process.fit(X, y)
    X_transform = process.transform(X)

    optb = OptimalBinning()
    column = X[:, 5]
    optb.fit(column, y)
    assert optb.transform(column) == approx(X_transform[:, 5], rel=1e-6)
def test_default_transform_pandas():
    """transform accepts a DataFrame and rejects non-array-like inputs."""
    df = pd.DataFrame(data.data, columns=data.feature_names)
    process = BinningProcess(variable_names)
    process.fit(df, y)

    # A plain dict is not an accepted input type.
    with raises(TypeError):
        X_transform = process.transform(df.to_dict(), metric="woe")

    X_transform = process.transform(df, metric="woe")

    optb = OptimalBinning()
    column = X[:, 5]
    optb.fit(column, y)
    assert optb.transform(column, metric="woe") == approx(
        X_transform.values[:, 5], rel=1e-6)
def test_categorical_transform():
    """Categorical transform returns per-category WoE values."""
    # Occupation-type fixture (100 samples) with a sparse binary target.
    x = np.array([
        'Working', 'State servant', 'Working', 'Working', 'Working',
        'State servant', 'Commercial associate', 'State servant', 'Pensioner',
        'Working', 'Working', 'Pensioner', 'Working', 'Working', 'Working',
        'Working', 'Working', 'Working', 'Working', 'State servant', 'Working',
        'Commercial associate', 'Working', 'Pensioner', 'Working', 'Working',
        'Working', 'Working', 'State servant', 'Working',
        'Commercial associate', 'Working', 'Working', 'Commercial associate',
        'State servant', 'Working', 'Commercial associate', 'Working',
        'Pensioner', 'Working', 'Commercial associate', 'Working', 'Working',
        'Pensioner', 'Working', 'Working', 'Pensioner', 'Working',
        'State servant', 'Working', 'State servant', 'Commercial associate',
        'Working', 'Commercial associate', 'Pensioner', 'Working', 'Pensioner',
        'Working', 'Working', 'Working', 'Commercial associate', 'Working',
        'Pensioner', 'Working', 'Commercial associate',
        'Commercial associate', 'State servant', 'Working',
        'Commercial associate', 'Commercial associate',
        'Commercial associate', 'Working', 'Working', 'Working',
        'Commercial associate', 'Working', 'Commercial associate', 'Working',
        'Working', 'Pensioner', 'Working', 'Pensioner', 'Working', 'Working',
        'Pensioner', 'Working', 'State servant', 'Working', 'Working',
        'Working', 'Working', 'Working', 'Commercial associate',
        'Commercial associate', 'Commercial associate', 'Working',
        'Commercial associate', 'Working', 'Working', 'Pensioner'
    ], dtype=object)
    y = np.array([
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0
    ])

    optb = OptimalBinning(dtype="categorical", solver="mip", cat_cutoff=0.1,
                          verbose=True)
    optb.fit(x, y)

    # One representative value per category; expected WoE values below show
    # 'Working' and 'State servant' fall in the same bin.
    x_transform = optb.transform(
        ["Pensioner", "Working", "Commercial associate", "State servant"])
    assert x_transform == approx(
        [-0.26662866, 0.30873548, -0.55431074, 0.30873548], rel=1e-6)
def autoBinning(orginDf, yCol, isContinue=True):
    """Bin every numeric column of *orginDf* against *yCol* and return the
    columns whose information value exceeds 0.02.

    NOTE(review): reconstructed from a whitespace-mangled source; the exact
    indentation of the statements after `if iv > 0.02:` is assumed — verify
    against the original file.
    """
    features = []
    toDropColumns = []
    # Keep only float/int columns; everything else is dropped before binning.
    for col in orginDf.columns:
        # print(modelInputs[col].dtype.kind)
        if orginDf[col].dtype.kind == 'f' or orginDf[col].dtype.kind == 'i':
            continue
        toDropColumns.append(col)
    print(toDropColumns)
    df = orginDf.drop(columns=toDropColumns)

    ivList = []
    for col in df.columns:
        try:
            print(col)
            if (col == yCol):
                continue  # never bin the target against itself
            x = df[col].values
            y = df[yCol].values

            # isCategray presumably detects categorical-like columns — it is
            # defined elsewhere in the project.
            dtype = "numerical"
            if isCategray(df[col]):
                dtype = "categorical"

            optb = OptimalBinning(name=col, dtype=dtype, solver="cp")
            # isContinue switches to continuous-target binning; note this
            # overrides the OptimalBinning instance created just above.
            if isContinue:
                optb = ContinuousOptimalBinning(name=col, dtype=dtype,
                                                monotonic_trend='auto_asc_desc')
            optb.fit(x, y)
            # print(optb.status)
            binning_table = optb.binning_table
            binning_result = binning_table.build()
            print(binning_result)

            iv = binning_table.iv
            # 0.02 is a conventional "weak predictor" IV threshold.
            if iv > 0.02:
                ivList.append([col, iv])
                features.append(col)
                runBackTest(orginDf, col, optb.splits)
            print(iv)
        except Exception as e:
            # Best-effort: a failure on one column should not stop the scan.
            print(e)

    # Strongest predictors first (by IV, descending).
    ivList.sort(key=lambda x: x[1], reverse=True)
    print(ivList)
    return features
def test_transform_some_variables():
    """transform can restrict its output to a validated subset of variables."""
    process = BinningProcess(variable_names)
    process.fit(X, y)

    with raises(TypeError):
        process.transform(X, {})
    with raises(ValueError):
        process.transform(X, ["new_1", "new_2"])

    selected_variables = [
        'mean area', 'mean smoothness', 'mean compactness', 'mean concavity'
    ]
    X_transform = process.transform(X, selected_variables)
    assert X_transform.shape[1] == 4

    # Selected variables correspond to original columns 3..6.
    for out_col, in_col in enumerate(range(3, 7)):
        optb = OptimalBinning()
        column = X[:, in_col]
        optb.fit(column, y)
        assert optb.transform(column) == approx(X_transform[:, out_col],
                                                rel=1e-6)
def test_verbose():
    """Verbose mode must not affect the solution status."""
    optb = OptimalBinning(verbose=True)
    optb.fit(x, y)
    assert optb.status == "OPTIMAL"
def test_numerical_user_splits_non_unique():
    """Duplicated values in user_splits are rejected at fit time."""
    duplicated_splits = [11, 12, 13, 14, 15, 15]
    optb = OptimalBinning(user_splits=duplicated_splits, max_pvalue=0.05)
    with raises(ValueError):
        optb.fit(x, y)
# Fit an optimal binning per variable on weighted samples and print results.
for variable in variables:
    #variable = "lead_jet_pt"
    print("\n\n ### Next Variable ### \n")
    print(" ..:: %s ::.. \n" % variable)
    #x_df = x_df[((x_df['score_best']==0) & (x_df['lead_lep_charge']>0))]
    x = x_df[variable].values
    # NOTE(review): 137 presumably scales event weights to 137/fb — confirm.
    weight = x_df['weight'].values * 137
    #optb = OptimalBinning(name=variable, dtype="numerical", solver="cp")
    optb = OptimalBinning(
        name=variable,
        dtype="numerical",
        solver="cp",
        #prebinning_method='uniform',
        #max_n_prebins=100,
        min_n_bins=4,
    )
    # NOTE(review): weights are inflated 10x here; reason not evident — verify.
    optb.fit(x, y, sample_weight=10 * weight)
    #optb.fit(x, y, sample_weight=weight)
    print("Fit status:")
    print(optb.status)
    print("Binning:")
    print(optb.splits)
    binning_table = optb.binning_table
    print(binning_table.build())
def optimal_binning(col, y):
    """Fit a categorical CP-solver OptimalBinning on a Series pair and return it."""
    binner = OptimalBinning(dtype='categorical', solver='cp', max_n_prebins=80)
    binner.fit(col.values, y.values)
    return binner
def test_params():
    """Every invalid constructor parameter must raise at fit time."""
    # (expected exception, constructor kwargs) — one case per parameter check.
    invalid_cases = [
        (TypeError, {"name": 1}),
        (ValueError, {"dtype": "nominal"}),
        (ValueError, {"prebinning_method": "new_method"}),
        (ValueError, {"solver": "new_solver"}),
        (ValueError, {"max_n_prebins": -2}),
        (ValueError, {"min_prebin_size": 0.6}),
        (ValueError, {"min_n_bins": -2}),
        (ValueError, {"max_n_bins": -2.2}),
        (ValueError, {"min_n_bins": 3, "max_n_bins": 2}),
        (ValueError, {"min_bin_size": 0.6}),
        (ValueError, {"max_bin_size": -0.6}),
        (ValueError, {"min_bin_size": 0.5, "max_bin_size": 0.3}),
        (ValueError, {"min_bin_n_nonevent": -2}),
        (ValueError, {"max_bin_n_nonevent": -2}),
        (ValueError, {"min_bin_n_nonevent": 3, "max_bin_n_nonevent": 2}),
        (ValueError, {"min_bin_n_event": -2}),
        (ValueError, {"max_bin_n_event": -2}),
        (ValueError, {"min_bin_n_event": 3, "max_bin_n_event": 2}),
        (ValueError, {"monotonic_trend": "new_trend"}),
        (ValueError, {"min_event_rate_diff": 1.1}),
        (ValueError, {"max_pvalue": 1.1}),
        (ValueError, {"max_pvalue_policy": "new_policy"}),
        (ValueError, {"gamma": -0.2}),
        (TypeError, {"class_weight": [0, 1]}),
        (ValueError, {"class_weight": "unbalanced"}),
        (ValueError, {"cat_cutoff": -0.2}),
        (TypeError, {"user_splits": {"a": [1, 2]}}),
        (TypeError, {"special_codes": {1, 2, 3}}),
        (ValueError, {"split_digits": 9}),
        (ValueError, {"mip_solver": "new_solver"}),
        (ValueError, {"time_limit": -2}),
        (TypeError, {"verbose": 1}),
    ]
    for expected_exception, params in invalid_cases:
        with raises(expected_exception):
            optb = OptimalBinning(**params)
            optb.fit(x, y)
def splitData(self, data, availableAttributes, numericAttrBinning,
              repeatAttributes, minNumRecordsLeafNode):
    '''Given a list of available attributes chooses a split that has the
    largest information gain. Returns the chosen attribute, the subsets of
    the dataframe resulting from the split, the best split threshold and
    the ranges for each subset'''
    # Best candidate found so far across all attributes.
    bestGain = -np.inf
    bestSubsets = None
    splitAttrib = None
    bestSplitThreshold = None
    bestRanges = None
    for attr in availableAttributes:
        # if attr is discrete attribute with z values
        if str(data[attr].dtype) == 'object' or str(
                data[attr].dtype) == 'category':
            if len(set(data[attr])) == 1:
                continue  # skip if only one category
            grouped = data.groupby(attr)
            # get values for binning; target is assumed to be the last column
            x = data[attr].values
            y = data.iloc[:, -1].values
            # type of binning is determined by tree type
            if self.treeType == 'classification':
                optb = OptimalBinning(dtype='categorical', min_n_bins=2,
                                      max_n_bins=4)
            else:
                optb = ContinuousOptimalBinning(dtype='categorical',
                                                min_n_bins=2, max_n_bins=4,
                                                min_prebin_size=0.001)
            optb.fit(x, y)
            binningResultDt = optb.binning_table.build()
            # head(-3) drops the Special/Missing/Totals rows of the table.
            bins = binningResultDt['Bin'].head(-3)
            # create subset for each bin if target var is binary and there
            # are multiple bins
            if (len(self.classes) == 2 and len(bins) > 1):
                # Binary target variable: one subset per bin of categories.
                subsets = [
                    pd.concat([grouped.get_group(cat) for cat in bin])
                    for bin in bins
                ]
            else:
                # otherwise create subset for each value (category) of the
                # attribute
                subsets = [
                    grouped.get_group(x) for x in data[attr].unique()
                ]
            if any(
                    len(subset) for subset in subsets
                    if len(subset) < minNumRecordsLeafNode):
                continue  # skip if there are too small subsets
            if self.treeType == 'classification':
                infoGain = self.calculateInformationGain(data, subsets)
            else:
                infoGain = self.calculateStandardDeviationReduction(
                    data, subsets)
            if infoGain >= bestGain:
                bestGain = infoGain
                bestSubsets = subsets
                splitAttrib = attr
                bestSplitThreshold = None
                bestRanges = None
        else:  # if attr has numeric values
            onlyOneBin = False
            # get values for binning
            x = data[attr].values
            y = data.iloc[:, -1].values
            # type of binning is determined by tree type
            if self.treeType == 'classification':
                optb = MulticlassOptimalBinning(min_n_bins=2, max_n_bins=4)
            else:
                if x.min() == x.max():
                    continue  # constant column: nothing to split on
                optb = ContinuousOptimalBinning(min_n_bins=2, max_n_bins=4,
                                                min_prebin_size=0.001)
            optb.fit(x, y)
            binningResultDt = optb.binning_table.build()
            bins = binningResultDt['Bin'].head(-3)
            if len(bins) == 1:
                onlyOneBin = True
            # if user enabled numeric attribute binning and there are
            # multiple bins
            if numericAttrBinning is True and onlyOneBin is False:
                # modify range string representation so it can be parsed
                bins.iloc[0] = bins.iloc[0].replace('-inf', "'-inf'")
                bins.iloc[-1] = bins.iloc[-1].replace('inf', "'inf'")
                # create list of tuples for every range
                ranges = [literal_eval(x.replace('[', '(')) for x in bins]
                # replace 'inf' strings with np.inf
                ranges = [(-np.inf, x[1]) if x[0] == '-inf' else
                          ((x[0], np.inf) if x[1] == 'inf' else
                           (x[0], x[1])) for x in ranges]
                # create subsets according to the ranges (half-open [lo, hi))
                subsets = [
                    data.loc[(data[attr] >= r[0]) & (data[attr] < r[1])]
                    for r in ranges
                ]
                if any(
                        len(subset) for subset in subsets
                        if len(subset) < minNumRecordsLeafNode):
                    continue  # skip if there are too small subsets
                if self.treeType == 'classification':
                    infoGain = self.calculateInformationGain(data, subsets)
                else:
                    infoGain = self.calculateStandardDeviationReduction(
                        data, subsets)
                if infoGain >= bestGain:
                    bestGain = infoGain
                    bestSubsets = subsets
                    splitAttrib = attr
                    bestSplitThreshold = None
                    bestRanges = ranges
            else:  # binary split using threshold
                sortedData = data.sort_values(attr)  # sort data by attr
                for i in range(len(sortedData[attr]) - 1):
                    # for each entry (without the last one)
                    # if current and next value of attr are equal - do nothing
                    if sortedData[attr].iloc[i] == sortedData[attr].iloc[
                            i + 1]:
                        continue
                    # calculate threshold and use it to create two subsets
                    currentThreshold = (sortedData[attr].iloc[i] +
                                        sortedData[attr].iloc[i + 1]) / 2
                    lowerSubset = sortedData[
                        sortedData[attr] <= currentThreshold]
                    higherSubset = sortedData[
                        sortedData[attr] > currentThreshold]
                    if len(lowerSubset) < minNumRecordsLeafNode or len(
                            higherSubset) < minNumRecordsLeafNode:
                        continue  # skip if there are too small subsets
                    if self.treeType == 'classification':
                        infoGain = self.calculateInformationGain(
                            data, [lowerSubset, higherSubset])
                    else:
                        infoGain = self.calculateStandardDeviationReduction(
                            data, [lowerSubset, higherSubset])
                    if infoGain > bestGain:
                        bestGain = infoGain
                        bestSubsets = [lowerSubset, higherSubset]
                        splitAttrib = attr
                        bestSplitThreshold = currentThreshold
                        bestRanges = None
    # fix ranges if repeatingAttributes: clamp the outer bounds of the new
    # ranges to the parent range this node was created from, then record them.
    if bestRanges and repeatAttributes:
        parentRanges = self.numericAttrRanges[splitAttrib][0][
            self.numericAttrRanges[splitAttrib][1]]
        checkValue = data[splitAttrib].iloc[0]
        parentRange = next(rng for rng in parentRanges
                           if checkValue >= rng[0] and checkValue < rng[1])
        bestRanges[0] = (parentRange[0], bestRanges[0][1])
        bestRanges[-1] = (bestRanges[-1][0], parentRange[-1])
        self.numericAttrRanges[splitAttrib][1] += 1
        if self.numericAttrRanges[splitAttrib][1] in range(
                0, len(self.numericAttrRanges[splitAttrib][0])):
            self.numericAttrRanges[splitAttrib][0][
                self.numericAttrRanges[splitAttrib][1]] = bestRanges
        else:
            self.numericAttrRanges[splitAttrib][0].append(bestRanges)
    return (splitAttrib, bestSubsets, bestSplitThreshold, bestRanges,
            bestGain)