def test_fewer_bins_than_n_discretize_bins():
    """Check that fitting never produces more bins than ``n_discretize_bins``.

    Fits a fresh BinTransformer on credit.csv for several bin limits and
    asserts that every entry of ``bins_`` holds at most that many ranges.
    """
    credit_df = pd.read_csv("credit.csv")
    for max_bins in range(2, 20, 5):
        transformer = BinTransformer(n_discretize_bins=max_bins)
        transformer.fit(credit_df)
        # Only the per-feature range lists matter here, not the feature names.
        for bin_ranges in transformer.bins_.values():
            assert len(bin_ranges) <= max_bins
def test_correct_min_max_bins():
    """Check that the fitted bins span each feature's full value range.

    For every feature in ``bins_``, the first bin's lower bound must equal
    the column minimum and the last bin's upper bound the column maximum.
    """
    credit_df = pd.read_csv("credit.csv")
    transformer = BinTransformer()
    transformer.fit(credit_df)
    for feat, bins in transformer.bins_.items():
        column = credit_df[feat]
        first_bin, last_bin = bins[0], bins[-1]
        assert first_bin[0] == column.min()
        assert last_bin[1] == column.max()
def test_each_bin_in_order():
    """Check that every fitted bin has its lower bound <= its upper bound."""
    credit_df = pd.read_csv("credit.csv")
    transformer = BinTransformer()
    transformer.fit(credit_df)
    for bin_ranges in transformer.bins_.values():
        for bin_range in bin_ranges:
            # bin_range[0] is the lower bound, bin_range[1] the upper bound.
            assert bin_range[0] <= bin_range[1]
def test_bin_ranges_are_flush():
    """Check that consecutive bins touch: each floor equals the prior ceiling."""
    credit_df = pd.read_csv("credit.csv")
    transformer = BinTransformer()
    transformer.fit(credit_df)
    for bin_ranges in transformer.bins_.values():
        last_ceil = None
        for floor, ceil in bin_ranges:
            # The first bin of a feature has no predecessor to compare with.
            if last_ceil is not None:
                assert floor == last_ceil
            last_ceil = ceil
def preprocess_training_data(preprocess_params):
    """Turn raw training inputs into a single, binned training DataFrame.

    Parameters
    ----------
    preprocess_params : mapping
        Must provide the keys "trainset", "y", "class_feat", "pos_class",
        "feature_names", "n_discretize_bins" and "verbosity". A missing key
        raises KeyError before any processing starts.

    Returns
    -------
    tuple
        ``(df, class_feat, pos_class, bin_transformer_)``: the processed
        DataFrame, the resolved class feature name, the resolved positive
        class, and the BinTransformer that was fit on the data.
    """
    # Read every setting up front, in a fixed order.
    (
        trainset,
        y,
        class_feat,
        pos_class,
        requested_names,
        n_bins,
        verbosity,
    ) = [
        preprocess_params[key]
        for key in (
            "trainset",
            "y",
            "class_feat",
            "pos_class",
            "feature_names",
            "n_discretize_bins",
            "verbosity",
        )
    ]

    # Error-check the raw inputs before touching them.
    _check_valid_input_data(
        trainset, y, class_feat, user_requested_feature_names=requested_names
    )

    # Settle on the class feature name, then merge X and y into one frame.
    class_feat = _get_class_feat_name(class_feat, y)
    train_df = _convert_to_training_df(
        trainset, y, class_feat, user_requested_feature_names=requested_names
    )

    # The positive class is resolved against the merged frame.
    pos_class = _get_pos_class(train_df, class_feat, pos_class)

    # Let pandas infer better dtypes for object columns.
    train_df = train_df.infer_objects()

    # Bin if necessary; the class column is excluded from binning.
    binner = BinTransformer(n_discretize_bins=n_bins, verbosity=verbosity)
    train_df = binner.fit_transform(train_df, ignore_feats=[class_feat])

    return train_df, class_feat, pos_class, binner
def test_no_bins():
    """Check that transforming with ``n_discretize_bins=0`` leaves the frame as is.

    NOTE(review): the value returned by ``transform`` is discarded, so this
    only verifies that the frame passed in still equals the original data.
    Confirm whether the returned frame should be compared as well.
    """
    original_df = pd.read_csv("credit.csv")
    working_df = original_df.copy()
    transformer = BinTransformer(n_discretize_bins=0)
    transformer.fit(working_df)
    transformer.transform(working_df)
    assert working_df.equals(original_df)
def _upgrade_bin_transformer_ifdepr(obj): old_bin_transformer_ = getattr(obj, "bin_transformer_") if type(old_bin_transformer_) == dict: new_bin_transformer_ = BinTransformer() new_bin_transformer_.bins_ = old_bin_transformer_ setattr(obj, "bin_transformer_", new_bin_transformer_)