def create_documentation_page_nf():
    """Build the reST page documenting all noise filters and write it to disk.

    One section is generated for every class returned by
    ``sv.get_all_noisefilters()``. Each section holds an ``autoclass``
    directive, a short usage example, the base and the class specific
    figure, and finally the class docstring. The assembled text is written
    to ``noise_filters.rst`` in the current working directory.

    Returns:
        str: the complete reST source that was written to the file.
    """
    page_title = "Noise filters and prototype selection"
    # Collect the fragments in a list and join once instead of rebuilding
    # the whole string on every concatenation.
    parts = [page_title + "\n", "*" * len(page_title) + "\n\n"]

    for o in sv.get_all_noisefilters():
        name = o.__name__
        parts.extend([
            name + "\n" + "=" * len(name) + "\n",
            "\n\n",
            "API\n",
            "^" * len("API") + "\n\n",
            ".. autoclass:: smote_variants.%s" % name + "\n",
            " :members:" + "\n",
            "\n",
            " .. automethod:: __init__",
            "\n\n",
            "Example\n",
            "^" * len("Example"),
            "\n\n",
            " >>> noise_filter= smote_variants.%s()\n" % name,
            " >>> X_samp, y_samp= noise_filter.remove_noise(X, y)\n",
            "\n\n",
            ".. image:: figures/base.png" + "\n",
            ".. image:: figures/%s.png" % name + "\n\n",
            # A class without a docstring has ``__doc__ is None``; fall
            # back to an empty string instead of raising AttributeError.
            (o.__doc__ or "").replace("\n ", "\n"),
        ])

    docs = "".join(parts)

    # The context manager closes the file even if the write fails.
    with open("noise_filters.rst", "w") as rst_file:
        rst_file.write(docs)

    return docs
def generate_figures():
    """Run ``ballpark_sample`` for every oversampler and every noise filter.

    The classes returned by ``sv.get_all_oversamplers()`` are processed
    first, followed by those returned by ``sv.get_all_noisefilters()``.
    Each class is instantiated with its default arguments and handed to
    ``ballpark_sample`` together with ``'figures/base.png'`` as the base
    image path and ``'figures/<ClassName>.png'`` as the sampled image path.
    """
    base_image = 'figures/base.png'

    # The second query is only executed after the first group is finished,
    # exactly as with two separate loops.
    for query in (sv.get_all_oversamplers, sv.get_all_noisefilters):
        for technique in query():
            instance = technique()
            sampled_image = 'figures/%s.png' % technique.__name__
            ballpark_sample(instance,
                            img_file_base=base_image,
                            img_file_sampled=sampled_image)
def test_normal(self):
    """Smoke-test all oversamplers and noise filters on the toy dataset.

    The module-level ``data_min`` (label 1) and ``data_maj`` (label 0)
    arrays are stacked into one dataset. Every oversampler class with its
    default parameters, a list of explicitly parameterized oversamplers,
    and every noise filter class must return a non-empty result.
    """
    X = np.vstack([data_min, data_maj])
    y = np.hstack([np.repeat(1, len(data_min)),
                   np.repeat(0, len(data_maj))])

    for s in sv.get_all_oversamplers():
        logging.info("testing %s", s)
        X_samp, _ = s().sample(X, y)
        # assertGreater reports the actual length on failure, and the
        # message names the offending sampler.
        self.assertGreater(len(X_samp), 0,
                           "%s returned no samples" % s.__name__)

    # Non-default parameterizations exercising alternative code paths.
    samplers_plus = [
        sv.polynom_fit_SMOTE(topology='star'),
        sv.polynom_fit_SMOTE(topology='bus'),
        sv.polynom_fit_SMOTE(topology='mesh'),
        sv.polynom_fit_SMOTE(topology='poly_2'),
        sv.Stefanowski(strategy='weak_amp'),
        sv.Stefanowski(strategy='weak_amp_relabel'),
        sv.Stefanowski(strategy='strong_amp'),
        sv.G_SMOTE(method='non-linear_2.0'),
        sv.SMOTE_PSOBAT(method='pso'),
        sv.AHC(strategy='maj'),
        sv.AHC(strategy='minmaj'),
        sv.SOI_CJ(method='jittering'),
        sv.ADG(kernel='rbf_1'),
        sv.SMOTE_IPF(voting='consensus'),
        sv.ASMOBD(smoothing='sigmoid'),
    ]

    for s in samplers_plus:
        name = s.__class__.__name__
        logging.info("testing %s", name)
        X_samp, _ = s.sample(X, y)
        self.assertGreater(len(X_samp), 0,
                           "%s returned no samples" % name)

    for n in sv.get_all_noisefilters():
        logging.info("testing %s", n)
        X_nf, _ = n().remove_noise(X, y)
        self.assertGreater(len(X_nf), 0,
                           "%s removed every sample" % n.__name__)
def test_normal():
    """Smoke-test all oversamplers and noise filters on an inline dataset.

    A two-dimensional, two-class dataset is defined inline (minority class
    labelled 1, majority class labelled 0). Every oversampler class with
    default parameters, a list of explicitly parameterized oversamplers and
    every noise filter class must produce a non-empty output.
    """
    minority = np.array([
        [5.7996138, -0.25574582],
        [3.0637093, 2.11750874],
        [4.91444087, -0.72380123],
        [1.06414164, 0.08694243],
        [2.59071708, 0.75283568],
        [3.44834937, 1.46118085],
        [2.8036378, 0.69553702],
        [3.57901791, 0.71870743],
        [3.81529064, 0.62580927],
        [3.05005506, 0.33290343],
        [1.83674689, 1.06998465],
        [2.08574889, -0.32686821],
        [3.49417022, -0.92155623],
        [2.33920982, -1.59057568],
        [1.95332431, -0.84533309],
        [3.35453368, -1.10178101],
        [4.20791149, -1.41874985],
        [2.25371221, -1.45181929],
        [2.87401694, -0.74746037],
        [1.84435381, 0.15715329],
    ])

    majority = np.array([
        [-1.40972752, 0.07111486],
        [-1.1873495, -0.20838002],
        [0.51978825, 2.1631319],
        [-0.61995016, -0.45111475],
        [2.6093289, -0.40993063],
        [-0.06624482, -0.45882838],
        [-0.28836659, -0.59493865],
        [0.345051, 0.05188811],
        [1.75694985, 0.16685025],
        [0.52901288, -0.62341735],
        [0.09694047, -0.15811278],
        [-0.37490451, -0.46290818],
        [-0.32855088, -0.20893795],
        [-0.98508364, -0.32003935],
        [0.07579831, 1.36455355],
        [-1.44496689, -0.44792395],
        [1.17083343, -0.15804265],
        [1.73361443, -0.06018163],
        [-0.05139342, 0.44876765],
        [0.33731075, -0.06547923],
        [-0.02803696, 0.5802353],
        [0.20885408, 0.39232885],
        [0.22819482, 2.47835768],
        [1.48216063, 0.81341279],
        [-0.6240829, -0.90154291],
        [0.54349668, 1.4313319],
        [-0.65925018, 0.78058634],
        [-1.65006105, -0.88327625],
        [-1.49996313, -0.99378106],
        [0.31628974, -0.41951526],
        [0.64402186, 1.10456105],
        [-0.17725369, -0.67939216],
        [0.12000555, -1.18672234],
        [2.09793313, 1.82636262],
        [-0.11711376, 0.49655609],
        [1.40513236, 0.74970305],
        [2.40025472, -0.5971392],
        [-1.04860983, 2.05691699],
        [0.74057019, -1.48622202],
        [1.32230881, -2.36226588],
        [-1.00093975, -0.44426212],
        [-2.25927766, -0.55860504],
        [-1.12592836, -0.13399132],
        [0.14500925, -0.89070934],
        [0.90572513, 1.23923502],
        [-1.25416346, -1.49100593],
        [0.51229813, 1.54563048],
        [-1.36854287, 0.0151081],
        [0.08169257, -0.69722099],
        [-0.73737846, 0.42595479],
        [0.02465411, -0.36742946],
        [-1.14532211, -1.23217124],
        [0.98038343, 0.59259824],
        [-0.20721222, 0.68062552],
        [-2.21596433, -1.96045872],
        [-1.20519292, -1.8900018],
        [0.47189299, -0.4737293],
        [1.18196143, 0.85320018],
        [0.03255894, -0.77687178],
        [0.32485141, -0.34609381],
    ])

    # Minority rows first, then majority rows; labels follow the same order.
    features = np.concatenate((minority, majority), axis=0)
    minority_labels = np.repeat(1, len(minority))
    majority_labels = np.repeat(0, len(majority))
    labels = np.concatenate((minority_labels, majority_labels))

    # 1) every oversampler with its default parameters
    for sampler_cls in sv.get_all_oversamplers():
        logging.info("testing %s" % str(sampler_cls))
        resampled, _ = sampler_cls().sample(features, labels)
        assert len(resampled) > 0

    # 2) selected non-default parameterizations
    configured_samplers = [
        sv.polynom_fit_SMOTE(topology='star'),
        sv.polynom_fit_SMOTE(topology='bus'),
        sv.polynom_fit_SMOTE(topology='mesh'),
        sv.polynom_fit_SMOTE(topology='poly_2'),
        sv.Stefanowski(strategy='weak_amp'),
        sv.Stefanowski(strategy='weak_amp_relabel'),
        sv.Stefanowski(strategy='strong_amp'),
        sv.G_SMOTE(method='non-linear_2.0'),
        sv.SMOTE_PSOBAT(method='pso'),
        sv.AHC(strategy='maj'),
        sv.AHC(strategy='minmaj'),
        sv.SOI_CJ(method='jittering'),
        sv.ADG(kernel='rbf_1'),
        sv.SMOTE_IPF(voting='consensus'),
        sv.ASMOBD(smoothing='sigmoid'),
    ]
    for sampler in configured_samplers:
        logging.info("testing %s" % str(sampler.__class__.__name__))
        resampled, _ = sampler.sample(features, labels)
        assert len(resampled) > 0

    # 3) every noise filter with its default parameters
    for filter_cls in sv.get_all_noisefilters():
        logging.info("testing %s" % str(filter_cls))
        filtered, _ = filter_cls().remove_noise(features, labels)
        assert len(filtered) > 0
def test_queries():
    """Check that the package level query functions return sensible sizes.

    The "get all" queries must return at least one item and the
    "n quickest" queries must return exactly the requested five items.
    """
    all_oversamplers = sv.get_all_oversamplers()
    assert len(all_oversamplers) > 0

    all_noisefilters = sv.get_all_noisefilters()
    assert len(all_noisefilters) > 0

    quickest = sv.get_n_quickest_oversamplers(5)
    assert len(quickest) == 5

    all_multiclass = sv.get_all_oversamplers_multiclass()
    assert len(all_multiclass) > 0

    quickest_multiclass = sv.get_n_quickest_oversamplers_multiclass(5)
    assert len(quickest_multiclass) == 5
def test_reproducibility(self):
    """Check that sampling is repeatable and leaves the input untouched.

    For every oversampler class, two instances created with
    ``random_state=5`` and a third one rebuilt from ``get_params()`` must
    produce identical samples. The same round trip through
    ``get_params()`` is checked for a list of explicitly parameterized
    oversamplers and for every noise filter class. After each run the
    feature matrix ``X`` must be equal to a copy taken beforehand.

    NOTE(review): only ``X`` is verified to be unmodified; ``y`` is not
    checked by this test.
    """
    X = np.vstack([data_min, data_maj])
    y = np.hstack([np.repeat(1, len(data_min)),
                   np.repeat(0, len(data_maj))])

    for s in sv.get_all_oversamplers():
        logging.info("testing %s", s)
        name = s.__name__
        X_orig = X.copy()

        X_samp_a, _ = s(random_state=5).sample(X, y)
        sampler = s(random_state=5)
        X_samp_b, _ = sampler.sample(X, y)
        X_samp_c, _ = s(**sampler.get_params()).sample(X, y)

        self.assertTrue(
            np.array_equal(X_samp_a, X_samp_b),
            "%s differs between runs with the same random_state" % name)
        self.assertTrue(
            np.array_equal(X_samp_a, X_samp_c),
            "%s differs when rebuilt from get_params()" % name)
        self.assertTrue(np.array_equal(X_orig, X),
                        "%s modified the input X" % name)

    # Non-default parameterizations exercising alternative code paths.
    samplers_plus = [
        sv.polynom_fit_SMOTE(topology='star', random_state=5),
        sv.polynom_fit_SMOTE(topology='bus', random_state=5),
        sv.polynom_fit_SMOTE(topology='mesh', random_state=5),
        sv.polynom_fit_SMOTE(topology='poly_2', random_state=5),
        sv.Stefanowski(strategy='weak_amp', random_state=5),
        sv.Stefanowski(strategy='weak_amp_relabel', random_state=5),
        sv.Stefanowski(strategy='strong_amp', random_state=5),
        sv.G_SMOTE(method='non-linear_2.0', random_state=5),
        sv.SMOTE_PSOBAT(method='pso', random_state=5),
        sv.AHC(strategy='maj', random_state=5),
        sv.AHC(strategy='minmaj', random_state=5),
        sv.SOI_CJ(method='jittering', random_state=5),
        sv.ADG(kernel='rbf_1', random_state=5),
        sv.SMOTE_IPF(voting='consensus', random_state=5),
        sv.ASMOBD(smoothing='sigmoid', random_state=5),
    ]

    for s in samplers_plus:
        name = s.__class__.__name__
        logging.info("testing %s", name)
        X_orig = X.copy()

        X_samp_a, _ = s.sample(X, y)
        sc = s.__class__(**s.get_params())
        X_samp_b, _ = sc.sample(X, y)

        self.assertTrue(
            np.array_equal(X_samp_a, X_samp_b),
            "%s differs when rebuilt from get_params()" % name)
        self.assertTrue(np.array_equal(X_orig, X),
                        "%s modified the input X" % name)

    for n in sv.get_all_noisefilters():
        logging.info("testing %s", n)
        name = n.__name__
        X_orig = X.copy()

        # Use dedicated names for the instances so that the collection
        # being iterated is never rebound inside the loop.
        filter_a = n()
        X_nf_a, _ = filter_a.remove_noise(X, y)
        filter_b = n(**filter_a.get_params())
        X_nf_b, _ = filter_b.remove_noise(X, y)

        self.assertTrue(
            np.array_equal(X_nf_a, X_nf_b),
            "%s differs when rebuilt from get_params()" % name)
        self.assertTrue(np.array_equal(X_orig, X),
                        "%s modified the input X" % name)
def test_queries(self):
    """Check that the package level query functions return sensible sizes.

    The "get all" queries must return at least one item and the
    "n quickest" queries must return exactly the requested five items.
    The dedicated comparison assertions report the actual sizes when a
    check fails.
    """
    self.assertGreater(len(sv.get_all_oversamplers()), 0)
    self.assertGreater(len(sv.get_all_noisefilters()), 0)
    self.assertEqual(len(sv.get_n_quickest_oversamplers(5)), 5)
    self.assertGreater(len(sv.get_all_oversamplers_multiclass()), 0)
    self.assertEqual(len(sv.get_n_quickest_oversamplers_multiclass(5)), 5)
def _rst_heading(title, underline_char):
    """Return *title* underlined with *underline_char* plus a blank line."""
    return title + "\n" + underline_char * len(title) + "\n\n"


def _image_block(classes, template, per_row=4):
    """Return one directive line per class, with a blank line every *per_row*.

    *template* is %-formatted with each class' ``__name__``. A non-empty
    block always ends with a blank line, so that a following section title
    is separated from the last directive.
    """
    lines = []
    for position, cls in enumerate(classes, start=1):
        lines.append(template % cls.__name__)
        if position % per_row == 0:
            lines.append("")
    if lines and lines[-1]:
        lines.append("")
    return "".join(line + "\n" for line in lines)


def create_gallery_page():
    """Build the reST gallery page and write it to ``gallery.rst``.

    The page consists of an introduction, a legend of the abbreviations
    used in the figure captions and three image sections: all oversamplers,
    all noise filters and the multiclass results. The multiclass section
    only lists oversamplers whose ``categories`` do not contain
    ``sv.OverSampling.cat_changes_majority`` and whose default parameters
    (``get_params()``) contain ``'proportion'``. The file is written to the
    current working directory.

    Returns:
        str: the complete reST source that was written to the file.
    """
    oversamplers = sv.get_all_oversamplers()
    noise_filters = sv.get_all_noisefilters()

    abbreviations = [
        ("NR", "noise removal is involved"),
        ("DR", "dimension reduction is applied"),
        ("Clas", "some supervised classifier is used"),
        ("SCmp", "sampling is carried out componentwise (attributewise)"),
        ("SCpy", "sampling is carried out by copying instances"),
        ("SO", "ordinary sampling (just like in SMOTE)"),
        ("M", "memetic optimization is used"),
        ("DE", "density estimation is used"),
        ("DB", "density based - the sampling is based on a density of "
               "importance assigned to the instances"),
        ("Ex", "the sampling is extensive - samples are added "
               "successively, not optimizing the holistic distribution of "
               "a given number of samples"),
        ("CM", "changes majority - even majority samples can change"),
        ("Clus", "uses some clustering technique"),
        ("BL", "identifies and samples the neighborhoods of borderline "
               "samples"),
        ("A", "developed for a specific application"),
    ]

    # Adjacent string literals are used instead of backslash continuations
    # inside a literal, which would leak source indentation into the text.
    parts = [
        _rst_heading("Gallery", "*"),
        "In this page, we demonstrate the output of various oversampling "
        "and noise removal techniques, using default parameters.\n\n",
        "For binary oversampling and noise removal, an artificial database "
        "was used, available in the ``utils`` directory of the github "
        "repository.\n\n",
        # Disabled paragraph, kept for reference:
        # "For binary oversampling and noise removal, the figures can be
        # reproduced by the ``ballpark_sample`` function using a built-in
        # or user defined dataset:\n\n"
        # ".. autofunction:: smote_variants.ballpark_sample\n\n"
        "For multiclass oversampling we have used the 'wine' dataset from "
        "``sklearn.datasets``, which has 3 classes and many features, out "
        "of which the first two coordinates have been used for "
        "visualization.\n\n",
        _rst_heading("Oversampling sample results", "="),
        "In the captions of the images some abbreviations referring to the "
        "operating principles are placed. Namely:\n\n",
    ]
    parts.extend(" * %s: %s\n" % entry for entry in abbreviations)
    parts.append("\n")

    parts.append(".. figure:: figures/base.png" + "\n\n\n")
    parts.append(_image_block(oversamplers, ".. image:: figures/%s.png"))

    parts.append(_rst_heading("Noise removal sample results", "="))
    parts.append(".. figure:: figures/base.png" + "\n\n\n")
    parts.append(_image_block(noise_filters, ".. image:: figures/%s.png"))

    parts.append(_rst_heading("Multiclass sample results", "="))
    parts.append(".. figure:: figures/multiclass-base.png" + "\n\n\n")
    multiclass_oversamplers = [
        o for o in oversamplers
        if sv.OverSampling.cat_changes_majority not in o.categories
        and 'proportion' in o().get_params()
    ]
    parts.append(_image_block(multiclass_oversamplers,
                              ".. image:: figures/multiclass-%s.png"))

    docs = "".join(parts)

    # The context manager closes the file even if the write fails.
    with open("gallery.rst", "w") as rst_file:
        rst_file.write(docs)

    return docs