def test_fast_cvm(n_samples=1000):
    """Check that `_cvm_2samp_fast` agrees with `cvm_2samp`.

    The first sample is `n_samples` uniform values with uniform weights;
    the second sample is a random subset of the first (each element is
    kept when an independent uniform draw exceeds 0.5).
    """
    rng = RandomState()
    data1 = rng.uniform(size=n_samples)
    weights1 = rng.uniform(size=n_samples)
    subset = rng.uniform(size=n_samples) > 0.5
    data2, weights2 = data1[subset], weights1[subset]

    # Reference value from the plain implementation.
    expected = cvm_2samp(data1, data2, weights1, weights2)

    # Fast variant works on a pre-processed first sample and its cdf.
    prepared_data1, prepared_weights1, F1 = prepare_distribution(
        data1, weights1)
    actual = _cvm_2samp_fast(
        prepared_data1, data2, prepared_weights1, weights2, cdf1=F1)

    assert numpy.allclose(expected, actual)
def test_ks2samp_fast(size=1000):
    """Cross-check the Kolmogorov-Smirnov implementations on random data.

    Compares, for a uniform sample and a random subset of it:
    * the first element returned by `ks_2samp` (loose tolerance),
    * two identical calls of `_ks_2samp_fast` with unit weights,
    * `ks_2samp_weighted` with constant (non-unit) weights.
    """
    y1 = RandomState().uniform(size=size)
    keep = RandomState().uniform(size=size) > 0.5
    y2 = y1[keep]

    reference = ks_2samp(y1, y2)[0]

    prep_data, prep_weights, prep_F = prepare_distribution(
        y1, numpy.ones(len(y1)))
    fast_first = _ks_2samp_fast(
        prep_data, y2, prep_weights, numpy.ones(len(y2)), cdf1=prep_F)
    # Same call repeated: the result must not depend on previous calls.
    fast_second = _ks_2samp_fast(
        prep_data, y2, prep_weights, numpy.ones(len(y2)), cdf1=prep_F)
    # Constant weights of a different scale in each sample.
    weighted = ks_2samp_weighted(
        y1, y2, numpy.ones(len(y1)) / 3, numpy.ones(len(y2)) / 4)

    assert numpy.allclose(reference, fast_first, rtol=1e-2, atol=1e-3)
    assert numpy.allclose(fast_first, fast_second)
    assert numpy.allclose(fast_first, weighted)
    print('ks2samp is ok')
def group_based_cvm(y_pred, mask, sample_weight, groups_indices):
    """Weighted sum of `_cvm_2samp_fast` values over groups.

    The global distribution is built from the entries of `y_pred`
    selected by `mask`; each group (an index set from `groups_indices`)
    is compared against it, and the per-group values are combined using
    the weights from `compute_group_weights_by_indices`.

    :param y_pred: predictions, passed through `column_or_1d`
    :param mask: selector of the entries forming the global distribution
    :param sample_weight: per-sample weights, passed through
        `check_sample_weight`
    :param groups_indices: iterable of index sets, one per group
    :return: accumulated weighted value (0. when there are no groups)
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)

    # Prepare the reference (masked) distribution once for all groups.
    global_data, global_weight, global_F = prepare_distribution(
        y_pred[mask], weights=sample_weight[mask])

    total = 0.
    for indices, weight in zip(groups_indices, group_weights):
        group_value = _cvm_2samp_fast(
            global_data, y_pred[indices],
            global_weight, sample_weight[indices],
            global_F)
        total += weight * group_value
    return total
def groups_based_ks(y_pred, mask, sample_weight, groups_indices):
    """Kolmogorov-Smirnov flatness on groups.

    Each group (an index set from `groups_indices`) is compared by
    `_ks_2samp_fast` against the distribution of `y_pred[mask]`, and
    the results are summed with the weights returned by
    `compute_group_weights_by_indices`.

    :param y_pred: predictions, indexable by `mask` and by group indices
    :param mask: selector of the entries forming the global distribution
    :param sample_weight: per-sample weights, same length as `y_pred`
    :param groups_indices: iterable of index sets, one per group
    :return: accumulated weighted value (0. when there are no groups)
    """
    assert len(y_pred) == len(sample_weight) == len(mask)
    group_weights = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)

    # Reference distribution is prepared once and reused for every group.
    prepared_data, prepared_weight, prep_F = prepare_distribution(
        y_pred[mask], weights=sample_weight[mask])

    total = 0.
    for weight, indices in zip(group_weights, groups_indices):
        group_value = _ks_2samp_fast(
            prepared_data, y_pred[indices],
            prepared_weight, sample_weight[indices],
            prep_F)
        total += weight * group_value
    return total
def groups_based_ks(y_pred, mask, sample_weight, groups_indices):
    """Kolmogorov-Smirnov flatness on groups.

    Sums, over all groups, the group weight times the value of
    `_ks_2samp_fast` computed between the masked global distribution
    (`y_pred[mask]`) and the group's own predictions.

    :param y_pred: predictions, indexable by `mask` and by group indices
    :param mask: selector of the entries forming the global distribution
    :param sample_weight: per-sample weights, same length as `y_pred`
    :param groups_indices: iterable of index sets, one per group
    :return: accumulated weighted value (0. when there are no groups)
    """
    assert len(y_pred) == len(sample_weight) == len(mask)

    weights_of_groups = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)
    reference = prepare_distribution(y_pred[mask], weights=sample_weight[mask])
    ref_data, ref_weight, ref_cdf = reference

    flatness = 0.
    for w, members in zip(weights_of_groups, groups_indices):
        local_values = y_pred[members]
        local_weights = sample_weight[members]
        flatness += w * _ks_2samp_fast(
            ref_data, local_values, ref_weight, local_weights, ref_cdf)
    return flatness
def group_based_cvm(y_pred, mask, sample_weight, groups_indices):
    """Weighted sum of `_cvm_2samp_fast` values over groups.

    `y_pred` and `sample_weight` are first normalised with
    `column_or_1d` and `check_sample_weight`. The masked predictions
    form the global distribution; every group is then compared with it
    and the results are combined using the group weights.

    :param y_pred: predictions
    :param mask: selector of the entries forming the global distribution
    :param sample_weight: per-sample weights
    :param groups_indices: iterable of index sets, one per group
    :return: accumulated weighted value (0. when there are no groups)
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    weights_of_groups = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)

    reference = prepare_distribution(y_pred[mask], weights=sample_weight[mask])
    ref_data, ref_weight, ref_cdf = reference

    accumulated = 0.
    for members, w in zip(groups_indices, weights_of_groups):
        local_values = y_pred[members]
        local_weights = sample_weight[members]
        accumulated += w * _cvm_2samp_fast(
            ref_data, local_values, ref_weight, local_weights, ref_cdf)
    return accumulated
def bin_based_cvm(y_pred, sample_weight, bin_indices):
    """Cramer-von Mises similarity, quite slow meanwhile.

    The global distribution is built from all of `y_pred`. For every
    bin with positive weight, the samples whose `bin_indices` equal the
    bin number are compared with it via `_cvm_2samp_fast`; the results
    are summed with the weights from `compute_bin_weights`.

    :param y_pred: predictions, same length as the other arguments
    :param sample_weight: per-sample weights
    :param bin_indices: bin number of each sample
    :return: accumulated weighted value
    """
    assert len(y_pred) == len(sample_weight) == len(bin_indices)
    bin_weights = compute_bin_weights(bin_indices=bin_indices,
                                      sample_weight=sample_weight)
    global_data, global_weight, global_F = prepare_distribution(
        y_pred, weights=sample_weight)

    total = 0.
    for bin_id, weight in enumerate(bin_weights):
        # Bins with non-positive weight do not contribute.
        if weight <= 0:
            continue
        in_bin = bin_indices == bin_id
        bin_value = _cvm_2samp_fast(
            global_data, y_pred[in_bin],
            global_weight, sample_weight[in_bin],
            global_F)
        total += weight * bin_value
    return total
def bin_based_ks(y_pred, mask, sample_weight, bin_indices):
    """Kolmogorov-Smirnov flatness on bins.

    All inputs are first restricted to the entries selected by `mask`;
    bin weights and the global distribution are computed on that
    restricted data. Each bin with positive weight is then compared
    with the global distribution via `_ks_2samp_fast`, and the results
    are summed with the bin weights.

    :param y_pred: predictions, same length as the other arguments
    :param mask: selector of the entries taken into account
    :param sample_weight: per-sample weights
    :param bin_indices: bin number of each sample
    :return: accumulated weighted value
    """
    assert len(y_pred) == len(sample_weight) == len(bin_indices) == len(mask)
    y_pred = y_pred[mask]
    sample_weight = sample_weight[mask]
    bin_indices = bin_indices[mask]

    bin_weights = compute_bin_weights(bin_indices=bin_indices,
                                      sample_weight=sample_weight)
    prepared_data, prepared_weight, prep_F = prepare_distribution(
        y_pred, weights=sample_weight)

    total = 0.
    for bin_id, weight in enumerate(bin_weights):
        # Bins with non-positive weight do not contribute.
        if weight <= 0:
            continue
        in_bin = bin_indices == bin_id
        bin_value = _ks_2samp_fast(
            prepared_data, y_pred[in_bin],
            prepared_weight, sample_weight[in_bin],
            prep_F)
        total += weight * bin_value
    return total
def bin_based_cvm(y_pred, sample_weight, bin_indices):
    """Cramer-von Mises similarity, quite slow meanwhile.

    Sums, over bins with positive weight, the bin weight times the
    value of `_cvm_2samp_fast` computed between the distribution of all
    predictions and the predictions falling into the bin.

    :param y_pred: predictions, same length as the other arguments
    :param sample_weight: per-sample weights
    :param bin_indices: bin number of each sample
    :return: accumulated weighted value
    """
    assert len(y_pred) == len(sample_weight) == len(bin_indices)
    weights_of_bins = compute_bin_weights(bin_indices=bin_indices,
                                          sample_weight=sample_weight)
    reference = prepare_distribution(y_pred, weights=sample_weight)
    ref_data, ref_weight, ref_cdf = reference

    similarity = 0.
    for number, w in enumerate(weights_of_bins):
        if w <= 0:
            # Nothing to add for a bin without positive weight.
            continue
        selected = bin_indices == number
        local_values = y_pred[selected]
        local_weights = sample_weight[selected]
        similarity += w * _cvm_2samp_fast(
            ref_data, local_values, ref_weight, local_weights, ref_cdf)
    return similarity
def bin_based_ks(y_pred, mask, sample_weight, bin_indices):
    """Kolmogorov-Smirnov flatness on bins.

    Only the entries selected by `mask` are used. Sums, over bins with
    positive weight, the bin weight times the value of `_ks_2samp_fast`
    computed between the distribution of all masked predictions and the
    masked predictions falling into the bin.

    :param y_pred: predictions, same length as the other arguments
    :param mask: selector of the entries taken into account
    :param sample_weight: per-sample weights
    :param bin_indices: bin number of each sample
    :return: accumulated weighted value
    """
    assert len(y_pred) == len(sample_weight) == len(bin_indices) == len(mask)

    # Restrict everything to the masked entries before any computation.
    y_pred = y_pred[mask]
    sample_weight = sample_weight[mask]
    bin_indices = bin_indices[mask]

    weights_of_bins = compute_bin_weights(bin_indices=bin_indices,
                                          sample_weight=sample_weight)
    reference = prepare_distribution(y_pred, weights=sample_weight)
    ref_data, ref_weight, ref_cdf = reference

    flatness = 0.
    for number, w in enumerate(weights_of_bins):
        if w <= 0:
            # Nothing to add for a bin without positive weight.
            continue
        selected = bin_indices == number
        local_values = y_pred[selected]
        local_weights = sample_weight[selected]
        flatness += w * _ks_2samp_fast(
            ref_data, local_values, ref_weight, local_weights, ref_cdf)
    return flatness