def get_fit_model(score_list, label_list): p_train = np.array(score_list) y_train = np.array(label_list) ir = IR() ir.fit( p_train, y_train ) return ir
class HTMLTime(object):
    """
    >>> htmlTime = HTMLTime(pathToIDX)
    >>> t = htmlTime(frameNumber)
    """
    def __init__(self, idx):
        super(HTMLTime, self).__init__()
        self.idx = idx

        # load .idx file using pandas
        df = read_table(
            self.idx, sep=r'\s+',
            names=['frame_number', 'frame_type', 'bytes', 'seconds']
        )
        x = np.array(df['frame_number'], dtype=float)
        y = np.array(df['seconds'], dtype=float)

        # train isotonic regression
        self.ir = IsotonicRegression(y_min=np.min(y), y_max=np.max(y))
        self.ir.fit(x, y)

        # frame number support
        self.xmin = np.min(x)
        self.xmax = np.max(x)

    def __call__(self, frameNumber):
        return self.ir.transform([min(self.xmax,
                                      max(self.xmin, frameNumber))])[0]
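A hedged usage sketch for the class above: the index contents and frame numbers are invented, and the class is assumed to live in a module where numpy, pandas' read_table, and IsotonicRegression are already imported.

# Hypothetical example: map frame numbers to timestamps with HTMLTime.
# A tiny in-memory ".idx" table stands in for a real index file.
import io

idx_text = ("0 I 1000 0.00\n"
            "25 P 500 1.00\n"
            "50 P 500 2.00\n")
html_time = HTMLTime(io.StringIO(idx_text))  # read_table accepts file-like objects
print(html_time(30))    # interpolated time (seconds) for frame 30
print(html_time(9999))  # frame numbers outside the index are clipped to the support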
def main(self):
    x_field = self.fields_by_key('x')[0]
    y_field = self.fields_by_key('y')[0]
    x = np.array(self.slice_data(x_field, int))
    y = np.array(self.slice_data(y_field, int))
    n = len(x)
    render = io.BytesIO()  # binary buffer to receive the rendered PNG

    ###########################################################################
    # Fit IsotonicRegression and LinearRegression models
    ir = IsotonicRegression()
    y_ = ir.fit_transform(x, y)

    lr = LinearRegression()
    lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

    ###########################################################################
    # plot result
    segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
    lc = LineCollection(segments, zorder=0)
    lc.set_array(np.ones(len(y)))
    lc.set_linewidths(0.5 * np.ones(n))

    fig = plt.figure()
    plt.plot(x, y, 'r.', markersize=12)
    plt.plot(x, y_, 'g.-', markersize=12)
    plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
    plt.gca().add_collection(lc)
    plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
    plt.title('Isotonic regression')
    plt.savefig(render, format='png')
    return render
def test_isotonic_regression_ties_secondary_(): """ Test isotonic regression fit, transform and fit_transform against the "secondary" ties method and "pituitary" data from R "isotone" package, as detailed in: J. d. Leeuw, K. Hornik, P. Mair, Isotone Optimization in R: Pool-Adjacent-Violators Algorithm (PAVA) and Active Set Methods Set values based on pituitary example and the following R command detailed in the paper above: > library("isotone") > data("pituitary") > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary") > res1$x `isotone` version: 1.0-2, 2014-09-07 R version: R version 3.1.1 (2014-07-10) """ x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14] y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25] y_true = [22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 24.25, 24.25] # Check fit, transform and fit_transform ir = IsotonicRegression() ir.fit(x, y) assert_array_almost_equal(ir.transform(x), y_true, 4) assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)
def train_classifier_with_calib(classifier, data, use_all_data=False, normalize=False):
    X_train = data.X_train
    y_train = data.y_train
    X_cv = data.X_cv
    y_cv = data.y_cv

    if normalize:
        X_train, X_cv = normalize_data(X_train, X_cv)

    if not use_all_data:
        ir = IR()
        score, S = train(classifier, X_train, y_train, X_cv, y_cv, data.y_classes)
        predictions_proba = classifier.predict_proba(X_cv)
        proba = predictions_proba[:, 1]
        ir.fit_transform(proba, y_cv)
        print(proba)
        print(ir)
        return {'classifier': classifier, 'score': score, 'S_auc': S, 'IR': ir,
                'prange': [np.amin(proba), np.amax(proba)]}
    else:
        train_all_data(classifier, X_train, y_train, X_cv, y_cv)
        return {'classifier': classifier}
def test_isotonic_duplicate_min_entry(): x = [0, 0, 1] y = [0, 0, 1] ir = IsotonicRegression(increasing=True, out_of_bounds="clip") ir.fit(x, y) all_predictions_finite = np.all(np.isfinite(ir.predict(x))) assert_true(all_predictions_finite)
def predict_probs(model, train_class, train_features, test_features, normalize_probs=None): """ Fit a given binary classification model to training sample features and return predicted probabilities for the positive class for the training and test samples. """ model.fit(train_features, train_class) train_prob, test_prob = [model.predict_proba(f)[:, 1] for f in (train_features, test_features)] if normalize_probs == "ROCSlope": # calibrate probabilities based on the estimated local slope # of the ROC curve chunk_size = 10 # number of instances for slope estimation n_train_pos = 301 # total number of positive (preictal) instances n_train_neg = 3766 # total negative (interictal) n_chunk_tot = 4000.0 / float(chunk_size) # estimated total in test data # sort training data classes by predicted probability sort_order = train_prob.argsort() p_sorted = train_prob[sort_order] c_sorted = train_class[sort_order] ix = np.array(range(len(train_prob))) # loop over chunks for i_ch in range(1 + (len(train_prob) - 1) / chunk_size): p_chunk, c_chunk = [ x[np.where((ix >= i_ch * chunk_size) & (ix < (i_ch + 1) * chunk_size))[0]] for x in (p_sorted, c_sorted) ] pmin = np.min(p_chunk) pmax = np.max(p_chunk) # compute TPR/FPR (relative to the entire training set) tpr = np.sum(c_chunk) / float(n_train_pos) fpr = np.sum(1 - c_chunk) / float(n_train_neg) # compute probability transformation for this chunk qc = (2.0 / np.pi) * np.arctan(tpr / (fpr + 1.0e-3 / float(n_train_neg))) qmin = np.max((0.0, qc - 0.5 / float(n_chunk_tot))) qmax = np.min((1.0, qc + 0.5 / float(n_chunk_tot))) # transform probabilities tr_p_ch = np.where((train_prob > pmin) & (train_prob <= pmax))[0] train_prob[tr_p_ch] = qmin + (train_prob[tr_p_ch] - pmin) * (qmax - qmin) / (pmax - pmin) te_p_ch = np.where((test_prob > pmin) & (test_prob <= pmax))[0] test_prob[te_p_ch] = qmin + (test_prob[te_p_ch] - pmin) * (qmax - qmin) / (pmax - pmin) elif normalize_probs == "LogShift": # shift probabilities in log(p/(1-p)) so that a fraction f_pre # of the samples has probability > 0.5, where f_pre is the # fraction of preictal samples in the training data f_pre = len(np.where(train_class)[0]) / float(len(train_class)) train_th, test_th = [sorted(p)[int((1.0 - f_pre) * len(p))] for p in (train_prob, test_prob)] train_prob, test_prob = [ (1.0 - pth) * p / (pth + p - 2.0 * pth * p) for (pth, p) in zip((train_th, test_th), (train_prob, test_prob)) ] elif normalize_probs == "IsoReg": # fit an isotonic regression model to training probabilities # and use the model to transform all probabilities prob_model = IsotonicRegression(out_of_bounds="clip") prob_model.fit(train_prob, train_class) train_prob, test_prob = [prob_model.transform(p) for p in (train_prob, test_prob)] elif normalize_probs is not None: sys.exit("Invalid value of normalize_probs:", str(normalize_probs)) return (train_prob, test_prob)
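A minimal sketch of the "IsoReg" branch above in isolation, using tiny made-up arrays in place of the real training and test probabilities:

import numpy as np
from sklearn.isotonic import IsotonicRegression

train_prob = np.array([0.2, 0.4, 0.6, 0.8])
train_class = np.array([0, 0, 1, 1])
test_prob = np.array([0.1, 0.5, 0.9])

# fit an isotonic regression model to training probabilities
# and use the model to transform all probabilities
prob_model = IsotonicRegression(out_of_bounds="clip")
prob_model.fit(train_prob, train_class)
train_prob, test_prob = [prob_model.transform(p) for p in (train_prob, test_prob)]
print(train_prob, test_prob)  # out-of-range test scores are clipped to the fitted range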
def test_isotonic_min_max_boundaries(): # check if min value is used correctly ir = IsotonicRegression(y_min=2, y_max=4) n = 6 x = np.arange(n) y = np.arange(n) y_test = [2, 2, 2, 3, 4, 4] y_result = np.round(ir.fit_transform(x, y)) assert_array_equal(y_result, y_test)
def _minCllr(self, targetScoreValues, nonTargetScoreValues, ): """ Computes the 'minimum cost of log likelihood ratio' measure as given in IDIAP's bob calibration.py We don't however use pavx here, as used in many other implementations, but sklearn's isotonic regression, which is equivalent and frees us from linking to c++ code. """ # First, sort both scores. neg = sorted(nonTargetScoreValues) pos = sorted(targetScoreValues) N = len(neg) P = len(pos) I = N + P # Now, iterate through both score sets and add a 0 for negative and 1 for positive scores. n, p = 0, 0 idealSequence = np.zeros(I) neg_indices = [0] * N pos_indices = [0] * P for i in range(I): if n == N or neg[n] > pos[p]: pos_indices[p] = i p += 1 idealSequence[i] = 1 else: neg_indices[n] = i n += 1 # Run the pool adjacent violaters method on the ideal LLR scores. # pavx implements isotonic regression. Python's sklearn contains code to do just that. ir = IsotonicRegression() # Calculate the isotonic regression. popt = ir.fit_transform(np.arange(len(idealSequence)), idealSequence) # disable runtime warnings for a short time since log(0) will raise a warning. old_warn_setup = np.seterr(divide='ignore') # ... compute logs. # Lets assume the prior odds on a target score is the ratio #target scores / #non target scores. log_prior_odds = math.log(float(P) / float(N)) posterior_log_odds = np.log(popt) - np.log(1.0 - popt) # ... activate old warnings. np.seterr(**old_warn_setup) llrs = posterior_log_odds - log_prior_odds # Unmix positive and negative scores. new_neg = np.zeros(N) for n in range(N): new_neg[n] = llrs[neg_indices[n]] new_pos = np.zeros(P) for p in range(P): new_pos[p] = llrs[pos_indices[p]] # Compute cllr of these new 'optimal' LLR scores. minCllr = self._cllr(new_pos, new_neg) return minCllr
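A standalone sketch of the PAV step used above; the array below is illustrative, not part of the original class. Running isotonic regression over the index of the merged, score-sorted 0/1 labels yields the PAV-optimal posterior sequence that the method then converts to log-likelihood ratios.

import numpy as np
from sklearn.isotonic import IsotonicRegression

ideal_sequence = np.array([0, 0, 1, 0, 1, 1, 1], dtype=float)
posteriors = IsotonicRegression().fit_transform(
    np.arange(len(ideal_sequence)), ideal_sequence)
print(posteriors)  # non-decreasing, piecewise-constant pooled means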
def calibrate_row(row): calibrator = IsotonicRegression(y_min=0, y_max=1) x = lab[~np.isnan(lab[row])][row].values y = lab[~np.isnan(lab[row])]['labels'].values calibrator.fit(x, y) lab[row] = calibrator.predict(lab[row].values) amb[row] = calibrator.predict(amb[row].values) unl[row] = calibrator.predict(unl[row].values) scr[row] = calibrator.predict(scr[row].values)
def test_isotonic_sample_weight(): ir = IsotonicRegression() x = [1, 2, 3, 4, 5, 6, 7] y = [1, 41, 51, 1, 2, 5, 24] sample_weight = [1, 2, 3, 4, 5, 6, 7] expected_y = [1, 13.95, 13.95, 13.95, 13.95, 13.95, 24] received_y = ir.fit_transform(x, y, sample_weight=sample_weight) assert_array_equal(expected_y, received_y)
def sklearn_isotonic_regression_multi(self, y, blocks): ir = IsotonicRegression() n = len(y) x = np.arange(n) z = np.zeros(n) z[:blocks[0]] = y[:blocks[0]] for start, end in zip(blocks, np.append(blocks[1:], [n])): z[start:end] = ir.fit_transform(x[start:end], y[start:end]) return z
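A hedged standalone sketch of the blockwise idea above on a toy series; `blocks` holds the start index of each independent segment, and the loop mirrors the method body without the surrounding class:

import numpy as np
from sklearn.isotonic import IsotonicRegression

y = np.array([3., 1., 2., 10., 4., 6., 5.])
blocks = np.array([0, 3, 5])  # segments [0:3], [3:5], [5:7]
ir = IsotonicRegression()
z = np.zeros(len(y))
for start, end in zip(blocks, np.append(blocks[1:], [len(y)])):
    z[start:end] = ir.fit_transform(np.arange(start, end), y[start:end])
print(z)  # each segment is non-decreasing; segments are fitted independently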
def test_proj_PAV(self): n = 10 x = np.arange(n) rs = check_random_state(0) for i in range(10): y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n)) ir = IsotonicRegression() truth = ir.fit_transform(x, y) self.assertTrue(np.linalg.norm(proj_PAV(y) - truth) < 1e-8)
def test_fast_predict(): # test that the faster prediction change doesn't # affect out-of-sample predictions: # https://github.com/scikit-learn/scikit-learn/pull/6206 rng = np.random.RandomState(123) n_samples = 10 ** 3 # X values over the -10,10 range X_train = 20.0 * rng.rand(n_samples) - 10 y_train = np.less(rng.rand(n_samples), expit(X_train)).astype('int64').astype('float64') weights = rng.rand(n_samples) # we also want to test that everything still works when some weights are 0 weights[rng.rand(n_samples) < 0.1] = 0 slow_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip") fast_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip") # Build interpolation function with ALL input data, not just the # non-redundant subset. The following 2 lines are taken from the # .fit() method, without removing unnecessary points X_train_fit, y_train_fit = slow_model._build_y(X_train, y_train, sample_weight=weights, trim_duplicates=False) slow_model._build_f(X_train_fit, y_train_fit) # fit with just the necessary data fast_model.fit(X_train, y_train, sample_weight=weights) X_test = 20.0 * rng.rand(n_samples) - 10 y_pred_slow = slow_model.predict(X_test) y_pred_fast = fast_model.predict(X_test) assert_array_equal(y_pred_slow, y_pred_fast)
def test_isotonic_regression_pickle(): y = np.array([3, 7, 5, 9, 8, 7, 10]) x = np.arange(len(y)) # Create model and fit ir = IsotonicRegression(increasing='auto', out_of_bounds="clip") ir.fit(x, y) ir_ser = pickle.dumps(ir, pickle.HIGHEST_PROTOCOL) ir2 = pickle.loads(ir_ser) np.testing.assert_array_equal(ir.predict(x), ir2.predict(x))
def test_isotonic_regression_oob_raise(): # Set y and x y = np.array([3, 7, 5, 9, 8, 7, 10]) x = np.arange(len(y)) # Create model and fit ir = IsotonicRegression(increasing='auto', out_of_bounds="raise") ir.fit(x, y) # Check that an exception is thrown assert_raises(ValueError, ir.predict, [min(x) - 10, max(x) + 10])
def test_isotonic_regression_oob_bad(): # Set y and x y = np.array([3, 7, 5, 9, 8, 7, 10]) x = np.arange(len(y)) # Create model and fit ir = IsotonicRegression(increasing='auto', out_of_bounds="xyz") ir.fit(x, y) # Make sure that we throw an error for bad out_of_bounds value assert_raises(ValueError, ir.predict, [min(x)-10, max(x)+10])
def test_isotonic_regression_oob_bad_after(): # Set y and x y = np.array([3, 7, 5, 9, 8, 7, 10]) x = np.arange(len(y)) # Create model and fit ir = IsotonicRegression(increasing='auto', out_of_bounds="raise") # Make sure that we throw an error for bad out_of_bounds value in transform ir.fit(x, y) ir.out_of_bounds = "xyz" assert_raises(ValueError, ir.transform, x)
def test_isotonic_regression_oob_nan(): # Set y and x y = np.array([3, 7, 5, 9, 8, 7, 10]) x = np.arange(len(y)) # Create model and fit ir = IsotonicRegression(increasing='auto', out_of_bounds="nan") ir.fit(x, y) # Predict from training and test x and check that we have two NaNs. y1 = ir.predict([min(x) - 10, max(x) + 10]) assert_equal(sum(np.isnan(y1)), 2)
def test_permutation_invariance(): # check that fit is permutation invariant. # regression test of missing sorting of sample-weights ir = IsotonicRegression() x = [1, 2, 3, 4, 5, 6, 7] y = [1, 41, 51, 1, 2, 5, 24] sample_weight = [1, 2, 3, 4, 5, 6, 7] x_s, y_s, sample_weight_s = shuffle(x, y, sample_weight, random_state=0) y_transformed = ir.fit_transform(x, y, sample_weight=sample_weight) y_transformed_s = ir.fit(x_s, y_s, sample_weight=sample_weight_s).transform(x) assert_array_equal(y_transformed, y_transformed_s)
def test_isotonic_regression(self):
    self.setUp()
    times = []
    rs = check_random_state(0)
    for n in [int(1e1), int(1e2), int(1e3), int(1e4)]:
        x = np.arange(n)
        y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
        ir = IsotonicRegression()
        start_time = time.time()
        y1 = ir.fit_transform(x, y)
        times.append(time.time() - start_time)
    print('test isotonic_regression')
    print(times)
def test_isotonic_sample_weight_parameter_default_value(): # check if default value of sample_weight parameter is one ir = IsotonicRegression() # random test data rng = np.random.RandomState(42) n = 100 x = np.arange(n) y = rng.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n)) # check if value is correctly used weights = np.ones(n) y_set_value = ir.fit_transform(x, y, sample_weight=weights) y_default_value = ir.fit_transform(x, y) assert_array_equal(y_set_value, y_default_value)
def test_isotonic_regression_oob_clip(): # Set y and x y = np.array([3, 7, 5, 9, 8, 7, 10]) x = np.arange(len(y)) # Create model and fit ir = IsotonicRegression(increasing='auto', out_of_bounds="clip") ir.fit(x, y) # Predict from training and test x and check that min/max match. y1 = ir.predict([min(x) - 10, max(x) + 10]) y2 = ir.predict(x) assert_equal(max(y1), max(y2)) assert_equal(min(y1), min(y2))
def sklearn_pav(y_true, y_score): """ Binary PAV algorithm, algorithm to solve Isotonic regression NOTE: sklearn isotonic regression is used y_true: 1D array y_score: 1D array """ id_permute = np.argsort(y_score) y_sort = y_true[id_permute] p_sort = np.sort(y_score) ir = IsotonicRegression() p_calibrated = ir.fit_transform(p_sort, y_sort) return y_sort, p_calibrated
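An example call on toy data (the arrays are invented for illustration): the scores are sorted, the labels are reordered to match, and the PAV fit returns monotone calibrated probabilities.

import numpy as np

y_true = np.array([0, 0, 1, 0, 1, 1])
y_score = np.array([0.1, 0.35, 0.4, 0.8, 0.7, 0.9])
y_sorted, p_calibrated = sklearn_pav(y_true, y_score)
print(y_sorted)       # labels reordered by increasing score
print(p_calibrated)   # non-decreasing calibrated probabilities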
def ensure_monotone_increasing(arr_, fromright=True, fromleft=True, newmode=True): r""" Args: arr_ (ndarray): Returns: ndarray: arr CommandLine: python -m vtool.math --test-ensure_monotone_increasing --show Example: >>> # DISABLE_DOCTEST >>> from vtool.math import * # NOQA >>> rng = np.random.RandomState(0) >>> size_ = 100 >>> domain = np.arange(size_) >>> offset = ut.get_argval('--offset', type_=float, default=2.3) >>> arr_ = np.sin(np.pi * (domain / 100) - offset) + (rng.rand(len(domain)) - .5) * .1 >>> arr = ensure_monotone_increasing(arr_, fromleft=False, fromright=True) >>> result = str(arr) >>> print(result) >>> ut.quit_if_noshow() >>> import plottool as pt >>> pt.plot2(domain, arr_, 'r-', fnum=1, pnum=(2, 1, 1), title='before', equal_aspect=False) >>> pt.plot2(domain, arr, 'r-', fnum=1, pnum=(2, 1, 2), title='after monotonization (increasing)', equal_aspect=False) >>> ut.show_if_requested() """ if newmode: from sklearn.isotonic import IsotonicRegression ir = IsotonicRegression() arr = ir.fit_transform(np.arange(len(arr_)), arr_) else: arr = arr_.copy() size = len(arr) # Ensure increasing from right if fromright: for lx in range(1, size): rx = (size - lx - 1) if arr[rx] > arr[rx + 1]: arr[rx] = arr[rx + 1] if fromleft: # ensure increasing from left for lx in range(0, size - 1): if arr[lx] > arr[lx + 1]: arr[lx + 1] = arr[lx] return arr
def test_isotonic_regression_auto_increasing(): # Set y and x for decreasing y = np.array([5, 6.1, 6, 7, 10, 9, 10]) x = np.arange(len(y)) # Create model and fit_transform ir = IsotonicRegression(increasing='auto') with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") y_ = ir.fit_transform(x, y) # work-around for pearson divide warnings in scipy <= 0.17.0 assert_true(all(["invalid value encountered in " in str(warn.message) for warn in w])) # Check that relationship increases is_increasing = y_[0] < y_[-1] assert_true(is_increasing)
class LLRIsotonicRegression(LLR): """Log-likelihood ratio estimation by isotonic regression""" def __init__(self, equal_priors=False): super(LLRIsotonicRegression, self).__init__() self.equal_priors = equal_priors def fit(self, X, Y): self.prior = self._get_prior(X, Y) scores, ratios = self._get_scores_ratios(X, Y) y_min = np.min(ratios) y_max = np.max(ratios) self.ir = IsotonicRegression(y_min=y_min, y_max=y_max) self.ir.fit(scores, ratios) return self def toLogLikelihoodRatio(self, scores): """Get log-likelihood ratio given scores Parameters ---------- scores : numpy array Test scores Returns ------- llr : numpy array Log-likelihood ratio array with same shape as input `scores` """ x_min = np.min(self.ir.X_) x_max = np.max(self.ir.X_) oob_min = np.where(scores < x_min) oob_max = np.where(scores > x_max) ok = np.where((scores >= x_min) * (scores <= x_max)) calibrated = np.zeros(scores.shape) calibrated[ok] = self.ir.transform(scores[ok]) calibrated[oob_min] = self.ir.y_min calibrated[oob_max] = self.ir.y_max return calibrated
def compare_PAVA_implementations():
    trials = 10
    rs = check_random_state(0)
    times = []
    dimensions = [int(1e1), int(1e2), int(1e3), int(1e4), int(1e5), int(1e6)]
    #dimensions = [int(1e6)]
    for n in dimensions:
        print('dimensionality', n)
        x = np.arange(n)
        for trial in range(trials):
            y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))

            # scikit-learn PAVA
            if n <= int(1e5):
            #if n <= int(1e6):
                ir = IsotonicRegression()
                y_copy = np.copy(y)
                start_time = time.time()
                ir.fit_transform(x, y_copy)
                time1 = time.time() - start_time
            else:
                time1 = -1.

            # in-place PAVA
            y_copy = np.copy(y)
            start_time = time.time()
            isotonic_regression_c_2(y_copy, 0, n)
            time2 = time.time() - start_time

            # in-place PAVA++
            y_copy = np.copy(y)
            start_time = time.time()
            isotonic_regression_c(y_copy, 0, n)
            time3 = time.time() - start_time

            times.append([time1, time2, time3])

    index = []
    for n in ['1e1', '1e2', '1e3', '1e4', '1e5', '1e6']:
        index += [n] * trials
    #for n in ['1e6']: index += [n]*trials

    df = pd.DataFrame(times, index=index, columns=['sklearn', 'PAVA+', 'PAVA++'])
    print(df)
    df.to_pickle('results/PAVA_comparison_5.pkl')
def test_isotonic_dtype(): y = [2, 1, 4, 3, 5] weights = np.array([.9, .9, .9, .9, .9], dtype=np.float64) reg = IsotonicRegression() for dtype in (np.int32, np.int64, np.float32, np.float64): for sample_weight in (None, weights.astype(np.float32), weights): y_np = np.array(y, dtype=dtype) expected_dtype = \ check_array(y_np, dtype=[np.float64, np.float32], ensure_2d=False).dtype res = isotonic_regression(y_np, sample_weight=sample_weight) assert_equal(res.dtype, expected_dtype) X = np.arange(len(y)).astype(dtype) reg.fit(X, y_np, sample_weight=sample_weight) res = reg.predict(X) assert_equal(res.dtype, expected_dtype)
def plot(): results = [] for f in glob('umau_lengths*npz'): d = np.load(f) l = d['lengths'] l = l[~np.isnan(l)] l = l[np.isfinite(l)] l = l[l>0] results.append([d['mu'], l.mean()]) for f in glob('miller/lengths*npz'): d = np.load(f) if d['mu'] not in [r[0] for r in results]: l = d['lengths'] l = l[np.isfinite(l)] l = l[~np.isnan(l)] l = l[l>0] results.append([d['mu'], l.mean()]) else: idx = [r[0] for r in results].index(d['mu']) l = d['lengths'] l = l[np.isfinite(l)] l = l[~np.isnan(l)] l = l[l>0] results[idx][1] = 0.5 * (results[idx][1] + l.mean()) results = sorted(results) results = np.array(results).T muvals, mean_length = results f = plt.figure() f.clf() ax = f.gca() iso = IsotonicRegression(increasing=False) mean_length_iso = iso.fit_transform(np.arange(mean_length.shape[0]), mean_length) ax.plot(muvals, mean_length, 'k', linewidth=2, label='UMAU') ax.plot([muvals.min(), muvals.max()], [2*ndist.ppf(0.975)]*2, c='red', label='Sample splitting', linewidth=2) ax.plot([muvals.min(), muvals.max()], [np.sqrt(2)*ndist.ppf(0.975)]*2, 'k--') ax.set_xlabel(r'$\mu$', fontsize=20) ax.set_ylabel(r'E(|CI($\mu$)|)', fontsize=20) ax.legend(loc='lower right') ax.set_ylim([0,4]) ax.set_xlim([-2,9]) f.savefig('figure_b_umau.pdf')
def test_isotonic_regression_reversed(): y = np.array([10, 9, 10, 7, 6, 6.1, 5]) y_ = IsotonicRegression(increasing=False).fit_transform( np.arange(len(y)), y) assert_array_equal(np.ones(y_[:-1].shape), ((y_[:-1] - y_[1:]) >= 0))
forward = lambda X, thetas: simulate_posterior_predictive(X, thetas, noise=0.5) # Construct the calibration dataset predicted_quantiles, empirical_quantiles = make_cal_dataset( y[:, np.newaxis], x, coefs, forward) # - plt.scatter(predicted_quantiles, empirical_quantiles) plt.plot([0, 1], [0, 1], color='tab:grey', linestyle='--') plt.xlabel('Predicted Cumulative Distribution') plt.ylabel('Empirical Cumulative Distribution') plt.title('Calibration Dataset') # + # Train isotonic regression in reverse mode ir = IsotonicRegression(out_of_bounds='clip') ir.fit(empirical_quantiles, predicted_quantiles) # Find the values of calibrated quantiles calibrated_quantiles = ir.predict([0.025, 0.5, 0.975]) # + # Plot the posterior predictive low, mid, high = np.percentile(posterior_predictive, [2.5, 50, 97.5], axis=1) plt.fill_between(x_test, low, high, alpha=0.2, label='95% Predictive Interval') plt.plot(x_test, mid, color='tab:red', label='Predicted Median') low, mid, high = np.quantile(posterior_predictive, calibrated_quantiles, axis=1) plt.fill_between(x_test, low, high, alpha=0.2, label='95% Calibrated Interval')
import numpy as np import matplotlib.pyplot as plt from sklearn.isotonic import IsotonicRegression from sklearn.utils import check_random_state print("Generating Data.") n = 100 # number of data points x = np.arange(n) # x values random_seed = check_random_state(0) y = random_seed.randint(-50, 50, size=(n, )) + 50. * np.log1p(np.arange(n)) # y values # Fit IsotonicRegression models print("Fitting model.") ir = IsotonicRegression() y_ = ir.fit_transform(x, y) # Plot result print("Displaying result.") fig = plt.figure() plt.plot(x, y, 'r.', markersize=12) plt.plot(x, y_, 'b.-', markersize=12) plt.legend(('Data', 'Isotonic Fit'), loc='upper left') plt.title('Isotonic regression') plt.show()
from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

main = pd.read_csv(
    '/Users/Theo/Google Drive/College/Senior Thesis/Materials Science/data/isotonic/hasam_g.csv',
    sep=',', names=['Time', 'G'])
mainx_data = main.Time[1:60]
mainx_target = main.G[1:60]

###############################################################################
# Fit Isotonic Regression model
###############################################################################
ir = IsotonicRegression()
lr = LinearRegression()
y_ = ir.fit_transform(mainx_data, mainx_target)
predictions = ir.predict([10])
print(predictions)
print(ir.score(mainx_data, mainx_target))
#print("RSS: %.2f"
#      % np.mean((ir.predict(mainx_target) - mainy_target) ** 2))

###############################################################################
# Plot result
###############################################################################
fig = plt.figure()
plt.plot(mainx_data, mainx_target, 'r.', markersize=12)
def __init__(self, add_one=False): self.add_one = add_one self._ir = IsotonicRegression()
test_cases = [ (VotingClassifier([('logistic', LogisticRegression()), ('earth', Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]))], 'hard', weights=[1.01, 1.01]), ['predict'], create_weird_classification_problem_1()), (GradientBoostingClassifier(max_depth=10, n_estimators=10), ['predict_proba', 'predict'], create_weird_classification_problem_1()), (LogisticRegression(), ['predict_proba', 'predict'], create_weird_classification_problem_1()), (IsotonicRegression(out_of_bounds='clip'), ['predict'], create_isotonic_regression_problem_1()), (Earth(), ['predict', 'transform'], create_regression_problem_1()), (Earth(allow_missing=True), ['predict', 'transform'], create_regression_problem_with_missingness_1()), (ElasticNet(), ['predict'], create_regression_problem_1()), (ElasticNetCV(), ['predict'], create_regression_problem_1()), (LassoCV(), ['predict'], create_regression_problem_1()), (Ridge(), ['predict'], create_regression_problem_1()), (RidgeCV(), ['predict'], create_regression_problem_1()), (SGDRegressor(), ['predict'], create_regression_problem_1()), (Lasso(), ['predict'], create_regression_problem_1()), (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]), ['predict', 'predict_proba'], create_weird_classification_problem_1()), (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))], transformer_weights={
diabetes_X_test= [] diabetes_y_test= [] f = open('Datatrain.csv') for row in csv.reader(f): diabetes_X_train.append(float(row[3])) diabetes_y_train.append(float(row[4])) f.close() f = open('Datatest.csv') for row in csv.reader(f): diabetes_X_test.append(float(row[3])) diabetes_y_test.append(float(row[4])) f.close() ir = IsotonicRegression() y_ = ir.fit_transform(diabetes_X_train, diabetes_y_train) #lr = LinearRegression() #lr.fit(diabetes_X_train, diabetes_y_train) # x needs to be 2d for LinearRegression segments = [[[i, diabetes_y_train[i]], [i, y_[i]]] for i in range(len(diabetes_X_train))] lc = LineCollection(segments, zorder=0) lc.set_array(np.ones(len(diabetes_y_train))) lc.set_linewidths(0.5 * np.ones(len(diabetes_X_train))) fig = plt.figure() #plt.plot(diabetes_X_train, diabetes_y_train, 'r.', markersize=12,color='green') plt.plot(diabetes_X_test, diabetes_y_test, 'r.', markersize=12,color='black') #plt.plot(diabetes_X_train, y_, 'g.-', markersize=12,color='yellow') plt.plot(diabetes_X_test, ir.predict(diabetes_X_test), 'b-',color='red') #plt.gca().add_collection(lc)
def interpolation_estimate(Z, Z_constraint, lower=0.5, upper=4, npts=30, ndraw=5000, burnin=1000, estimator='truncated'): """ Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$ where $C$ is the convex set encoded by `Z_constraint` .. math:: C = \left\{z: Az+b \geq 0 \right\} with $(A,b)$ being `(Z_constraints.inequality, Z_constraints.inequality_offset)`. The algorithm proceeds by estimating $\|Z\|^2_2$ by Monte Carlo for a range of `npts` values starting from `lower*np.linalg.norm(Z)/np.sqrt(n)` to `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`. These values are then used to compute the GCM (Greated Convex Minorant) which is interpolated and solved for an arguments such that the expected value matches the observed value `(Z**2).sum()`. Parameters ---------- Z : `np.float` Observed data to be used to estimate $\sigma$. Should be in the cone specified by `Z_constraints`. Z_constraint : `constraints` Constraints under which we observe $Z$. lower : float Multiple of naive estimate to use as lower endpoint. upper : float Multiple of naive estimate to use as upper endpoint. npts : int Number of points in interpolation grid. ndraw : int Number of Gibbs steps to use for estimating each expectation. burnin : int How many Gibbs steps to use for burning in. Returns ------- sigma_hat : float The root of the interpolant derived from GCM values. interpolant : `interp1d` The interpolant, to be used for plotting or other diagnostics. WARNING ------- * It is assumed that `Z_constraints.equality` is `None`. * Uses `rpy2` and `fdrtool` library to compute the GCM. """ initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0]) Svalues = np.linspace(lower * initial, upper * initial, npts) Evalues = [] n = Z.shape[0] L, V, U, S = quadratic_bounds(Z, np.identity(n), Z_constraint) if estimator == 'truncated': def _estimator(S, Z, Z_constraint): L, V, U, _ = quadratic_bounds(Z, np.identity(n), Z_constraint) num = mpquad( lambda x: mpexp(-x**2 / (2 * S**2) - L * x / S**2 + (n - 1) * mplog( (x + L) / S) + 2 * mplog(x + L)), [0, U - L]) den = mpquad( lambda x: mpexp(-x**2 / (2 * S**2) - L * x / S**2 + (n - 1) * mplog((x + L) / S)), [0, U - L]) print num / den, V**2, S, (L, U) return num / den elif estimator == 'simulate': state = Z.copy() rpy.r.assign('state', state) def _estimator(S, state, Z_constraint): Z_constraint.covariance = S**2 * np.identity(Z.shape[0]) e, v, _state = expected_norm_squared(state, Z_constraint, ndraw=ndraw, burnin=burnin) state[:] = _state return e state = Z.copy() for S in Svalues: Evalues.append(_estimator(S, state, Z_constraint)) ir = IsotonicRegression() if DEBUG: print Svalues, Evalues Eiso = ir.fit_transform(Svalues, Evalues) Sinterp, Einterp = Svalues, Eiso # rpy.r.assign('S', Svalues) # rpy.r.assign('E', np.array(Evalues)) # rpy.r(''' # library(fdrtool); # G = gcmlcm(S, E, 'gcm'); # Sgcm = G$x.knots; # Egcm = G$y.knots; # ''') # Sgcm = np.asarray(rpy.r('Sgcm')) # Egcm = np.asarray(rpy.r('Egcm')) # interpolant = interp1d(Sgcm, Egcm - (Z**2).sum()) interpolant = interp1d(Sinterp, Einterp - (Z**2).sum()) try: sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max()) except: raise ValueError( '''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)''' % ((Z**2).sum(), Einterp.min(), Einterp.max())) return sigma_hat, interpolant
def calculate_probability_distribution(tree , instances , index , cal_method =None): if cal_method == None : return tree.distribution_for_instance(instances.get_instance(index)) elif cal_method == 'Platt' : p_train = np.zeros(shape=(instances.num_instances,1)) y_train = np.zeros(shape=(instances.num_instances,1)) for i,instance in enumerate(instances) : dist = tree.distribution_for_instance(instance) p_train[i] = [ (dist[1] - 0.5)*2.0 ] y_train[i] = [instance.get_value(instance.class_index)] # print("p_train ====>>>" , p_train) # print("y_train ====>>>" , y_train) dist = (tree.distribution_for_instance(instances.get_instance(index))[1]-0.5)*2.0 tmp = np.zeros(shape=(1,1)) tmp[0] = [dist] print(np.sum(y_train)) if np.sum(y_train) in [len(y_train),0]: print("all one class") for ins in instances : print("ins ===> " , ins) return tree.distribution_for_instance(instances.get_instance(index)) else : warnings.filterwarnings("ignore", category=FutureWarning) lr = LR(solver='lbfgs') lr.fit( p_train , np.ravel(y_train,order='C') ) return lr.predict_proba( tmp.reshape(1, -1))[0] elif cal_method == 'Isotonic' : p_train = np.zeros(shape=(instances.num_instances,1)) y_train = np.zeros(shape=(instances.num_instances,1)) for i,instance in enumerate(instances) : dist = tree.distribution_for_instance(instance) p_train[i] = [ dist[1] ] y_train[i] = [instance.get_value(instance.class_index)] dist = tree.distribution_for_instance(instances.get_instance(index))[1] tmp = np.zeros(shape=(1,1)) tmp[0] = [dist] print(np.sum(y_train)) if np.sum(y_train) in [len(y_train),0]: print("all one class") for ins in instances : print("ins ===> " , ins) return tree.distribution_for_instance(instances.get_instance(index)) else : ir = IR( out_of_bounds = 'clip' ) ir.fit(np.ravel(p_train,order='C') , np.ravel(y_train,order='C')) p = ir.transform( np.ravel(tmp,order='C'))[0] return [p,1-p] # elif cal_method == 'ProbabilityCalibrationTree' : # pass elif cal_method == 'ICP' : pass elif cal_method == 'Venn1' : calibrPts = [] for i,instance in enumerate(instances) : dist = tree.distribution_for_instance(instance) score = dist[0] if dist[1] < dist[0] else dist[1] calibrPts.append( ( (score) , instance.get_value(instance.class_index) ) ) dist = (tree.distribution_for_instance(instances.get_instance(index))) score = dist[0] if dist[1] < dist[0] else dist[1] tmp = [score] p0,p1=VennABERS.ScoresToMultiProbs(calibrPts,tmp) print("Vennnnnn =========>>>>>>>>>>>> ", p0, " , ",p1) return [p0,p1] pass
class IsotonicCalibrator(BaseEstimator, RegressorMixin): """Probability calibration with isotonic regression. Note ---- This class backports and extends `sklearn.isotonic.IsotonicRegression`. """ def __init__(self, y_min=None, y_max=None, increasing=True, interpolation=False): """Constructor. Parameters ---------- * `y_min` [optional]: If not `None`, set the lowest value of the fit to `y_min`. * `y_max` [optional]: If not `None`, set the highest value of the fit to `y_max`. * `increasing` [boolean or string, default=`True`]: If boolean, whether or not to fit the isotonic regression with `y` increasing or decreasing. The string value `"auto"` determines whether `y` should increase or decrease based on the Spearman correlation estimate's sign. * `interpolation` [boolean, default=`False`]: Whether linear interpolation is enabled or not. """ self.y_min = y_min self.y_max = y_max self.increasing = increasing self.interpolation = interpolation def fit(self, T, y, sample_weight=None): """Fit using `T`, `y` as training data. Parameters ---------- * `T` [array-like, shape=(n_samples,)]: Training data. * `y` [array-like, shape=(n_samples,)]: Training target. * `sample_weight` [array-like, shape=(n_samples,), optional]: Weights. If set to None, all weights will be set to 1. Returns ------- * `self` [object]: `self`. Notes ----- `T` is stored for future use, as `predict` needs T to interpolate new input data. """ # Check input T = column_or_1d(T) # Fit isotonic regression self.ir_ = IsotonicRegression(y_min=self.y_min, y_max=self.y_max, increasing=self.increasing, out_of_bounds="clip") self.ir_.fit(T, y, sample_weight=sample_weight) # Interpolators if self.interpolation: p = self.ir_.transform(T) change_mask1 = (p - np.roll(p, 1)) > 0 change_mask2 = np.roll(change_mask1, -1) change_mask1[0] = True change_mask1[-1] = True change_mask2[0] = True change_mask2[-1] = True self.interp1_ = interp1d(T[change_mask1], p[change_mask1], bounds_error=False, fill_value=(0., 1.)) self.interp2_ = interp1d(T[change_mask2], p[change_mask2], bounds_error=False, fill_value=(0., 1.)) return self def predict(self, T): """Calibrate data. Parameters ---------- * `T` [array-like, shape=(n_samples,)]: Data to calibrate. Returns ------- * `Tt` [array, shape=(n_samples,)]: Calibrated data. """ if self.interpolation: T = column_or_1d(T) return 0.5 * (self.interp1_(T) + self.interp2_(T)) else: return self.ir_.transform(T)
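A hedged usage sketch for the calibrator above, on synthetic scores and labels (the data and variable names are invented; the class is assumed to sit in a module where its sklearn and scipy dependencies are imported):

import numpy as np

rng = np.random.RandomState(0)
scores = np.sort(rng.rand(200))                  # raw classifier scores
labels = (rng.rand(200) < scores).astype(float)  # labels roughly consistent with the scores

cal = IsotonicCalibrator(y_min=0.0, y_max=1.0, interpolation=True)
cal.fit(scores, labels)
print(cal.predict(np.array([0.1, 0.5, 0.9])))  # calibrated probabilities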
def _smacof_single(dissimilarities1, dissimilarities2, p, weights1=None,
                   weights2=None, metric=True, n_components=2, init1=None,
                   init2=None, max_iter=300, verbose=0, eps=1e-3,
                   random_state1=None, random_state2=None):
    """Computes multidimensional scaling using SMACOF algorithm

    Parameters
    ----------
    dissimilarities : ndarray, shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : boolean, optional, default: True
        Compute metric or nonmetric SMACOF algorithm.

    n_components : int, optional, default: 2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : ndarray, shape (n_samples, n_components), optional, default: None
        Starting configuration of the embedding to initialize the algorithm.
        By default, the algorithm is initialized with a randomly chosen array.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, optional, default: 0
        Level of verbosity.

    eps : float, optional, default: 1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is given,
        it fixes the seed. Defaults to the global numpy random number
        generator.

    Returns
    -------
    X : ndarray, shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).

    n_iter : int
        The number of iterations corresponding to the best stress.
    """
    dissimilarities1 = check_symmetric(dissimilarities1, raise_exception=True)
    dissimilarities2 = check_symmetric(dissimilarities2, raise_exception=True)
    if dissimilarities1.shape != dissimilarities2.shape:
        print("Error. Distance matrices have different shapes.")
        sys.exit("Error. Distance matrices have different shapes.")

    n_samples = dissimilarities1.shape[0]
    X1, sim_flat1, sim_flat_w1 = initialize(dissimilarities1, random_state1,
                                            init1, n_samples, n_components)
    X2, sim_flat2, sim_flat_w2 = initialize(dissimilarities2, random_state2,
                                            init2, n_samples, n_components)

    # Default: equal weights
    if weights1 is None:
        weights1 = np.ones((n_samples, n_samples))
    if weights2 is None:
        weights2 = np.ones(n_samples)

    # Disparity-specific weights (V in Borg)
    V1 = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        diagonal = 0
        for j in range(n_samples):
            V1[i, j] = -weights1[i, j]
            diagonal += weights1[i, j]
        V1[i, i] = diagonal

    # Locus-specific weights
    V2 = np.zeros((n_samples, n_samples))
    for i, weight in enumerate(weights2):
        V2[i, i] = weight * p * n_samples

    inv_V = moore_penrose(V1 + V2)

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis1 = euclidean_distances(X1)
        dis2 = euclidean_distances(X2)

        if metric:
            disparities1 = dissimilarities1
            disparities2 = dissimilarities2
        else:
            disparities1 = nonmetric_disparities1(dis1, sim_flat1, n_samples)
            disparities2 = nonmetric_disparities2(dis2, sim_flat2, n_samples)

        # Compute stress; multiply the ssd term by n_samples to make it
        # comparable in magnitude to the embedding error terms
        stress = ((dis1.ravel() - disparities1.ravel())**2).sum() + \
                 ((dis2.ravel() - disparities2.ravel())**2).sum() + \
                 n_samples * p * ssd(X1, X2)

        # Update X1 using the Guttman transform
        X1 = guttman(X1, X2, disparities1, inv_V, V2, dis1)
        # Update X2 using the Guttman transform
        X2 = guttman(X2, X1, disparities2, inv_V, V2, dis2)

        # Test stress
        dis1 = np.sqrt((X1**2).sum(axis=1)).sum()
        dis2 = np.sqrt((X2**2).sum(axis=1)).sum()
        dis = np.mean((dis1, dis2))
        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if np.abs(old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s'
                          % (it, stress))
                break
        old_stress = stress / dis

    return X1, X2, stress, it + 1
import matplotlib.pyplot as plt from matplotlib.collections import LineCollection from sklearn.linear_model import LinearRegression from sklearn.isotonic import IsotonicRegression from sklearn.utils import check_random_state n = 100 x = np.arange(n) rs = check_random_state(0) y = rs.randint(-50, 50, size=(n, )) + 50.0 * np.log1p(np.arange(n)) # %% # Fit IsotonicRegression and LinearRegression models: ir = IsotonicRegression(out_of_bounds="clip") y_ = ir.fit_transform(x, y) lr = LinearRegression() lr.fit(x[:, np.newaxis], y) # x needs to be 2d for LinearRegression # %% # Plot results: segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)] lc = LineCollection(segments, zorder=0) lc.set_array(np.ones(len(y))) lc.set_linewidths(np.full(n, 0.5)) fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 6))
class _CalibratedClassifier: """Probability calibration with isotonic regression or sigmoid. It assumes that base_estimator has already been fit, and trains the calibration on the input set of the fit function. Note that this class should not be used as an estimator directly. Use CalibratedClassifierCV with cv="prefit" instead. Parameters ---------- base_estimator : instance BaseEstimator The classifier whose output decision function needs to be calibrated to offer more accurate predict_proba outputs. No default value since it has to be an already fitted estimator. method : 'sigmoid' | 'isotonic' The method to use for calibration. Can be 'sigmoid' which corresponds to Platt's method or 'isotonic' which is a non-parametric approach based on isotonic regression. classes : array-like, shape (n_classes,), optional Contains unique classes used to fit the base estimator. if None, then classes is extracted from the given target values in fit(). See also -------- CalibratedClassifierCV References ---------- .. [1] Obtaining calibrated probability estimates from decision trees and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001 .. [2] Transforming Classifier Scores into Accurate Multiclass Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002) .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods, J. Platt, (1999) .. [4] Predicting Good Probabilities with Supervised Learning, A. Niculescu-Mizil & R. Caruana, ICML 2005 """ def __init__(self, base_estimator, method='isotonic', classes=None): self.base_estimator = base_estimator self.method = method self.classes = classes def _preproc(self, X): n_classes = len(self.classes_) probabilities = self.base_estimator.predict_proba(X)[:, 1] idx_pos_class = self.label_encoder_.\ transform(self.base_estimator.classes_) return probabilities, idx_pos_class def fit(self, X, y): """Calibrate the fitted model Parameters ---------- X : array-lie, shape (n_samples,) Predictions from the base_estimator y : array-like, shape (n_samples,) Target values. sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Returns ------- self : object Returns an instance of self. """ self.label_encoder_ = LabelEncoder() if self.classes is None: self.label_encoder_.fit(y) else: self.label_encoder_.fit(self.classes) self.classes_ = self.label_encoder_.classes_ self.calibrator_ = IsotonicRegression(out_of_bounds='clip') self.calibrator_.fit(X, y) return self def predict_proba(self, X): """Posterior probabilities of classification This function returns posterior probabilities of classification according to each class on an array of test vectors X. Parameters ---------- X : array-like, shape (n_samples, n_features) The samples. Returns ------- C : array, shape (n_samples, n_classes) The predicted probas. Can be exact zeros. """ n_classes = len(self.classes_) proba = np.zeros((X.shape[0], n_classes)) probabilities, idx_pos_class = self._preproc(X) proba[:, 1] = self.calibrator_.predict(probabilities) # Normalize the probabilities if n_classes == 2: proba[:, 0] = 1. - proba[:, 1] else: proba /= np.sum(proba, axis=1)[:, np.newaxis] # XXX : for some reason all probas can be 0 proba[np.isnan(proba)] = 1. / n_classes # Deal with cases where the predicted probability minimally exceeds 1.0 proba[(1.0 < proba) & (proba <= 1.0 + 1e-5)] = 1.0 return proba
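A hedged usage sketch for the class above on synthetic data; note that `fit` takes the base estimator's positive-class probabilities (a 1-D array), while `predict_proba` takes raw feature vectors. The split and variable names below are illustrative, not part of the class.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(300, 2)
y = (X[:, 0] + 0.5 * rng.randn(300) > 0).astype(int)
X_fit, y_fit = X[:200], y[:200]      # used to fit the base estimator
X_cal, y_cal = X[200:], y[200:]      # held out for calibration

base = LogisticRegression().fit(X_fit, y_fit)
calibrated = _CalibratedClassifier(base)
calibrated.fit(base.predict_proba(X_cal)[:, 1], y_cal)
print(calibrated.predict_proba(X_cal[:5]))  # calibrated two-column probabilities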
def truncated_estimate(Z, Z_constraint, lower=0.5, upper=2, npts=15): """ Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$ where $C$ is the convex set encoded by `Z_constraints` .. math:: C = \left\{z: Az+b \geq 0 \right\} with $(A,b)$ being `(Z_constraints.inequality, Z_constraints.inequality_offset)`. The algorithm proceeds by estimating $\|Z\|^2_2$ by Monte Carlo for a range of `npts` values starting from `lower*np.linalg.norm(Z)/np.sqrt(n)` to `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`. These values are then used to compute the GCM (Greated Convex Minorant) which is interpolated and solved for an arguments such that the expected value matches the observed value `(Z**2).sum()`. Parameters ---------- Z : `np.float` Observed data to be used to estimate $\sigma$. Should be in the cone specified by `Z_constraints`. Z_constraint : `constraints` Constraints under which we observe $Z$. lower : float Multiple of naive estimate to use as lower endpoint. upper : float Multiple of naive estimate to use as upper endpoint. npts : int Number of points in interpolation grid. Returns ------- sigma_hat : float The root of the interpolant derived from GCM values. interpolant : `interp1d` The interpolant, to be used for plotting or other diagnostics. WARNING ------- * It is assumed that `Z_constraints.equality` is `None`. * Uses `rpy2` and `fdrtool` library to compute the GCM. """ initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0]) Svalues = np.linspace(lower * initial, upper * initial, npts) Evalues = [] # use truncated chi to estimate integral # with scipy.integrate.quad n = Z.shape[0] operator = np.identity(n) L, V, U, S = quadratic_bounds(Z, operator, Z_constraint) for S in Svalues: num = quad(lambda x: np.exp(-x**2 / (2 * S**2) + (n + 1) * np.log(x)), L, U) den = quad(lambda x: np.exp(-x**2 / (2 * S**2) + (n - 1) * np.log(x)), L, U) Evalues.append(num[0] / den[0]) print num, den ir = IsotonicRegression() if DEBUG: print Svalues, Evalues Eiso = ir.fit_transform(Svalues, Evalues) Sinterp, Einterp = Svalues, Eiso interpolant = interp1d(Sinterp, Einterp - (Z**2).sum()) try: sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max()) except: raise ValueError( '''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)''' % ((Z**2).sum(), Einterp.min(), Einterp.max())) return sigma_hat, interpolant print L, V, U, S
def known_iso(self, axis=1, unknowns=0): # performs isotonic regression ONLY for known data values # and ONLY on columns where there are non-increasing points # row-wise (axis = 0) or column-wise (axis = 1) # unknowns should be 0 or none tonic = copy.deepcopy(self.array) # returns a new isotonic matrix known_dict = self.known_for_iso(axis, unknowns) if axis == 1: increase_dict, non_increase_percent = self.is_col_inc() else: increase_dict = self.is_row_inc() # dat dict tells me where things arent increasing (from is_row_inc() or is_col_inc()) if axis == 1: for i in range(len(tonic[0])): try: # if i is a key in increase dict then this column needs regression # else just pass to the next column tester = increase_dict[i] X = known_dict[i] if X != []: initial_vals = [tonic[j][i] for j in X] # Use the initial values to fit the model and then predict what the decreasing ones should be iso = IsotonicRegression(out_of_bounds='clip').fit( X, initial_vals) predictions = iso.predict(range(len(tonic))) # put everything back: for row in range(len(predictions)): tonic[row][i] = predictions[row] except: pass else: # same thing but with rows for i in range(len(tonic)): try: tester = increase_dict[i] X = known_dict[i] if X != []: initial_vals = [tonic[i][j] for j in X] # Use the initial values to fit the model and then predict what the decreasing ones should be iso = IsotonicRegression(out_of_bounds='clip').fit( X, initial_vals) predictions = iso.predict(range(len(tonic[i]))) # put everything back: tonic[i] = predictions except: pass newframe = pd.DataFrame(tonic) newframe.columns = self.dataframe.columns newframe.index = self.dataframe.index if unknowns == 0: # Isotonic outputs NaN values, replace them with zeros newframe = newframe.fillna(0) return mat_opr(newframe)
def test_isotonic_copy_before_fit(): # https://github.com/scikit-learn/scikit-learn/issues/6628 ir = IsotonicRegression() copy.copy(ir)
def __init__(self): self.clf = IsotonicRegression(y_min=0.0, y_max=1.0, out_of_bounds='clip')
def fit_spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               resolution, min_dist, max_dist, verbose):
    if verbose:
        print("\nFit a univariate spline to the probability means\n")
        print("------------------------------------------------------------------------------------\n")

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y)**2

    # use fitpack2 method - fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### NOW I DO THIS BY CALLING A SKLEARN ISOTONIC REGRESSION
    ### This does the isotonic regression using option antitonic to make sure
    ### I get monotonically decreasing probabilities with increasing genomic distance
    min_x, max_x = min(x), max(x)
    tempList = sorted([dis for dis in mainDic])
    splineX = []
    ### The below for loop will make sure nothing is out of range of [min(x) max(x)]
    ### Therefore everything will be within the range where the spline is defined
    for i in tempList:
        if min_x <= i <= max_x:
            splineX.append(i)
    splineY = ius(splineX)

    ir = IsotonicRegression(increasing=False)
    rNewSplineY = ir.fit_transform(splineX, splineY)
    newSplineY = []
    diff = []
    diffX = []
    for i in range(len(rNewSplineY)):
        newSplineY.append(rNewSplineY[i])
        if (splineY[i] - newSplineY[i]) > 0:
            diff.append(splineY[i] - newSplineY[i])
            diffX.append(splineX[i])
    ### Now newSplineY holds the monotonic contact probabilities

    residual = sum([i * i for i in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title('Univariate spline fit to the output of equal occupancy binning. \n Residual= %e' % (residual),
              size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y], 'ro',
             label="Means")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY],
             'g-', label="Spline fit")
    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([min_x / 1000.0, max_x / 1000.0])
    ax.legend(loc="upper right")
    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data
    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([min_x, max_x])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    # NOW write the calculated pvalues and corrected pvalues in a file
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    discardCount = 0
    if verbose:
        print("lower bound on mid-range distances " + repr(min_dist) +
              ", upper bound on mid-range distances " + repr(max_dist) + "\n")

    with gzip.open(infilename, 'r') as infile:
        with gzip.open('{}.res{}.significances.txt.gz'.format(outfilename, resolution), 'w') as outfile:
            outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
            for line in infile:
                chr1, mid1, chr2, mid2, contactCount = line.rstrip().split()
                mid1, mid2, contactCount = int(mid1), int(mid2), int(contactCount)
                distance = mid2 - mid1

                bias1 = 1.0
                bias2 = 1.0  # assumes there is no bias to begin with
                # if the biasDic is not null sets the real bias values
                if len(biasDic) > 0:
                    if chr1 in biasDic and mid1 in biasDic[chr1]:
                        bias1 = biasDic[chr1][mid1]
                    if chr2 in biasDic and mid2 in biasDic[chr2]:
                        bias2 = biasDic[chr2][mid2]

                if min_dist <= distance <= max_dist:
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp = min(max(distance, min_x), max_x)
                    i = min(bisect.bisect_left(splineX, distToLookUp),
                            len(splineX) - 1)
                    prior_p = newSplineY[i] * (bias1 * bias2)  # biases added in the picture
                    p_val = scsp.bdtrc(contactCount - 1,
                                       observedIntraInRangeSum, prior_p)
                    if p_val <= 1:
                        outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                            chr1, mid1, chr2, mid2, contactCount, p_val, -1))

    return splineX, newSplineY, residual
'severe_wind' : 'wnd_probs_>40_prob_max' } iterator = itertools.product(time_set, target_set,) for combo in iterator: time, target = combo print(f'Loading {time} data...') fname = join(config.ML_DATA_STORAGE_PATH, f'{time}_training_matched_to_{target}_0km_dataset.pkl') data = io.load_dataframe(fname=fname, target_vars=['matched_to_tornado_0km', 'matched_to_severe_hail_0km','matched_to_severe_wind_0km' ], vars_to_drop=target_vars ) examples = data['examples'] baseline_probs = baseline_var[target] forecast_probabilities = examples[baseline_probs] target_values = data[f'matched_to_{target}_0km'] iso_reg = IsotonicRegression(out_of_bounds='clip') iso_reg.fit(forecast_probabilities, target_values) save_fname = f'calibration_model_wofs_{time}_{target}_{baseline_probs}.joblib' joblib.dump(iso_reg, join(config.ML_MODEL_SAVE_PATH, save_fname))
class Forecaster(nn.Module): def __init__(self, args): super(Forecaster, self).__init__() self.args = args def eval_all(self, bx, by): br = torch.rand(bx.shape[0], 1, device=bx.device) mean, stddev = self.forward(bx=bx, br=br) cdf = 0.5 * (1.0 + torch.erf((by - mean) / stddev / math.sqrt(2))) loss_cdf = torch.abs(cdf - br).mean() eps = 1e-5 loss_cdf_kl = cdf * (torch.log(cdf + eps) - torch.log(br + eps)) + \ (1 - cdf) * (torch.log(1 - cdf + eps) - torch.log(1 - br + eps)) loss_cdf_kl = loss_cdf_kl.mean() loss_stddev = stddev.mean() # loss_l2 = ((by - mean) ** 2).mean() # Log likelihood of by under the predicted Gaussian distribution loss_nll = torch.log(stddev) + math.log(2 * math.pi) / 2.0 + (( (by - mean) / stddev)**2 / 2.0) loss_nll = loss_nll.mean() return cdf, loss_cdf * ( 1 - self.args.klcoeff ) + loss_cdf_kl * self.args.klcoeff, loss_stddev, loss_nll def eval_in_batch(self, bx, by, batch_size): pass def recalibrate(self, bx, by): with torch.no_grad(): cdf = self.eval_all(bx, by)[0].cpu().numpy()[:, 0].astype(np.float) cdf = np.sort(cdf) lin = np.linspace(0, 1, int(cdf.shape[0])) # Insert an extra 0 and 1 to ensure the range is always [0, 1], and trim CDF for numerical stability cdf = np.clip(cdf, a_max=1.0 - 1e-6, a_min=1e-6) cdf = np.insert(np.insert(cdf, -1, 1), 0, 0) lin = np.insert(np.insert(lin, -1, 1), 0, 0) self.iso_transform = IsotonicRegression() self.iso_transform.fit_transform(cdf, lin) def apply_recalibrate(self, cdf): if self.iso_transform is not None: # If input tensor output tensor # If input numpy array output numpy array is_torch = False if isinstance(cdf, type(torch.zeros(1))): device = cdf.get_device() cdf = cdf.cpu().numpy() is_torch = True original_shape = cdf.shape new_cdf = np.reshape(self.iso_transform.transform(cdf.flatten()), original_shape) if is_torch: new_cdf = torch.from_numpy(new_cdf).to(device) return new_cdf else: return cdf
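A standalone sketch of the recalibration idea used in `recalibrate` above, stripped of the PyTorch model and written on plain NumPy data: isotonic regression maps the model's (possibly miscalibrated) CDF values to their empirical quantile levels. The beta-distributed values below are synthetic stand-ins for the model CDF outputs.

import numpy as np
from sklearn.isotonic import IsotonicRegression

cdf = np.sort(np.random.RandomState(0).beta(2, 5, size=500))  # miscalibrated probability levels
lin = np.linspace(0, 1, cdf.shape[0])                          # target empirical quantiles

iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(cdf, lin)
print(iso.predict(np.array([0.1, 0.5, 0.9])))  # recalibrated CDF levels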
def _smacof_with_anchors_single(config, similarities, metric=True, n_components=2,
                                init=None, max_iter=300, verbose=0, eps=1e-3,
                                random_state=None):
    """Computes multidimensional scaling using the SMACOF algorithm.

    Parameters
    ----------
    config : Config object
        configuration object for anchor-tag deployment parameters
    similarities : symmetric ndarray, shape [n * n]
        similarities between the points
    metric : boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm
    n_components : int, optional, default: 2
        number of dimensions in which to immerse the similarities,
        overwritten if an initial array is provided
    init : {None or ndarray}, optional
        if None, randomly chooses the initial configuration;
        if ndarray, initializes the SMACOF algorithm with this array
    max_iter : int, optional, default: 300
        maximum number of iterations of the SMACOF algorithm for a single run
    verbose : int, optional, default: 0
        level of verbosity
    eps : float, optional, default: 1e-3
        relative tolerance w.r.t. stress to declare convergence
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is given,
        it fixes the seed. Defaults to the global numpy random number generator.

    Returns
    -------
    X : ndarray (n_samples, n_components), float
        coordinates of the n_samples points in an n_components-space
    stress_ : float
        the final value of the stress (sum of squared distances of the
        disparities and the distances for all constrained points)
    n_iter : int
        number of iterations run
    last_positions : ndarray [X1, ..., Xn]
        an array of computed Xs
    """
    NO_OF_TAGS, NO_OF_ANCHORS = config.no_of_tags, config.no_of_anchors
    similarities = check_symmetric(similarities, raise_exception=True)
    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]

    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
        # uncomment the following if weight matrix W is not hollow
        # X[:-2] = Xa
    else:
        # overrides the parameter n_components
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()

    # set up weight matrix
    weights = np.ones((n_samples, n_samples))
    if getattr(config, 'missingdata', None):
        weights[-NO_OF_TAGS:, -NO_OF_TAGS:] = 0
    diag = np.arange(n_samples)
    weights[diag, diag] = 0

    last_n_configs = []
    Xa = config.anchors
    for it in range(max_iter):
        # Compute distances and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt(
                (n_samples * (n_samples - 1) / 2) / (disparities**2).sum())

        # Compute stress
        stress = (weights.ravel() *
                  (dis.ravel() - disparities.ravel())**2).sum() / 2
        # stress = ((dis[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel() -
        #            disparities[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel()) ** 2).sum()

        # Update X using the Guttman transform
        dis[dis == 0] = 1e5
        ratio = weights * disparities / dis
        B = -ratio
        B[diag, diag] = 0
        B[diag, diag] = -B.sum(axis=1)

        # Apply the update only to the tag configuration, since the anchor
        # configuration is already known
        V = -weights
        V[diag, diag] += weights.sum(axis=1)
        # V_inv = np.linalg.pinv(V)
        V12 = V[-NO_OF_TAGS:, :-NO_OF_TAGS]
        B11 = B[-NO_OF_TAGS:, -NO_OF_TAGS:]
        Zu = X[-NO_OF_TAGS:]
        B12 = B[-NO_OF_TAGS:, :-NO_OF_TAGS]
        V11_inv = np.linalg.inv(V[-NO_OF_TAGS:, -NO_OF_TAGS:])
        Xu = V11_inv.dot(B11.dot(Zu) + (B12 - V12).dot(Xa))

        # merge the known anchor configuration with the new tag configuration
        X = np.concatenate((Xa, Xu))
        last_n_configs.append(X)

        # X = (1 / n_samples) * B.dot(X)
        # dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        dis = (weights * dis**2).sum() / 2

        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' %
                          (it, stress))
                break
        old_stress = stress / dis

    return X, stress, it + 1, np.array(last_n_configs)
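# A minimal, self-contained sketch (not part of the original code) of the
# monotonic-regression step used in the non-metric branch above: the current
# embedding distances are isotonically regressed on the flattened similarities
# to obtain disparities. The toy arrays below are illustrative only.
import numpy as np
from sklearn.isotonic import IsotonicRegression

sim_flat_w = np.array([0.2, 0.5, 0.7, 0.9])   # non-zero similarities (toy)
dis_flat_w = np.array([1.3, 0.9, 1.8, 2.1])   # current embedding distances (toy)

ir = IsotonicRegression()
disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
print(disparities_flat)  # non-decreasing in the similarities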
# compute least squares regression for R² metric: visual to conceptual
linear_regression = LinearRegression()
linear_regression.fit(visual_vector, conceptual_vector)
predictions = linear_regression.predict(visual_vector)
r2_linear = r2_score(conceptual_vector, predictions)
print("R² linear visual to conceptual:", r2_linear)

# compute least squares regression for R² metric: conceptual to visual
linear_regression = LinearRegression()
linear_regression.fit(conceptual_vector, visual_vector)
predictions = linear_regression.predict(conceptual_vector)
r2_linear = r2_score(visual_vector, predictions)
print("R² linear conceptual to visual:", r2_linear)

# compute isotonic regression for R² metric: visual to conceptual
x = np.reshape(visual_dissimilarities, (-1))
y = np.reshape(conceptual_dissimilarities, (-1))
isotonic_regression = IsotonicRegression()
predictions = isotonic_regression.fit_transform(x, y)
r2_isotonic = r2_score(y, predictions)
print("R² isotonic visual to conceptual:", r2_isotonic)

# compute isotonic regression for R² metric: conceptual to visual
x = np.reshape(conceptual_dissimilarities, (-1))
y = np.reshape(visual_dissimilarities, (-1))
isotonic_regression = IsotonicRegression()
predictions = isotonic_regression.fit_transform(x, y)
r2_isotonic = r2_score(y, predictions)
print("R² isotonic conceptual to visual:", r2_isotonic)

if args.plot:
    # create scatter plot if the user wants us to
    fig, ax = plt.subplots(figsize=(12, 12))
from sklearn.isotonic import IsotonicRegression
import matplotlib.pyplot as plt

# the reported preference orderings
x = list(range(1, 7))
# the estimated preference orderings according to the additive model (16.1) and
# the metric solution (Table 16.6) in MVA
y = [0.84, 2.84, 3.16, 3.34, 5.66, 5.16]

gp = IsotonicRegression()
y_gp = gp.fit_transform(x, y)

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y_gp, c="k")
ax.scatter(x, y, c="r")
for i in range(0, len(y)):
    ax.text(x[i] - 0.05, y_gp[i] + 0.1, "car" + str(i + 1), fontsize=14)

plt.xlabel("revealed rankings", fontsize=14)
plt.ylabel("estimated rankings", fontsize=14)
plt.title("Car rankings", fontsize=16)
plt.show()
def find_regions(t, x, minimum_datapoints=10, mu_factor=0.7, low_density_factor=0.01):
    """Finds clean regions of gradual growth between jump events.

    Args:
        t (1D numpy.array): clean time points
        x (1D numpy.array): cleaned log-OD series (same shape as `t`)
        minimum_datapoints (int): regions must have at least this many data points
        mu_factor (float): the linear fit is tempered by this factor before
            isotonic regression. Low values (0.0..0.5) can result in missing
            jump events. High values (0.8..1.0) can result in oversegmentation,
            i.e. false jump events.
        low_density_factor (float): a gap is declared wherever the spacing
            between consecutive time points exceeds avg_dt / low_density_factor

    Returns:
        numpy.array: list of start indexes for the regions
        numpy.array: list of end indexes (inclusive) for the regions
    """
    # find gaps in the data
    avg_dt = (t[-1] - t[0]) / (len(t) - 1)
    gap_start_indexes = np.where(np.diff(t) > avg_dt / low_density_factor)[0]

    # build initial set of regions from these gaps
    s_raw = [0]
    e_raw = []
    for gap_idx in gap_start_indexes:
        e_raw.append(gap_idx)
        s_raw.append(gap_idx + 1)
    e_raw.append(len(t) - 1)
    regions_to_investigate = list(zip(s_raw, e_raw))

    s = []
    e = []
    while len(regions_to_investigate) > 0:
        # pick a new region
        start_idx, end_idx = regions_to_investigate.pop()

        # check that there are at least a minimum number of datapoints
        if end_idx - start_idx + 1 < minimum_datapoints:
            continue

        # find optimal drift
        t_region = t[start_idx:end_idx + 1]
        x_region = x[start_idx:end_idx + 1]
        mu_min = LinearRegression(fit_intercept=True) \
            .fit(t_region.reshape([-1, 1]), x_region) \
            .coef_

        # fit monotonic function
        x_drifting = x_region - t_region * mu_min * mu_factor
        iso_reg = IsotonicRegression(increasing=False) \
            .fit(t_region, x_drifting)
        x_segmented = iso_reg.predict(t_region)

        # find jumps
        jump_indexes = np.where(np.diff(x_segmented) < 0)[0] + start_idx
        if len(jump_indexes) > 0:
            # if found, add the sub-regions to the list of new regions
            start_indexes = [start_idx]
            end_indexes = []
            for jump_idx in jump_indexes:
                end_indexes.append(jump_idx)
                start_indexes.append(jump_idx + 1)
            end_indexes.append(end_idx)
            for start_idx, end_idx in zip(start_indexes, end_indexes):
                regions_to_investigate.append((start_idx, end_idx))
        else:
            # if no sub-regions are found, add the region to the final set
            s.append(start_idx)
            e.append(end_idx)

    s.sort()
    e.sort()
    return np.array(s), np.array(e)
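# Hedged usage sketch for find_regions() above (not from the original source):
# a synthetic log-OD trace with one downward jump near t = 5. The exact region
# boundaries depend on the tuning parameters, so only the call pattern is shown.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression

t = np.linspace(0, 10, 200)
x = 0.3 * t
x[t > 5] -= 1.0           # simulated dilution/jump event
s, e = find_regions(t, x)
print(list(zip(s, e)))    # start/end indexes of clean growth regions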
        # Test
        img_test, truth_mask_test, predicted_mask_test = img, mask, model.predict(img)
        test_batch_size = tf.shape(img).numpy()[0]
    else:
        logging.warning("Skipping some data!!!")
        break

# Flatten the base model predictions and true values
predicted_mask_train_arr = flatten_tensor(predicted_mask_train)
truth_mask_train_arr = flatten_tensor(truth_mask_train)
predicted_mask_test_arr = flatten_tensor(predicted_mask_test)
truth_mask_test_arr = flatten_tensor(truth_mask_test)

iso_regression = IsotonicRegression(out_of_bounds='clip')
iso_regression.fit(predicted_mask_train_arr, truth_mask_train_arr)
p_calibrated = iso_regression.predict(predicted_mask_test_arr)

calibration_model_fn = os.path.join(FLAGS.output, 'calibration.weights')
dump(iso_regression, calibration_model_fn)

plot_calibration_curve(truth_mask_test_arr, p_calibrated, output=FLAGS.output)

# Convert 1-d ndarray to a tensor with 3 channels
p_calibrated = np.reshape(
    p_calibrated, [test_batch_size, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNEL])

metrics = calculate_metrics(truth_mask_test, predicted_mask_test, p_calibrated)
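# Hedged sketch (assumption: `dump` above comes from joblib) of how the saved
# isotonic calibration model could be reloaded and applied to new per-pixel
# probabilities at inference time. The path and values are illustrative only.
import numpy as np
from joblib import load

iso_regression = load('calibration.weights')      # file written by dump() above
raw_probs = np.array([0.05, 0.40, 0.75, 0.99])    # toy uncalibrated scores
calibrated = iso_regression.predict(raw_probs)    # out_of_bounds='clip' keeps values in range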
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write("------------------------------------------------------------------------------------\n")

    splineX = None
    newSplineY = None
    residual = None
    FDRx = None
    FDRy = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1, len(x)):
            if x[i] <= x[i - 1]:
                print("ERROR in spline fitting. Distances are not increasing across bins. Ensure interaction file is correct.")
                print("Avg. distance of bin(i-1)... %s" % x[i - 1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError = min(y) * min(y)

        # use fitpack2 method - fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)

        tempMaxX = max(x)
        tempMinX = min(x)
        tempList = sorted([dis for dis in mainDic])
        splineX = []
        ### The below for loop will make sure nothing is out of range of [min(x), max(x)]
        ### Therefore everything will be within the range where the spline is defined
        for i in tempList:
            if tempMinX <= i <= tempMaxX:
                splineX.append(i)
        splineY = ius(splineX)
        # print(splineY)
        # print(yerr)

        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)
        # print(newSplineY)
        residual = sum([i * i for i in (y - ius(x))])

        if visual == True:
            xi = np.linspace(min(x), max(x), 5 * len(x))
            yi = ius(xi)
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb),
                     myUtils.scale_a_list(newSplineY, toProb),
                     'g-', label="spline-" + str(passNo), linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='r.', label="Mean with std. error", linewidth=2)
            # plt.ylabel('Contact probability (x10$^{-5}$)', fontsize='large')
            # plt.xlabel('Genomic distance (kb)', fontsize='large')
            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
            plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
            ax.legend(loc="upper right")
            ax = fig.add_subplot(2, 1, 2)
            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')
            plt.savefig(outfilename + '.png')

    # NOW write the calculated p-values and corrected p-values to a file
    infile = gzip.open(infilename, 'rt')
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    discardCount = 0
    p_vals = []
    q_vals = []
    biasl = []
    biasr = []
    for line in infile:
        ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
        contactCount = float(contactCount)
        interxn = myUtils.Interaction([ch1, int(mid1), ch2, int(mid2)])
        interxn.setCount(contactCount)
        mid1 = int(mid1)
        mid2 = int(mid2)
        interactionType = interxn.getType(distLowThres, distUpThres)
        bias1 = 1.0
        bias2 = 1.0  # assumes there is no bias to begin with
        # if the biasDic is not null, set the real bias values
        if biasDic:
            if ch1 in biasDic and mid1 in biasDic[ch1]:
                bias1 = biasDic[ch1][mid1]
            if ch2 in biasDic and mid2 in biasDic[ch2]:
                bias2 = biasDic[ch2][mid2]
        biasl.append(bias1)
        biasr.append(bias2)

        if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
            prior_p = 1.0
            p_val = 1.0
            discardCount += 1
        elif interactionType == 'intraInRange' and not interOnly:
            distToLookUp = max(interxn.getDistance(), min(x))
            distToLookUp = min(distToLookUp, max(x))
            i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
            prior_p = newSplineY[i] * (bias1 * bias2)
            p_val = scsp.bdtrc(interxn.getCount() - 1, observedIntraInRangeSum, prior_p)
            intraInRangeCount += 1
        elif interactionType == 'intraShort' and not interOnly:
            prior_p = 1.0
            p_val = 1.0
            intraVeryProximalCount += 1
        elif interactionType == 'intraLong' and not interOnly:
            prior_p = 1.0
            # p_val = scsp.bdtrc(interxn.getCount()-1, observedIntraAllSum, prior_p)  ##RUNBY
            p_val = 1.0
            intraOutOfRangeCount += 1
        else:
            if allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1, observedInterAllSum, prior_p)
                interCount += 1
            else:
                p_val = 1.0
        p_vals.append(p_val)
    infile.close()

    # Do the BH FDR correction
    outlierThres = 0
    if allReg:
        outlierThres = 1.0 / (possibleIntraInRangeCount + possibleInterAllCount)
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraInRangeCount)
    elif interOnly and not allReg:
        outlierThres = 1.0 / possibleInterAllCount
        q_vals = myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount)
    else:
        outlierThres = 1.0 / possibleIntraInRangeCount
        q_vals = myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    # now we write the values back to the file
    infile = gzip.open(infilename, 'rt')
    if resolution:
        outfile = gzip.open(outfilename + '.res' + str(resolution) + '.significances.txt.gz', 'wt')
    else:
        outfile = gzip.open(outfilename + '.significances.txt.gz', 'wt')
    print("Writing p-values and q-values to file %s" % (outfilename + ".significances.txt"))
    outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n")

    count = 0
    for line in infile:
        words = line.rstrip().split()
        chr1 = words[0]
        midPoint1 = int(words[1])
        chr2 = words[2]
        midPoint2 = int(words[3])
        interactionCount = float(words[4])
        p_val = p_vals[count]
        q_val = q_vals[count]
        bias1 = biasl[count]
        bias2 = biasr[count]
        if (allReg or interOnly) and chr1 != chr2:
            outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" %
                          (str(chr1), midPoint1, str(chr2), midPoint2,
                           interactionCount, p_val, q_val, bias1, bias2))
        if (allReg or not interOnly) and chr1 == chr2:
            interactionDistance = abs(midPoint1 - midPoint2)
            if myUtils.in_range_check(interactionDistance, distLowThres, distUpThres):
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" %
                              (str(chr1), midPoint1, str(chr2), midPoint2,
                               interactionCount, p_val, q_val, bias1, bias2))
        if p_val < outlierThres:
            outliersline.add(count)
            outliersdist.add(abs(midPoint1 - midPoint2))
        count += 1
    outfile.close()
    infile.close()

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
        minFDR = 0.0
        maxFDR = 0.05
        increment = 0.001
        FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment, outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy]  # from fit_Spline
def train_models(orig_vector_prediction_matrix, orig_scalar_prediction_matrix,
                 vector_target_matrix, scalar_target_matrix,
                 separate_by_height=True):
    """Trains isotonic-regression models.

    E = number of examples
    H = number of heights
    T_v = number of vector target variables
    T_s = number of scalar target variables

    :param orig_vector_prediction_matrix: numpy array (E x H x T_v) of predicted
        values for vector target variables.
    :param orig_scalar_prediction_matrix: numpy array (E x T_s) of predicted
        values for scalar target variables.
    :param vector_target_matrix: numpy array (E x H x T_v) of actual values for
        vector target variables.
    :param scalar_target_matrix: numpy array (E x T_s) of actual values for
        scalar target variables.
    :param separate_by_height: Boolean flag. If True, will train one model for
        each pair of target variable and height. If False, will train one model
        for each target variable (channel), pooling all heights.
    :return: scalar_model_objects: List (length T_s) of models (instances of
        `sklearn.isotonic.IsotonicRegression`) for scalar target variables.
    :return: vector_model_object_matrix: numpy array of models (instances of
        `sklearn.isotonic.IsotonicRegression`) for vector target variables. If
        `separate_by_height == True`, this array is H x T_v. If
        `separate_by_height == False`, it is 1 x T_v.
    """

    # Check input args.
    num_examples = None
    num_heights = 0
    num_vector_targets = 0
    num_scalar_targets = 0

    have_vectors = (orig_vector_prediction_matrix is not None
                    or vector_target_matrix is not None)
    if have_vectors:
        error_checking.assert_is_numpy_array(orig_vector_prediction_matrix,
                                             num_dimensions=3)
        error_checking.assert_is_numpy_array_without_nan(
            orig_vector_prediction_matrix)
        error_checking.assert_is_numpy_array(
            vector_target_matrix,
            exact_dimensions=numpy.array(orig_vector_prediction_matrix.shape,
                                         dtype=int))
        error_checking.assert_is_numpy_array_without_nan(vector_target_matrix)

        num_examples = vector_target_matrix.shape[0]
        num_heights = vector_target_matrix.shape[1]
        num_vector_targets = vector_target_matrix.shape[2]

    have_scalars = (orig_scalar_prediction_matrix is not None
                    or scalar_target_matrix is not None)
    if have_scalars:
        error_checking.assert_is_numpy_array(orig_scalar_prediction_matrix,
                                             num_dimensions=2)
        if num_examples is None:
            num_examples = orig_scalar_prediction_matrix.shape[0]

        expected_dim = numpy.array(
            [num_examples, orig_scalar_prediction_matrix.shape[1]], dtype=int)
        error_checking.assert_is_numpy_array(orig_scalar_prediction_matrix,
                                             exact_dimensions=expected_dim)
        error_checking.assert_is_numpy_array_without_nan(
            orig_scalar_prediction_matrix)
        error_checking.assert_is_numpy_array(
            scalar_target_matrix,
            exact_dimensions=numpy.array(orig_scalar_prediction_matrix.shape,
                                         dtype=int))
        error_checking.assert_is_numpy_array_without_nan(scalar_target_matrix)

        num_scalar_targets = scalar_target_matrix.shape[1]

    error_checking.assert_is_boolean(separate_by_height)

    # Train the models.
    scalar_model_objects = [None] * num_scalar_targets
    num_modeling_heights = num_heights if separate_by_height else 1
    vector_model_object_matrix = numpy.full(
        (num_modeling_heights, num_vector_targets), '', dtype=object)

    for k in range(num_scalar_targets):
        print((
            'Training isotonic-regression model for {0:d}th of {1:d} scalar '
            'target variables...'
        ).format(k + 1, num_scalar_targets))

        scalar_model_objects[k] = IsotonicRegression(increasing=True,
                                                     out_of_bounds='clip')
        scalar_model_objects[k].fit(X=orig_scalar_prediction_matrix[:, k],
                                    y=scalar_target_matrix[:, k])

    if num_scalar_targets > 0:
        print('\n')

    for k in range(num_vector_targets):
        for j in range(num_modeling_heights):
            print((
                'Training isotonic-regression model for {0:d}th of {1:d} vector'
                ' target variables at {2:d}th of {3:d} modeling heights...'
            ).format(k + 1, num_vector_targets, j + 1, num_modeling_heights))

            vector_model_object_matrix[j, k] = IsotonicRegression(
                increasing=True, out_of_bounds='clip')

            if separate_by_height:
                vector_model_object_matrix[j, k].fit(
                    X=orig_vector_prediction_matrix[:, j, k],
                    y=vector_target_matrix[:, j, k])
            else:
                vector_model_object_matrix[j, k].fit(
                    X=numpy.ravel(orig_vector_prediction_matrix[..., k]),
                    y=numpy.ravel(vector_target_matrix[..., k]))

        if k != num_vector_targets - 1:
            print('\n')

    return scalar_model_objects, vector_model_object_matrix
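# Hedged sketch of how one of the per-variable models trained above might be
# applied to new predictions; the data and variable names are illustrative only.
import numpy
from sklearn.isotonic import IsotonicRegression

# toy stand-in for one column of orig_scalar_prediction_matrix and its targets
raw_predictions = numpy.array([0.1, 0.4, 0.35, 0.8])
targets = numpy.array([0.15, 0.3, 0.45, 0.7])

model = IsotonicRegression(increasing=True, out_of_bounds='clip')
model.fit(X=raw_predictions, y=targets)

# corrected predictions for new data; values outside the training range are clipped
corrected = model.predict(numpy.array([0.0, 0.5, 1.2]))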
def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # check we don't crash when all x are equal:
    ir = IsotonicRegression()
    assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
def iso(self, axis=1, unk='No'):
    # performs isotonic regression row-wise (axis=0) or column-wise (axis=1)
    tonic = copy.deepcopy(self.array)  # returns a new isotonic matrix

    # either use a value for unknowns or just do isotonic with all present values
    if unk == 0 or unk is None:
        known_dict = self.known_for_iso(axis, unk)
    else:
        known_dict = None

    # that dict tells us where things aren't increasing
    # (from is_row_inc() or is_col_inc())
    if axis == 1:
        if known_dict is None:
            for i in range(len(tonic[0])):
                initial_vals = [tonic[j][i] for j in range(len(tonic))]
                X = list(range(len(initial_vals)))
                # Use the initial values to fit the model and then predict
                # what the decreasing ones should be
                iso = IsotonicRegression(out_of_bounds='clip').fit(X, initial_vals)
                predictions = iso.predict(range(len(tonic)))
                # put everything back:
                for row in range(len(predictions)):
                    tonic[row][i] = predictions[row]
        else:
            for i in range(len(tonic[0])):
                X = known_dict[i]
                initial_vals = [tonic[j][i] for j in X]
                # Use the initial values to fit the model and then predict
                # what the decreasing ones should be
                iso = IsotonicRegression(out_of_bounds='clip').fit(X, initial_vals)
                predictions = iso.predict(range(len(tonic)))
                # put everything back:
                for row in range(len(predictions)):
                    tonic[row][i] = predictions[row]
    else:
        if known_dict is None:
            for i in range(len(tonic)):
                initial_vals = [tonic[i][j] for j in range(len(tonic[0]))]
                X = list(range(len(initial_vals)))
                # Use the initial values to fit the model and then predict
                # what the decreasing ones should be
                iso = IsotonicRegression(out_of_bounds='clip').fit(X, initial_vals)
                # predict one value per column of the row
                predictions = iso.predict(range(len(tonic[0])))
                # put everything back:
                tonic[i] = predictions
        else:
            for i in range(len(tonic)):
                X = known_dict[i]
                initial_vals = [tonic[i][j] for j in X]
                # Use the initial values to fit the model and then predict
                # what the decreasing ones should be
                iso = IsotonicRegression(out_of_bounds='clip').fit(X, initial_vals)
                # predict one value per column of the row
                predictions = iso.predict(range(len(tonic[0])))
                # put everything back:
                tonic[i] = predictions

    newframe = pd.DataFrame(tonic)
    newframe.columns = self.dataframe.columns
    newframe.index = self.dataframe.index
    return mat_opr(newframe)
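# Standalone sketch of the column-wise case above on assumed toy data
# (mat_opr and known_for_iso belong to the surrounding class and are not used
# here): each column of a matrix is replaced by its non-decreasing isotonic
# fit against the row index.
import numpy as np
import pandas as pd
from sklearn.isotonic import IsotonicRegression

frame = pd.DataFrame([[1.0, 5.0], [3.0, 4.0], [2.0, 6.0], [4.0, 6.5]])
smoothed = frame.copy()
rows = np.arange(len(frame))
for col in frame.columns:
    iso = IsotonicRegression(out_of_bounds='clip').fit(rows, frame[col].to_numpy())
    smoothed[col] = iso.predict(rows)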