Example No. 1
def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    y = np.array([10, 0, 2])
    y_ = np.array([4, 4, 4])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # check we don't crash when all x are equal:
    ir = IsotonicRegression()
    assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
Example No. 2
def train_classifier_with_calib(classifier, data, use_all_data=False, normalize=False):
    X_train = data.X_train
    y_train = data.y_train
    X_cv = data.X_cv
    y_cv = data.y_cv
    if normalize:
        X_train, X_cv = normalize_data(X_train, X_cv)
    if not use_all_data:
        ir = IR()
        score, S = train(classifier, X_train, y_train, X_cv, y_cv, data.y_classes)
        predictions_proba = classifier.predict_proba(X_cv)
        proba = predictions_proba[:, 1]
        ir.fit_transform(proba, y_cv)
        print(proba)
        print(ir)
        return {
            'classifier': classifier,
            'score': score,
            'S_auc': S,
            'IR':ir,
            'prange':[np.amin(proba),np.amax(proba)]
        }
    else:
        train_all_data(classifier, X_train, y_train, X_cv, y_cv)
        return {
            'classifier': classifier
        }
Example No. 3
def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    y = np.array([10, 0, 2])
    y_ = np.array([4, 4, 4])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # check we don't crash when all x are equal:
    ir = IsotonicRegression()
    assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
Example No. 4
def test_isotonic_regression_ties_max():
    # Setup examples with ties on maximum
    x = [1, 2, 3, 4, 5, 5]
    y = [1, 2, 3, 4, 5, 6]
    y_true = [1, 2, 3, 4, 5.5, 5.5]

    # Check that we get identical results for fit/transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(y_true, ir.fit_transform(x, y))
Example No. 5
def test_isotonic_regression_ties_max():
    # Setup examples with ties on maximum
    x = [1, 2, 3, 4, 5, 5]
    y = [1, 2, 3, 4, 5, 6]
    y_true = [1, 2, 3, 4, 5.5, 5.5]

    # Check that we get identical results for fit/transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(y_true, ir.fit_transform(x, y))
Example No. 6
def test_isotonic_sample_weight_parameter_default_value():
    # check if default value of sample_weight parameter is one
    ir = IsotonicRegression()
    # random test data
    rng = np.random.RandomState(42)
    n = 100
    x = np.arange(n)
    y = rng.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
    # check if value is correctly used
    weights = np.ones(n)
    y_set_value = ir.fit_transform(x, y, sample_weight=weights)
    y_default_value = ir.fit_transform(x, y)

    assert_array_equal(y_set_value, y_default_value)
Example No. 7
def test_isotonic_sample_weight_parameter_default_value():
    # check if default value of sample_weight parameter is one
    ir = IsotonicRegression()
    # random test data
    rng = np.random.RandomState(42)
    n = 100
    x = np.arange(n)
    y = rng.randint(-50, 50, size=(n, )) + 50. * np.log(1 + np.arange(n))
    # check if value is correctly used
    weights = np.ones(n)
    y_set_value = ir.fit_transform(x, y, sample_weight=weights)
    y_default_value = ir.fit_transform(x, y)

    assert_array_equal(y_set_value, y_default_value)
Example No. 8
    def fit_iso_transform(self, bx, by):
        with torch.no_grad():
            cdf = self.eval_all(bx, by)[0].cpu().numpy()[:, 0].astype(float)

        cdf = np.sort(cdf)
        lin = np.linspace(0, 1, int(cdf.shape[0]))

        # Insert an extra 0 and 1 to ensure the range is always [0, 1], and trim CDF for numerical stability
        cdf = np.clip(cdf, a_max=1.0 - 1e-6, a_min=1e-6)
        cdf = np.insert(np.insert(cdf, -1, 1), 0, 0)
        lin = np.insert(np.insert(lin, -1, 1), 0, 0)

        iso_transform = IsotonicRegression()
        iso_transform.fit_transform(cdf, lin)
        return iso_transform
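
The example above recalibrates a model's CDF outputs by isotonically regressing the sorted CDF values onto a uniform grid. Below is a minimal, self-contained sketch of that recalibration step on synthetic data (the model, eval_all and the tensors are assumed to exist only in the original; out_of_bounds='clip' is an addition here so predict is safe on new values):

import numpy as np
from sklearn.isotonic import IsotonicRegression

# Synthetic stand-in for model CDF values; a well-calibrated model would
# produce values that look uniform on [0, 1].
cdf = np.sort(np.random.RandomState(0).beta(2.0, 5.0, size=200))
lin = np.linspace(0.0, 1.0, cdf.shape[0])

# Clip for numerical stability and pin the endpoints to 0 and 1, as above.
cdf = np.clip(cdf, 1e-6, 1.0 - 1e-6)
cdf = np.concatenate(([0.0], cdf, [1.0]))
lin = np.concatenate(([0.0], lin, [1.0]))

iso_transform = IsotonicRegression(out_of_bounds='clip')
iso_transform.fit(cdf, lin)
print(iso_transform.predict([0.05, 0.5, 0.95]))  # recalibrated CDF values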
Example No. 9
def plot():

    results = []
    for f in glob('lengths*npz'):
        d = np.load(f)
        l = d['lengths']
        l = l[l > 0.]
        print(d['mu'], l.shape)
        results.append([d['mu'], l.mean()])

    results = sorted(results)
    results = np.array(results).T
    muvals, mean_length = results
    f = plt.figure()
    f.clf()
    ax = f.gca()
    iso = IsotonicRegression(increasing=False)
    mean_length_iso = iso.fit_transform(np.arange(mean_length.shape[0]),
                                        mean_length)
    ax.plot(muvals, mean_length, 'k', linewidth=2, label='UMAU')
    ax.plot([muvals.min(), muvals.max()], [2 * ndist.ppf(0.975)] * 2,
            c='red',
            label='Sample splitting',
            linewidth=2)
    ax.plot([muvals.min(), muvals.max()], [np.sqrt(2) * ndist.ppf(0.975)] * 2,
            'k--')
    ax.set_xlabel(r'$\mu$', fontsize=20)
    ax.set_ylabel(r'E(|CI($\mu$)|)', fontsize=20)
    ax.legend(loc='lower right')
    ax.set_ylim([0, 4])
    ax.set_xlim([-2, 9])
    f.savefig('figure_b.pdf')
    output = np.array(list(zip(muvals, mean_length)))
    np.savetxt('equal_tailed_lengths.csv', output, delimiter=',')
def apply_isotonic(arr):
    n_krn = len(gus_krn)
    arr_smt = np.convolve(np.hstack([np.repeat(arr[0], n_krn * 2), arr]), gus_krn, mode='same')[n_krn * 2:]
    # plt.plot(range(len(arr_smt)), arr_smt, alpha=0.5, label='Smooth')
    ir = IsotonicRegression(y_min=0, increasing=False)
    # plt.plot(range(len(arr_smt)), bin_prd, alpha=0.5, label='Pred')
    return ir.fit_transform(range(len(arr_smt)), arr_smt)
Example No. 11
def _fit_isotonic(model,train_loader):

    t_start = perf_counter()
    means,stds,ys = model.mc_prediction_loader(train_loader)
    N = means.shape[0]    

    dist  = Normal(means,stds)
    cdf   = dist.cdf(ys)
    sorted_cdf,ind = cdf.sort()  #[N]
    y  = torch.arange(1.0,N+1)/N #[N]
    

    ir = IsotonicRegression(out_of_bounds='clip')
    x  =  sorted_cdf.cpu().numpy() #[N]
    y  =  y.numpy() #[N]

    x_app = np.insert(x,0,0.0)
    y_app = np.insert(y,0,0.0)
    y_ = ir.fit_transform(x_app, y_app)#[N]
    delta = _delta(means,stds,ys)

    #for synchronizing cuda calls
    torch.cuda.synchronize()
    #stop and measure the time taken for postprocessing method
    t_stop = perf_counter()
    iso_time = torch.tensor(t_stop - t_start)
    return ir,delta,sorted_cdf,iso_time
	def main(self):
		x_field = self.fields_by_key('x')[0]
		y_field = self.fields_by_key('y')[0]	
		x = np.array(self.slice_data(x_field,int))
		y = np.array(self.slice_data(y_field,int))
		n = len(x)
		render = StringIO.StringIO()
		
		###############################################################################
		# Fit IsotonicRegression and LinearRegression models

		ir = IsotonicRegression()

		y_ = ir.fit_transform(x, y)

		lr = LinearRegression()
		lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

		###############################################################################
		# plot result

		segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
		lc = LineCollection(segments, zorder=0)
		lc.set_array(np.ones(len(y)))
		lc.set_linewidths(0.5 * np.ones(n))

		fig = plt.figure()
		plt.plot(x, y, 'r.', markersize=12)
		plt.plot(x, y_, 'g.-', markersize=12)
		plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
		plt.gca().add_collection(lc)
		plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
		plt.title('Isotonic regression')
		plt.savefig(render,format='png')
		return render
Example No. 13
def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]), ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])
Example No. 14
def pavTermFrequency(ranking_fn, cluster_names_fn, file_name, do_plot):
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        frq.append(readFreq(name))

    pav_classes = []

    for f in range(len(frq)):
        print(names[f])
        x = np.asarray(frq[f])
        y = ranking[f]

        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_plot:
            plot(x, y, y_)
        print(f)

    dt.write2dArray(
        pav_classes,
        "../data/movies/finetune/" + file_name + "PavTermFrequency.txt")
    return pav_classes
Example No. 15
def cali(fname, predict_name, out_name, mode='ctr'):
    if mode == 'ctr':
        true_col = 'actual_click'
        prob_col = 'ctr'
    if mode == 'cvr':
        true_col = 'actual_purchase'
        prob_col = 'cvr'
    pred_df = pd.read_csv(predict_name, names=columns)
    nn = pred_df.shape[0]
    df = pd.read_csv(fname, names=columns)
    n = df.shape[0]
    y_true = df[true_col].values
    y_prob = df[prob_col].values
    #fraction_of_positives, mean_predicted_value = cali.calibration_curve(y_true, y_prob, normalize=False, n_bins=10)
    #plt.figure()
    #plt.plot(mean_predicted_value,fraction_of_positives)
    #plt.show()
    #plt.close()
    ir = IsotonicRegression()
    y = ir.fit_transform(y_prob, y_true)
    y_pred = ir.predict(pred_df[prob_col].values)
    nn = y_pred.shape[0]
    h = open(out_name, 'w')
    for i in range(nn):
        if i < nn - 1:
            h.write(str(y_pred[i]) + '\n')
        else:
            h.write(str(y_pred[i]))
    h.close()
Example No. 16
def test_isotonic_regression_ties_secondary_():
    """
    Test isotonic regression fit, transform  and fit_transform
    against the "secondary" ties method and "pituitary" data from R
     "isotone" package, as detailed in: J. d. Leeuw, K. Hornik, P. Mair,
     Isotone Optimization in R: Pool-Adjacent-Violators Algorithm
    (PAVA) and Active Set Methods

    Set values based on pituitary example and
     the following R command detailed in the paper above:
    > library("isotone")
    > data("pituitary")
    > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
    > res1$x

    `isotone` version: 1.0-2, 2014-09-07
    R version: R version 3.1.1 (2014-07-10)
    """
    x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
    y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
    y_true = [22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222,
              22.22222, 22.22222, 22.22222, 24.25, 24.25]

    # Check fit, transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_almost_equal(ir.transform(x), y_true, 4)
    assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)
Example No. 17
def test_isotonic_regression_ties_secondary_():
    """
    Test isotonic regression fit, transform  and fit_transform
    against the "secondary" ties method and "pituitary" data from R
     "isotone" package, as detailed in: J. d. Leeuw, K. Hornik, P. Mair,
     Isotone Optimization in R: Pool-Adjacent-Violators Algorithm
    (PAVA) and Active Set Methods

    Set values based on pituitary example and
     the following R command detailed in the paper above:
    > library("isotone")
    > data("pituitary")
    > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
    > res1$x

    `isotone` version: 1.0-2, 2014-09-07
    R version: R version 3.1.1 (2014-07-10)
    """
    x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
    y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
    y_true = [
        22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222,
        22.22222, 22.22222, 24.25, 24.25
    ]

    # Check fit, transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_almost_equal(ir.transform(x), y_true, 4)
    assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)
Example No. 18
def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])
Example No. 19
def isoreg(filename):
    ds = pd.read_csv(filename, names=["1", "2"], skiprows=1)
    ds["1"] = pd.to_datetime(ds["1"], format="%Y-%m")    
    ds["d"] = (ds["1"] - ds["1"].min())  / np.timedelta64(1,'D')

    X = ds["d"]  # put your dates in here
    y = ds["2"]  # put your kwh in here

    model =  IsotonicRegression()
    model.fit_transform(X, y)

    X_predict = ds["d"]  # put the dates of which you want to predict kwh here
    y_predict = model.predict(X_predict)
    fig = plt.figure(figsize=(12, 6))
    plt.plot(y)
    plt.plot(y_predict)
    fig.savefig("files/" + str(os.path.splitext(os.path.basename(filename))[0]) + "_isoreg.png")
def compare_PAVA_implementations():
    trials = 10
    rs = check_random_state(0)
    times = []
    dimensions = [int(1e1), int(1e2), int(1e3), int(1e4), int(1e5), int(1e6)]
    #dimensions = [int(1e6)]

    for n in dimensions:
        print('dimensionality', n)
        x = np.arange(n)
        for trial in range(trials):

            y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))

            # scikit-learn PAVA
            if n <= int(1e5):
            #if n <= int(1e6):
                ir = IsotonicRegression()
                y_copy = np.copy(y)
                start_time = time.time()
                ir.fit_transform(x, y_copy)
                time1 = time.time() - start_time
            else: time1 = -1.

            # in-place PAVA
            y_copy = np.copy(y)
            start_time = time.time()
            isotonic_regression_c_2(y_copy, 0, n)
            time2 = time.time() - start_time

            # in-place PAVA++
            y_copy = np.copy(y)
            start_time = time.time()
            isotonic_regression_c(y_copy, 0, n)
            time3 = time.time() - start_time

            times.append([time1, time2, time3])

    index = []
    for n in ['1e1','1e2','1e3','1e4','1e5','1e6']: index += [n]*trials
    #for n in ['1e6']: index += [n]*trials  
    tuples = zip()
    df = pd.DataFrame(times, index=index, columns=['sklearn', 'PAVA+', 'PAVA++'])
    print(df)
    df.to_pickle('results/PAVA_comparison_5.pkl')
def mir_calibrate(logit,label,logit_eval):
    p = np.exp(logit)/np.sum(np.exp(logit),1)[:,None] 
    p_eval = np.exp(logit_eval)/np.sum(np.exp(logit_eval),1)[:,None]
    ir = IsotonicRegression(out_of_bounds='clip')
    y_ = ir.fit_transform(p.flatten(), (label.flatten()))
    yt_ = ir.predict(p_eval.flatten())
    
    p = yt_.reshape(logit_eval.shape)+1e-9*p_eval
    return p
 def sklearn_isotonic_regression_multi(self, y, blocks):
     ir = IsotonicRegression()
     n = len(y)
     x = np.arange(n)
     z = np.zeros(n)
     z[:blocks[0]] = y[:blocks[0]]
     for start, end in zip(blocks, np.append(blocks[1:], [n])):
         z[start:end] = ir.fit_transform(x[start:end], y[start:end])
     return z
Example No. 23
def generate_calibration_model(df, pred_col, actual_col):
    ir = IsotonicRegression()
    y_ = ir.fit_transform(df[pred_col], df[actual_col])
    calib = {}
    calib['method'] = 'ir'
    calib['mod'] = ir
    calib['max_obs'] = max(df[pred_col])
    calib['max_cal'] = max(y_)
    return calib
 def test_proj_PAV(self):
     n = 10
     x = np.arange(n)
     rs = check_random_state(0)
     for i in range(10):
         y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
         ir = IsotonicRegression()
         truth = ir.fit_transform(x, y)
         self.assertTrue(np.linalg.norm(proj_PAV(y) - truth) < 1e-8)
Example No. 25
def _apply_isotonic_regression(df, mag, magErr):
    df.sort_values(by=[mag], inplace=True)
    df = df.reset_index(drop=True)
    x = df[mag]
    y = df[magErr]
    ir = IsotonicRegression()
    y_expected = ir.fit_transform(x, y)

    return ir, x, y, y_expected
Example No. 26
def test_isotonic_min_max_boundaries():
    # check that the y_min and y_max bounds are applied correctly
    ir = IsotonicRegression(y_min=2, y_max=4)
    n = 6
    x = np.arange(n)
    y = np.arange(n)
    y_test = [2, 2, 2, 3, 4, 4]
    y_result = np.round(ir.fit_transform(x, y))
    assert_array_equal(y_result, y_test)
 def sklearn_isotonic_regression_multi(self, y, blocks):
     ir = IsotonicRegression()
     n = len(y)
     x = np.arange(n)
     z = np.zeros(n)
     z[:blocks[0]] = y[:blocks[0]]
     for start, end in zip(blocks, np.append(blocks[1:], [n])):
         z[start:end] = ir.fit_transform(x[start:end], y[start:end])
     return z
Example No. 28
def test_isotonic_min_max_boundaries():
    # check that the y_min and y_max bounds are applied correctly
    ir = IsotonicRegression(y_min=2, y_max=4)
    n = 6
    x = np.arange(n)
    y = np.arange(n)
    y_test = [2, 2, 2, 3, 4, 4]
    y_result = np.round(ir.fit_transform(x, y))
    assert_array_equal(y_result, y_test)
Example No. 29
def test_isotonic_sample_weight():
    ir = IsotonicRegression()
    x = [1, 2, 3, 4, 5, 6, 7]
    y = [1, 41, 51, 1, 2, 5, 24]
    sample_weight = [1, 2, 3, 4, 5, 6, 7]
    expected_y = [1, 13.95, 13.95, 13.95, 13.95, 13.95, 24]
    received_y = ir.fit_transform(x, y, sample_weight=sample_weight)

    assert_array_equal(expected_y, received_y)
Example No. 30
def test_isotonic_sample_weight():
    ir = IsotonicRegression()
    x = [1, 2, 3, 4, 5, 6, 7]
    y = [1, 41, 51, 1, 2, 5, 24]
    sample_weight = [1, 2, 3, 4, 5, 6, 7]
    expected_y = [1, 13.95, 13.95, 13.95, 13.95, 13.95, 24]
    received_y = ir.fit_transform(x, y, sample_weight=sample_weight)

    assert_array_equal(expected_y, received_y)
Example No. 31
def compute_correlations(vectors, dissimilarities, distance_function):
    """
    Computes the correlation between vector distances and actual dissimilarities,
    using the given distance function between the vectors.
    
    Returns a dictionary from correlation metric to its corresponding value. 
    For convenience, this dictionary also contains both the vector of target dissimilarities
    and the vector of predicted similarities.
    """
    import numpy as np
    from sklearn.isotonic import IsotonicRegression
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    from scipy.stats import pearsonr, spearmanr, kendalltau

    # initialize dissimilarities with ones (arbitrary, will be overwritten anyways)
    dissimilarity_scores = np.ones(dissimilarities.shape)

    for i in range(len(vectors)):
        for j in range(len(vectors)):

            vec_i = vectors[i]
            vec_j = vectors[j]
            score = distance_function(vec_i, vec_j)[0][0]
            dissimilarity_scores[i][j] = score

    # transform dissimilarity matrices into vectors for correlation computation
    target_vector = np.reshape(dissimilarities, (-1, 1))
    sim_vector = np.reshape(dissimilarity_scores, (-1, 1))

    # compute correlations
    pearson, _ = pearsonr(sim_vector, target_vector)
    spearman, _ = spearmanr(sim_vector, target_vector)
    kendall, _ = kendalltau(sim_vector, target_vector)

    # compute least squares regression for R² metric
    linear_regression = LinearRegression()
    linear_regression.fit(sim_vector, target_vector)
    predictions = linear_regression.predict(sim_vector)
    r2_linear = r2_score(target_vector, predictions)

    # compute isotonic regression for R² metric
    x = np.reshape(dissimilarity_scores, (-1))
    y = np.reshape(dissimilarities, (-1))
    isotonic_regression = IsotonicRegression()
    predictions = isotonic_regression.fit_transform(x, y)
    r2_isotonic = r2_score(y, predictions)

    return {
        'pearson': pearson[0],
        'spearman': spearman,
        'kendall': kendall,
        'r2_linear': r2_linear,
        'r2_isotonic': r2_isotonic,
        'targets': target_vector,
        'predictions': sim_vector
    }
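
As the docstring notes, the isotonic R² complements the linear R²: it measures how well the predicted dissimilarities explain the targets under only a monotonicity assumption. A small hedged illustration of that contrast on made-up data (the names below are not from the function above):

import numpy as np
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

rng = np.random.RandomState(0)
scores = rng.rand(200)                                # predicted dissimilarities
targets = np.sqrt(scores) + 0.05 * rng.randn(200)     # monotone but non-linear relation

lr = LinearRegression().fit(scores[:, np.newaxis], targets)
r2_linear = r2_score(targets, lr.predict(scores[:, np.newaxis]))

r2_isotonic = r2_score(targets, IsotonicRegression().fit_transform(scores, targets))

# The isotonic fit can follow the curvature, so r2_isotonic >= r2_linear here.
print(r2_linear, r2_isotonic)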
Example No. 32
    def _minCllr(self, targetScoreValues, nonTargetScoreValues):
        """
            Computes the 'minimum cost of log likelihood ratio' measure as given in IDIAP's bob calibration.py
            We don't however use pavx here, as used in many other implementations, but sklearn's isotonic regression,
            which is equivalent and frees us from linking to c++ code.
        """
        # First, sort both scores.
        neg = sorted(nonTargetScoreValues)
        pos = sorted(targetScoreValues)
        N = len(neg)
        P = len(pos)
        I = N + P
        # Now, iterate through both score sets and add a 0 for negative and 1 for positive scores.
        n, p = 0, 0
        idealSequence = np.zeros(I)
        neg_indices = [0] * N
        pos_indices = [0] * P
        for i in range(I):
            if n == N or neg[n] > pos[p]:
                pos_indices[p] = i
                p += 1
                idealSequence[i] = 1
            else:
                neg_indices[n] = i
                n += 1

        # Run the pool adjacent violators method on the ideal LLR scores.
        # pavx implements isotonic regression. Python's sklearn contains code to do just that.
        ir = IsotonicRegression()
        # Calculate the isotonic regression.
        popt = ir.fit_transform(np.arange(len(idealSequence)), idealSequence)

        # disable runtime warnings for a short time since log(0) will raise a warning.
        old_warn_setup = np.seterr(divide='ignore')
        # ... compute logs.

        # Lets assume the prior odds on a target score is the ratio #target scores / #non target scores.
        log_prior_odds = math.log(float(P) / float(N))

        posterior_log_odds = np.log(popt) - np.log(1.0 - popt)

        # ... activate old warnings.
        np.seterr(**old_warn_setup)

        llrs = posterior_log_odds - log_prior_odds

        # Unmix positive and negative scores.
        new_neg = np.zeros(N)
        for n in range(N):
            new_neg[n] = llrs[neg_indices[n]]
        new_pos = np.zeros(P)
        for p in range(P):
            new_pos[p] = llrs[pos_indices[p]]

        # Compute cllr of these new 'optimal' LLR scores.
        minCllr = self._cllr(new_pos, new_neg)
        return minCllr
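
The docstring above points out that scikit-learn's isotonic regression plays the role of pavx: running it on the 0/1 label sequence, ordered by score, yields the monotone (pool-adjacent-violators) posterior estimate that minCllr is computed from. A standalone sketch of just that calibration step, on synthetic scores:

import numpy as np
from sklearn.isotonic import IsotonicRegression

rng = np.random.RandomState(0)
neg = rng.normal(-1.0, 1.0, size=500)   # non-target scores
pos = rng.normal(1.0, 1.0, size=500)    # target scores

scores = np.concatenate([neg, pos])
labels = np.concatenate([np.zeros(len(neg)), np.ones(len(pos))])
order = np.argsort(scores)

# PAV over the score-ordered labels: a non-decreasing estimate of P(target | score).
pav_posteriors = IsotonicRegression().fit_transform(scores[order], labels[order])
print(pav_posteriors[:3], pav_posteriors[-3:])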
 def test_proj_PAV(self):
     n = 10
     x = np.arange(n)
     rs = check_random_state(0)
     for i in range(10):
         y = rs.randint(-50, 50,
                        size=(n, )) + 50. * np.log(1 + np.arange(n))
         ir = IsotonicRegression()
         truth = ir.fit_transform(x, y)
         self.assertTrue(np.linalg.norm(proj_PAV(y) - truth) < 1e-8)
Example No. 34
def regression_monotone_initial(data_training,data_target,data):
	regression_data = []
	# Monotonic regression gives us a theoretical background-noise estimate
	# Fit and transform the data for each replicate independently
	ir = IsotonicRegression()
	for i in range(len(data_target)-1):
		regression = ir.fit_transform(data_training[0:len(data_training),1],data_target[i+1,0:len(data)])
		regression_data.append(regression)
	regression_data = np.asarray(regression_data)
	return regression_data
def irova_calibrate(logit,label,logit_eval):
    p = np.exp(logit)/np.sum(np.exp(logit),1)[:,None] 
    p_eval = np.exp(logit_eval)/np.sum(np.exp(logit_eval),1)[:,None]
    

    for ii in range(p_eval.shape[1]):
        ir = IsotonicRegression(out_of_bounds='clip')
        y_ = ir.fit_transform(p[:,ii], label[:,ii])
        p_eval[:,ii] = ir.predict(p_eval[:,ii])+1e-9*p_eval[:,ii]
    return p_eval
Example No. 36
def pavPPMI(cluster_names_fn,
            ranking_fn,
            file_name,
            do_p=False,
            data_type="movies",
            rewrite_files=False,
            limit_entities=False,
            classification="genres",
            lowest_amt=0,
            highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)
    print("certainly still running that old pavPPMI task, yes sir")
    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(
            readPPMI(name, data_type, lowest_amt, highest_amt, classification))

    pav_classes = []

    for f in range(len(frq)):
        try:
            print(names[f])
            x = np.asarray(frq[f])
            y = ranking[f]

            ir = IsotonicRegression()
            y_ = ir.fit_transform(x, y)
            pav_classes.append(y_)
            if do_p:
                plot(x, y, y_)
        except ValueError:
            print(names[f], "len ppmi", len(frq[f]),
                  "len ranking", len(ranking[f]))
            exit()
        print(f)

    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
Example No. 37
def cal(refn, out_fn, base_folder='data/round2models', example_folder_name='example_data'):
    """
    :param refn:
    :param out_fn:
    :param base_folder:
    :return:
    """

    from sklearn.isotonic import IsotonicRegression
    from sklearn.metrics import log_loss, roc_auc_score
    import os

    calpath = 'calibration/data/' + out_fn + '_caldata.p'

    if os.path.exists(calpath):
        try:
            with open(calpath, 'rb') as f:
                ldirs, pcal, mags = pickle.load(f)
            return ldirs, pcal, mags
        except:
            with open(calpath, 'rb') as f:
                ldirs, pcal = pickle.load(f)
            return ldirs, pcal

    mags = []
    y = []

    dirs = os.listdir(path=base_folder)
    for dir in dirs:
        adv_path = os.path.join(base_folder, dir, example_folder_name, refn)
        if os.path.exists(adv_path):
            mag = get_blur_mag(adv_path, sigma=2.0)
            truth_fn = os.path.join(base_folder, dir, 'config.json')
            cls = utils.get_class(truth_fn, classtype='binary', file=True)
            mags.append(mag)
            y.append(cls)


    ir_model = IsotonicRegression(out_of_bounds='clip')
    pcal = ir_model.fit_transform(mags, y)
    kld = log_loss(y, pcal)
    # print(kld)
    roc1 = roc_auc_score(y, np.array(pcal))
    print(out_fn, 'AUC:', roc1, 'KLD:', kld)

    # dump(ir_model, 'data/classifiers/blur' + '_ir.joblib')
    dump(ir_model, 'calibration/fitted/' + out_fn)
    pcal = pcal[np.argsort(dirs)]
    dirs.sort()
    with open(calpath,'wb') as f:
        pickle.dump([dirs, pcal, mags], f)

    return dirs, pcal, mags
Example No. 38
def test_permutation_invariance():
    # check that fit is permutation invariant.
    # regression test of missing sorting of sample-weights
    ir = IsotonicRegression()
    x = [1, 2, 3, 4, 5, 6, 7]
    y = [1, 41, 51, 1, 2, 5, 24]
    sample_weight = [1, 2, 3, 4, 5, 6, 7]
    x_s, y_s, sample_weight_s = shuffle(x, y, sample_weight, random_state=0)
    y_transformed = ir.fit_transform(x, y, sample_weight=sample_weight)
    y_transformed_s = ir.fit(x_s, y_s, sample_weight=sample_weight_s).transform(x)

    assert_array_equal(y_transformed, y_transformed_s)
Example No. 39
def test_permutation_invariance():
    # check that fit is permutation invariant.
    # regression test of missing sorting of sample-weights
    ir = IsotonicRegression()
    x = [1, 2, 3, 4, 5, 6, 7]
    y = [1, 41, 51, 1, 2, 5, 24]
    sample_weight = [1, 2, 3, 4, 5, 6, 7]
    x_s, y_s, sample_weight_s = shuffle(x, y, sample_weight, random_state=0)
    y_transformed = ir.fit_transform(x, y, sample_weight=sample_weight)
    y_transformed_s = ir.fit(x_s, y_s, sample_weight=sample_weight_s).transform(x)

    assert_array_equal(y_transformed, y_transformed_s)
 def test_isotonic_regression(self):
     self.setUp()
     times = []
     rs = check_random_state(0)
     for n in [int(1e1), int(1e2), int(1e3), int(1e4)]:
         x = np.arange(n)
         y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
         ir = IsotonicRegression()
         start_time = time.time()
         y1 = ir.fit_transform(x, y)
         times.append(time.time() - start_time)
     print('test isotonic_regression')
     print(times)
Example No. 41
def regression_monotone(data_target,data):
	regression_data = []
	
	# Monotonic regression gives us a theoretical background-noise estimate
	# Fit and transform the data for each replicate independently
	ir = IsotonicRegression()
	#data_target_transpose = np.transpose(data_target)
	for replicat in data_target:
		regression = ir.fit_transform(np.arange(0,len(replicat),1),replicat)
		regression_data.append(regression)
	regression_data = np.asarray(regression_data)
	#print(len(regression_data))
	return regression_data
Example No. 42
 def test_isotonic_regression(self):
     self.setUp()
     times = []
     rs = check_random_state(0)
     for n in [int(1e1), int(1e2), int(1e3), int(1e4)]:
         x = np.arange(n)
         y = rs.randint(-50, 50,
                        size=(n, )) + 50. * np.log(1 + np.arange(n))
         ir = IsotonicRegression()
         start_time = time.time()
         y1 = ir.fit_transform(x, y)
         times.append(time.time() - start_time)
     print('test isotonic_regression')
     print(times)
Example No. 43
 def fit(self, counts_matrix, lengths=None):
     '''
       NMDS fit Function, scale low dimension matrix to high dimension
     
       Parameters
       ----------
       counts_matrix: ndarray
       
       Returns
       ----------
       fit_matrix: ndarray
       '''
     if not sparse.isspmatrix_coo(counts_matrix):
         counts_matrix = sparse.coo_matrix(counts_matrix)
     for i in range(self.max_iter_outer):
             if i == 0:
                 fit_matrix = Multi_Dimensional_Scaling_Base.estimate_model(
                     counts_matrix,
                     alpha=self.alpha,
                     beta=self.beta,
                     ini=self.init,
                     verbose=self.verbose,
                     precompute_distances=self.precompute_distances,
                     use_zero_entries=False,
                     random_state=self.random_state,
                     bias=self.bias,
                     factr=self.factr,
                     maxiter=self.max_iter)
             else:
                 ir = IsotonicRegression()
                 distances = np.sqrt(
                     ((fit_matrix[counts_matrix.row] -
                       fit_matrix[counts_matrix.col])**2).sum(axis=1))
                 wish_distances = ir.fit_transform(1. / counts_matrix.data,
                                                   distances)
                 fit_matrix = Multi_Dimensional_Scaling_Base.estimate_model(
                     sparse.coo_matrix(
                         (wish_distances, (counts_matrix.row,
                                           counts_matrix.col))),
                     alpha=self.alpha,
                     beta=self.beta,
                     ini=fit_matrix,
                     verbose=self.verbose,
                     use_zero_entries=False,
                     precompute_distances='precomputed',
                     random_state=self.random_state,
                     factr=self.factr,
                     maxiter=self.max_iter,
                 )
     return fit_matrix
Example No. 44
def isotonic_regression(ax, x, y, w=None):
    """
    INPUT:
        ax: an Axes object
        x: (N, ) np.array
        y: (N, ) np.array
        w: None or a list of length N.
    OUTPUT:
        ax: an Axes object
    """

    if w is None or len(w) == 0:
        w = [1.0 for _ in y]
    n = len(y)

    # Fit IsotonicRegression and LinearRegression models
    ir = IsotonicRegression()
    y_ = ir.fit_transform(x, y, sample_weight=w)

    lr = LinearRegression()
    lr.fit(x[:, np.newaxis], y,
           sample_weight=w)  # x needs to be 2d for LinearRegression

    # Plot result
    segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]

    ax.plot(x, y, 'r.', markersize=12, alpha=0.2)
    ax.plot(x, y_, 'g^', markersize=12, alpha=0.2)
    ax.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
    ax.set_xlim(-0.1, 1.1)
    ax.set_ylim(-0.1, 1.1)

    # compute ece and acc after calibration
    ece = EceEval(np.array([1 - y_, y_]).T, y, num_bins=20)
    y_predict = y_ > 0.5
    acc = (y_predict == y).mean()

    ax.text(0.05,
            0.8,
            'ECE=%.4f\nACC=%.4f' % (ece, acc),
            size=14,
            ha='left',
            va='center',
            bbox={
                'facecolor': 'green',
                'alpha': 0.5,
                'pad': 4
            })

    return ax
Example No. 45
def sklearn_pav(y_true, y_score):
    """
    Binary PAV algorithm, algorithm to solve Isotonic regression
    NOTE: sklearn isotonic regression is used
    y_true: 1D array
    y_score: 1D array
    """
    id_permute = np.argsort(y_score)
    y_sort = y_true[id_permute]
    p_sort = np.sort(y_score)

    ir = IsotonicRegression()
    p_calibrated = ir.fit_transform(p_sort, y_sort)
    return y_sort, p_calibrated
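
A possible usage sketch for the helper above (assuming the sklearn_pav definition and its numpy / IsotonicRegression imports are in scope); the labels and scores below are made up:

import numpy as np

y_true = np.array([0, 0, 1, 0, 1, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.2, 0.8, 0.6, 0.9])

y_sorted, p_calibrated = sklearn_pav(y_true, y_score)
print(y_sorted)       # labels re-ordered by increasing score
print(p_calibrated)   # non-decreasing calibrated probabilities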
Example No. 46
def ensure_monotone_increasing(arr_, fromright=True, fromleft=True, newmode=True):
    r"""
    Args:
        arr_ (ndarray):

    Returns:
        ndarray: arr

    CommandLine:
        python -m vtool.math --test-ensure_monotone_increasing --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from vtool.math import *  # NOQA
        >>> rng = np.random.RandomState(0)
        >>> size_ = 100
        >>> domain = np.arange(size_)
        >>> offset = ut.get_argval('--offset', type_=float, default=2.3)
        >>> arr_ = np.sin(np.pi * (domain / 100) - offset) + (rng.rand(len(domain)) - .5) * .1
        >>> arr = ensure_monotone_increasing(arr_, fromleft=False, fromright=True)
        >>> result = str(arr)
        >>> print(result)
        >>> ut.quit_if_noshow()
        >>> import plottool as pt
        >>> pt.plot2(domain, arr_, 'r-', fnum=1, pnum=(2, 1, 1), title='before', equal_aspect=False)
        >>> pt.plot2(domain, arr, 'r-', fnum=1, pnum=(2, 1, 2), title='after monotonization (increasing)', equal_aspect=False)
        >>> ut.show_if_requested()
    """
    if newmode:
        from sklearn.isotonic import IsotonicRegression
        ir = IsotonicRegression()
        arr = ir.fit_transform(np.arange(len(arr_)), arr_)
    else:
        arr = arr_.copy()
        size = len(arr)
        # Ensure increasing from right
        if fromright:
            for lx in range(1, size):
                rx = (size - lx - 1)
                if arr[rx] > arr[rx + 1]:
                    arr[rx] = arr[rx + 1]
        if fromleft:
            # ensure increasing from left
            for lx in range(0, size - 1):
                if arr[lx] > arr[lx + 1]:
                    arr[lx + 1] = arr[lx]
    return arr
Example No. 47
def test_isotonic_regression_auto_increasing():
    # Set x and y for a generally increasing trend
    y = np.array([5, 6.1, 6, 7, 10, 9, 10])
    x = np.arange(len(y))

    # Create model and fit_transform
    ir = IsotonicRegression(increasing='auto')
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        y_ = ir.fit_transform(x, y)
        # work-around for pearson divide warnings in scipy <= 0.17.0
        assert_true(all(["invalid value encountered in "
                         in str(warn.message) for warn in w]))

    # Check that relationship increases
    is_increasing = y_[0] < y_[-1]
    assert_true(is_increasing)
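
For contrast, a short hedged sketch of increasing='auto' on data that trends downward; scikit-learn picks the direction from the sign of the Spearman correlation between x and y, so here the fit comes out non-increasing:

import numpy as np
from sklearn.isotonic import IsotonicRegression

y = np.array([10.0, 9.0, 10.0, 7.0, 6.0, 6.1, 5.0])
x = np.arange(len(y))

ir = IsotonicRegression(increasing='auto')
y_ = ir.fit_transform(x, y)
print(y_[0] > y_[-1])   # True: the fitted values decrease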
Example No. 48
def plot():

    results = []
    for f in glob('umau_lengths*npz'):
        d = np.load(f)
        l = d['lengths']
        l = l[~np.isnan(l)]
        l = l[np.isfinite(l)]
        l = l[l>0]
        results.append([d['mu'], l.mean()])
    for f in glob('miller/lengths*npz'):
        d = np.load(f)
        if d['mu'] not in [r[0] for r in results]:
            l = d['lengths']
            l = l[np.isfinite(l)]
            l = l[~np.isnan(l)]
            l = l[l>0]
            results.append([d['mu'], l.mean()])
        else:
            idx = [r[0] for r in results].index(d['mu'])
            l = d['lengths']
            l = l[np.isfinite(l)]
            l = l[~np.isnan(l)]
            l = l[l>0]
            results[idx][1] = 0.5 * (results[idx][1] + l.mean())
    results = sorted(results)
    results = np.array(results).T
    muvals, mean_length = results
    f = plt.figure()
    f.clf()
    ax = f.gca()
    iso = IsotonicRegression(increasing=False)
    mean_length_iso = iso.fit_transform(np.arange(mean_length.shape[0]), mean_length)    
    ax.plot(muvals, mean_length, 'k', linewidth=2, label='UMAU')
    ax.plot([muvals.min(), muvals.max()], [2*ndist.ppf(0.975)]*2, c='red', label='Sample splitting', linewidth=2)
    ax.plot([muvals.min(), muvals.max()], [np.sqrt(2)*ndist.ppf(0.975)]*2, 'k--')
    ax.set_xlabel(r'$\mu$', fontsize=20)
    ax.set_ylabel(r'E(|CI($\mu$)|)', fontsize=20)
    ax.legend(loc='lower right')
    ax.set_ylim([0,4])
    ax.set_xlim([-2,9])
    f.savefig('figure_b_umau.pdf')
Example No. 49
File: mds.py  Project: hiclib/pastis
    def fit(self, counts, lengths=None):
        """

        """
        if not sparse.isspmatrix_coo(counts):
            counts = sparse.coo_matrix(counts)

        for i in range(self.max_iter_outer):
            if i == 0:
                X = estimate_X(
                    counts,
                    alpha=self.alpha,
                    beta=self.beta,
                    ini=self.init,
                    verbose=self.verbose,
                    use_zero_entries=False,
                    random_state=self.random_state,
                    bias=self.bias,
                    factr=self.factr,
                    maxiter=self.max_iter,
                )
            else:
                ir = IsotonicRegression()
                dis = np.sqrt(((X[counts.row] - X[counts.col]) ** 2).sum(axis=1))
                wish_distances = ir.fit_transform(1.0 / counts.data, dis)
                X = estimate_X(
                    sparse.coo_matrix((wish_distances, (counts.row, counts.col))),
                    alpha=self.alpha,
                    beta=self.beta,
                    ini=X,
                    verbose=self.verbose,
                    use_zero_entries=False,
                    precompute_distances="precomputed",
                    random_state=self.random_state,
                    bias=self.bias,
                    factr=self.factr,
                    maxiter=self.max_iter,
                )
        print("writing wish distances")
        return X
Example No. 50
def isotonic_deg_sequence(deg_seq, eps):

    # index-sort deg_seq  
    idx = [i[0] for i in sorted(enumerate(deg_seq), key=lambda x:x[1])]
    sorted_deg_seq = [deg_seq[idx[i]] for i in range(len(deg_seq))]
    
    
    alpha = math.exp(-eps/4)    # global sensitivity = 4
    
    s1 = [0.0 for _ in range(len(deg_seq))]
    
    # s1 (sorted)
    for i in range(len(deg_seq)):
        s1[i] = sorted_deg_seq[i] + geometric_mechanism(alpha, 1)       # Geometric noise
        
    # s
    ir = IsotonicRegression()

    s = ir.fit_transform(range(len(deg_seq)), s1)
    
    #
    return s, s1, sorted_deg_seq
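
A self-contained sketch of the idea above: perturb a sorted degree sequence with two-sided geometric noise and restore monotonicity with isotonic regression. geometric_mechanism is the author's helper; the stand-in below draws the symmetric noise directly with numpy, under the same alpha = exp(-eps/4):

import math
import numpy as np
from sklearn.isotonic import IsotonicRegression

rng = np.random.RandomState(0)
sorted_deg_seq = np.sort(rng.randint(1, 50, size=100))
eps = 1.0
alpha = math.exp(-eps / 4)

# Two-sided geometric noise as the difference of two i.i.d. geometric draws.
noise = rng.geometric(1 - alpha, size=100) - rng.geometric(1 - alpha, size=100)
s1 = sorted_deg_seq + noise

ir = IsotonicRegression()
s = ir.fit_transform(np.arange(len(s1)), s1)   # monotone again after the noise
print(s[:5], s[-5:])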
Example No. 51
def test_isotonic_regression_with_ties_in_differently_sized_groups():
    """
    Non-regression test to handle issue 9432:
    https://github.com/scikit-learn/scikit-learn/issues/9432

    Compare against output in R:
    > library("isotone")
    > x <- c(0, 1, 1, 2, 3, 4)
    > y <- c(0, 0, 1, 0, 0, 1)
    > res1 <- gpava(x, y, ties="secondary")
    > res1$x

    `isotone` version: 1.1-0, 2015-07-24
    R version: R version 3.3.2 (2016-10-31)
    """
    x = np.array([0, 1, 1, 2, 3, 4])
    y = np.array([0, 0, 1, 0, 0, 1])
    y_true = np.array([0., 0.25, 0.25, 0.25, 0.25, 1.])
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_almost_equal(ir.transform(x), y_true)
    assert_array_almost_equal(ir.fit_transform(x, y), y_true)
Example No. 52
def _smacof_single_p(similarities, n_uq, metric=True, n_components=2, init=None,
                   max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """
    Computes multidimensional scaling using SMACOF algorithm

    Parameters
    ----------
    n_uq

    similarities: symmetric ndarray, shape [n * n]
        similarities between the points

    metric: boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm

    n_components: int, optional, default: 2
        number of dimension in which to immerse the similarities
        overwritten if initial array is provided.

    init: {None or ndarray}, optional
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array

    max_iter: int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose: int, optional, default: 0
        level of verbosity

    eps: float, optional, default: 1e-3
        relative tolerance w.r.t. stress to declare convergence

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    X: ndarray (n_samples, n_components), float
               coordinates of the n_samples points in a n_components-space

    stress_: float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points)

    n_iter : int
        Number of iterations run.

    """
    similarities = check_symmetric(similarities, raise_exception=True)

    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    W = np.ones((n_samples, n_samples))
    W[:n_uq, :n_uq] = 0.0
    W[n_uq:, n_uq:] = 0.0
    # W[np.arange(len(W)), np.arange(len(W))] = 0.0

    V = -W
    V[np.arange(len(V)), np.arange(len(V))] = W.sum(axis=1)
    e = np.ones((n_samples, 1))

    Vp = np.linalg.inv(V + np.dot(e, e.T)/n_samples) - np.dot(e, e.T)/n_samples
    # Vp = np.linalg.pinv(V)

    # sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat = similarities.ravel()
    sim_flat_w = sim_flat[sim_flat != 0]
    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # overrides the parameter p
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            # dis_flat = dis.ravel()
            # # similarities with 0 are considered as missing values
            # dis_flat_w = dis_flat[sim_flat != 0]

            # # Compute the disparities using a monotonic regression
            # disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            # disparities = dis_flat.copy()
            # disparities[sim_flat != 0] = disparities_flat
            # disparities = disparities.reshape((n_samples, n_samples))
            # disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
            #                        (disparities ** 2).sum())
            
            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) / (disparities ** 2).sum())
            disparities[similarities==0] = 0

        # Compute stress
        # stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
        _stress = (W.ravel()*((dis.ravel() - disparities.ravel()) ** 2)).sum() / 2

        # Update X using the Guttman transform
        # dis[dis == 0] = 1e-5
        # ratio = disparities / dis
        # B = - ratio
        # B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        # X = 1. / n_samples * np.dot(B, X)
        # print (1. / n_samples * np.dot(B, X))[:5].T

        dis[dis == 0] = 1e-5
        ratio = disparities / dis
        _B = - W*ratio
        _B[np.arange(len(_B)), np.arange(len(_B))] += (W*ratio).sum(axis=1)

        X = np.dot(Vp, np.dot(_B, X))
        # print X[:5].T

        dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        
        if verbose >= 2:
            print('it: %d, stress %s' % (it, _stress))
        if old_stress is not None:
            if(old_stress - _stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' % (it,
                                                                       _stress))
                break
        old_stress = _stress / dis

    return X, _stress, it + 1
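
The nonmetric branch above is the heart of SMACOF's disparity update: regress the current configuration's distances on the observed dissimilarities with isotonic regression, treat zeros as missing, and rescale. A hedged, standalone sketch of that single step on synthetic data (not the full iteration):

import numpy as np
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import euclidean_distances

rng = np.random.RandomState(0)
X = rng.rand(10, 2)                                    # current configuration
similarities = euclidean_distances(rng.rand(10, 2))    # observed dissimilarities

dis = euclidean_distances(X)
sim_flat = similarities.ravel()
dis_flat = dis.ravel()
mask = sim_flat != 0                                   # zeros are treated as missing

disparities = dis_flat.copy()
disparities[mask] = IsotonicRegression().fit_transform(sim_flat[mask], dis_flat[mask])
disparities = disparities.reshape(similarities.shape)

n_samples = similarities.shape[0]
disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) / (disparities ** 2).sum())
print(disparities[:2, :5])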
Example No. 53
def run_nmds(directory):
    print(directory)

    if os.path.exists(os.path.join(directory,
                                   "config.ini")):
        config_file = os.path.join(directory, "config.ini")
    else:
        config_file = None

    options = parse(config_file)
    run_mds(directory)

    for i in range(0, max_iter):
        if i == 0:
            try:
                X = np.loadtxt(
                    os.path.join(directory,
                                 "MDS." + options["output_name"] + ".txt"))
            except IOError:
                return
        else:
            X = np.loadtxt(
                os.path.join(directory,
                             '%d.NMDS.' % (i) + options["output_name"] +
                             ".txt"))

        X = X.reshape((len(X) // 3, 3))

        dis = euclidean_distances(X) * 1000
        counts = np.load(
            os.path.join(directory, options["counts"]))
        counts[np.isnan(counts)] = 0

        wish_distances = np.zeros(counts.shape)

        print("Fitting isotonic regression...")
        ir = IsotonicRegression()
        wish_distances[counts != 0] = ir.fit_transform(
            1. / counts[counts != 0],
            dis[counts != 0])
        print("writing wish distances")

        lengths = np.loadtxt(
            os.path.join(directory, options["organism_structure"]))

        try:
            len(lengths)
        except TypeError:
            lengths = np.array([lengths])

        write(wish_distances,
              os.path.join(directory,
                           '%d.NMDS.wish_distances.txt' % i),
              lengths=lengths, resolution=options["resolution"])

        if i == 0:
            shutil.copy(
                os.path.join(directory,
                             "MDS." + options["output_name"] + ".txt"),
                os.path.join(directory,
                             '%d.NMDS.' % (i + 1) + options["output_name"] +
                             ".temp.txt"))
        else:
            shutil.copy(
                os.path.join(directory,
                             '%d.NMDS.' % i + options["output_name"] + ".txt"),
                os.path.join(directory,
                             '%d.NMDS.' % (i + 1) + options["output_name"] +
                             ".temp.txt"))

        locus_coord = options["output_name"].replace(".pdb",".bed")

        cmd = CMD_MDS % (options["binary_mds"],
                         os.path.join(directory,
                                      "%d.NMDS." % (i + 1) +
                                      options["output_name"]),
                         options["resolution"],
                         os.path.join(directory,
                                      options["organism_structure"]),
                         os.path.join(directory,
                                      "%d.NMDS.wish_distances.txt" % (i)),
                         os.path.join(directory,
                                      locus_coord),
                         options["adjacent_beads"],
                         options["chromosomes"],
                         os.path.join(directory,
                                      str(i + 1) + '.NMDS.log'))

        filename = os.path.join(directory, str(i + 1) + '.NMDS.sh')
        fileptr = open(filename, 'w')
        fileptr.write(cmd)
        fileptr.close()
        st = os.stat(filename)
        os.chmod(filename, st.st_mode | stat.S_IXUSR)
        p = subprocess.Popen(filename.split(), shell=True)
        p.wait()
Example No. 54
def _smacof_with_anchors_single(config, similarities, metric=True, n_components=2, init=None,
				   max_iter=300, verbose=0, eps=1e-3, random_state=None, estimated_dist_weights=None):
	"""
	Computes multidimensional scaling using SMACOF algorithm
	Parameters
	----------
	config : Config object
		configuration object for anchor-tag deployment parameters
	similarities: symmetric ndarray, shape [n * n]
		similarities between the points
	metric: boolean, optional, default: True
		compute metric or nonmetric SMACOF algorithm
	n_components: int, optional, default: 2
		number of dimension in which to immerse the similarities
		overwritten if initial array is provided.
	init: {None or ndarray}, optional
		if None, randomly chooses the initial configuration
		if ndarray, initialize the SMACOF algorithm with this array
	max_iter: int, optional, default: 300
		Maximum number of iterations of the SMACOF algorithm for a single run
	verbose: int, optional, default: 0
		level of verbosity
	eps: float, optional, default: 1e-3
		relative tolerance w.r.t. stress to declare convergence
	random_state: integer or numpy.RandomState, optional
		The generator used to initialize the centers. If an integer is
		given, it fixes the seed. Defaults to the global numpy random
		number generator.
	Returns
	-------
	X: ndarray (n_samples, n_components), float
			   coordinates of the n_samples points in a n_components-space
	stress_: float
		The final value of the stress (sum of squared distance of the
		disparities and the distances for all constrained points)
	n_iter : int
		Number of iterations run
	last_positions: ndarray [X1,...,Xn]
		An array of computed Xs.
	"""
	NO_OF_TAGS, NO_OF_ANCHORS = config.no_of_tags, config.no_of_anchors
	similarities = check_symmetric(similarities, raise_exception=True)

	n_samples = similarities.shape[0]
	random_state = check_random_state(random_state)

	sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
	sim_flat_w = sim_flat[sim_flat != 0]

	if init is None:
		# Randomly choose initial configuration
		X = random_state.rand(n_samples * n_components)
		X = X.reshape((n_samples, n_components))
		# uncomment the following if weight matrix W is not hollow
		#X[:-2] = Xa
	else:
		# overrides the parameter p
		n_components = init.shape[1]
		if n_samples != init.shape[0]:
			raise ValueError("init matrix should be of shape (%d, %d)" %
							 (n_samples, n_components))
		X = init

	old_stress = None
	ir = IsotonicRegression()

	# setup weight matrix
	if getattr(config, 'weights', None) is not None:
		weights = config.weights
	else:
		weights = np.ones((n_samples, n_samples))
	if getattr(config, 'missingdata', None):
		weights[-NO_OF_TAGS:, -NO_OF_TAGS:] = 0
	if estimated_dist_weights is not None:
		weights[-NO_OF_TAGS:, -NO_OF_TAGS:] = estimated_dist_weights
	diag = np.arange(n_samples)
	weights[diag, diag] = 0

	last_n_configs = []
	Xa = config.anchors
	for it in range(max_iter):
		# Compute distance and monotonic regression
		dis = euclidean_distances(X)

		if metric:
			disparities = similarities
		else:
			dis_flat = dis.ravel()
			# similarities with 0 are considered as missing values
			dis_flat_w = dis_flat[sim_flat != 0]

			# Compute the disparities using a monotonic regression
			disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
			disparities = dis_flat.copy()
			disparities[sim_flat != 0] = disparities_flat
			disparities = disparities.reshape((n_samples, n_samples))
			disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
								   (disparities ** 2).sum())

		# Compute stress
		stress = (weights.ravel()*(dis.ravel() - disparities.ravel()) ** 2).sum() / 2
		#stress = ((dis[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel() - disparities[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel()) ** 2).sum()

		# Update X using the Guttman transform
		dis[dis == 0] = 1e5
		ratio = weights*disparities / dis
		B = - ratio
		B[diag, diag] = 0
		B[diag, diag] = -B.sum(axis=1)

		# Apply update to only tag configuration since anchor config is already known
		
		V = - weights
		V[diag, diag] += weights.sum(axis=1)
		# V_inv = np.linalg.pinv(V)
		V12 = V[-NO_OF_TAGS:, :-NO_OF_TAGS]
		B11 = B[-NO_OF_TAGS:, -NO_OF_TAGS:]
		Zu = X[-NO_OF_TAGS:]
		B12 = B[-NO_OF_TAGS:, :-NO_OF_TAGS]
		V11_inv = np.linalg.inv(V[-NO_OF_TAGS:, -NO_OF_TAGS:]) 
		Xu = V11_inv.dot(B11.dot(Zu) + (B12 - V12).dot(Xa)) 

		# merge known anchors config with new tags config 
		X = np.concatenate((Xa, Xu))
		last_n_configs.append(X)

		#X = (1/n_samples)*B.dot(X)

		#dis = np.sqrt((X ** 2).sum(axis=1)).sum()
		dis = (weights*dis**2).sum() / 2
		if verbose >= 2:
			print('it: %d, stress %s' % (it, stress))
		if old_stress is not None:
			if(old_stress - stress / dis) < eps:
				if verbose:
					print('breaking at iteration %d with stress %s' % (it,
																	   stress))
				break
		old_stress = stress / dis
	return X, stress, it + 1, np.array(last_n_configs)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

n = 100
x = np.arange(n)
rs = check_random_state(0)
y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))

###############################################################################
# Fit IsotonicRegression and LinearRegression models

ir = IsotonicRegression()

y_ = ir.fit_transform(x, y)

lr = LinearRegression()
lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

###############################################################################
# plot result

segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(y)))
lc.set_linewidths(0.5 * np.ones(n))

fig = plt.figure()
plt.plot(x, y, 'r.', markersize=12)
plt.plot(x, y_, 'g.-', markersize=12)
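
# The plotting example above appears truncated; a plausible completion, assuming
# it follows the standard scikit-learn isotonic-regression demo (otherwise the
# line collection is never added to the axes), would be:
plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
plt.gca().add_collection(lc)
plt.legend(('Data', 'Isotonic fit', 'Linear fit'), loc='lower right')
plt.title('Isotonic regression')
plt.show()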
Exemplo n.º 56
0
test = pd.read_csv(r'csvs\submit_xB.csv')
testId = test.id.values
test = test.drop('id', axis=1)

## one-vs-all: fit one isotonic calibrator per class on the cross-validated predictions
ir1 = IsotonicRegression()
ir2 = IsotonicRegression()
ir3 = IsotonicRegression()
ir4 = IsotonicRegression()
ir5 = IsotonicRegression()
ir6 = IsotonicRegression()
ir7 = IsotonicRegression()
ir8 = IsotonicRegression()
ir9 = IsotonicRegression()

# positional column access assumed (.iloc replaces the removed DataFrame.ix)
y_1 = ir1.fit_transform(cv10fold.iloc[:,0], y_train.iloc[:,0])
y_2 = ir2.fit_transform(cv10fold.iloc[:,1], y_train.iloc[:,1])
y_3 = ir3.fit_transform(cv10fold.iloc[:,2], y_train.iloc[:,2])
y_4 = ir4.fit_transform(cv10fold.iloc[:,3], y_train.iloc[:,3])
y_5 = ir5.fit_transform(cv10fold.iloc[:,4], y_train.iloc[:,4])
y_6 = ir6.fit_transform(cv10fold.iloc[:,5], y_train.iloc[:,5])
y_7 = ir7.fit_transform(cv10fold.iloc[:,6], y_train.iloc[:,6])
y_8 = ir8.fit_transform(cv10fold.iloc[:,7], y_train.iloc[:,7])
y_9 = ir9.fit_transform(cv10fold.iloc[:,8], y_train.iloc[:,8])

#container
cv10fold.calibrated = pd.DataFrame({'id' : id
                , 'Class_1' : y_1
                , 'Class_2' : y_2
                , 'Class_3' : y_3
                , 'Class_4' : y_4
Exemplo n.º 57
0
def IsotonicRegression_pred(y_train, predictions_train, test_preds, bin_step, y_test):
    # Y Training Target sort the y_test
    # X Training Data use the indexes of sorted(y_test)
    # y_train_len=len(y_train)

    # if bin_step<1:
    #     step_count = 1/bin_step
    # else:
    #     step_count = int(math.floor(y_train_len/bin_step))

    # step_element_count = int(math.floor(y_train_len/step_count))

    # bin_start_indexes=np.array(range(0,step_count))*step_element_count

    predictions_np = np.array(predictions_train, float)
    predictions_sorted = np.sort(predictions_np)
    predictions_sorted_indexes = predictions_np.argsort()

    y_train_arranged = np.array(y_train, float)[predictions_sorted_indexes].ravel()
    # not_binned_y_train_arranged         =   y_train_arranged[:]

    # for index in range(len(bin_start_indexes)-1):
    #     pin  = bin_start_indexes[index]
    #     pend = bin_start_indexes[index+1]
    #     y_train_arranged[pin:pend] = np.average(y_train_arranged[pin:pend])
    # if bin_start_indexes[-1]<y_train_len:
    #     pin  = bin_start_indexes[-1]
    #     pend = y_train_len
    #     y_train_arranged[pin:pend] = np.average(y_train_arranged[pin:pend])

    ir = IsotonicRegression()

    y_ir = ir.fit_transform(predictions_sorted, y_train_arranged)
    y_ir_pred = ir.predict(predictions_sorted)

    # print "min(y_train_arranged)    :",    min(y_train_arranged)
    # print "max(y_train_arranged)    :",    max(y_train_arranged)
    # print "min(predictions_sorted)  :",    min(predictions_sorted)
    # print "max(predictions_sorted)  :",    max(predictions_sorted)
    # print "min(test_preds)          :",    min(test_preds)
    # print "max(test_preds)          :",    max(test_preds)
    # if max(test_preds)>=max(y_train_arranged):
    # np.array(test_preds > max(y_train_arranged)) == True

    max_indexes = np.array((np.where(test_preds > max(y_train_arranged))), int).ravel()
    if len(max_indexes) != 0:
        for m_i in max_indexes:
            test_preds[m_i] = max(y_train_arranged)

    test_preds_sorted = np.sort(np.array(test_preds))

    predictions_ir = ir.predict(test_preds)

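    # With the default out_of_bounds='nan', IsotonicRegression.predict returns
    # NaN for inputs outside the fitted range; replace those with the smallest
    # finite prediction.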
    ind = np.where(np.isnan(predictions_ir))[0]
    preds_test_min = np.nanmin(predictions_ir)
    if len(ind) != 0:
        for i in ind:
            predictions_ir[i] = preds_test_min

    # ==============WRITING TO CSV================
    # d_train={'y_train'          :np.array(y_train,float)[predictions_sorted_indexes].ravel(),
    #          'y_train_bin'      :np.array(y_train_arranged).ravel(),
    #          'train_preds'      :np.array(predictions_sorted).ravel(),
    #          'train_preds_ir'   :y_ir}

    # df_train=pd.DataFrame(d_train)
    # df_train.to_csv("train_IR.csv")

    # d_test={'y_test'            :np.array(y_test).ravel(),
    #         'test_preds'        :np.array(test_preds).ravel(),
    #         'test_preds_ir'     :predictions_ir}
    # df_test=pd.DataFrame(d_test)
    # df_test.to_csv("test_IR.csv")

    # score_test_ir=ir.score(test_preds,y_test)
    score_test_ir = 0

    return predictions_ir, y_ir_pred, ir.get_params(deep=True), score_test_ir
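
# Hedged usage sketch for IsotonicRegression_pred above, on toy data only.
# The last return value (the score) is always 0 in the current implementation.
import numpy as np
from sklearn.isotonic import IsotonicRegression

_rng = np.random.RandomState(0)
_y_tr = np.sort(_rng.rand(50))
_preds_tr = _y_tr + 0.1 * _rng.randn(50)     # noisy, roughly monotone scores
_preds_te = _rng.rand(20)

_calibrated, _train_fit, _params, _ = IsotonicRegression_pred(
    _y_tr, _preds_tr, _preds_te, bin_step=0.1, y_test=None)
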
def interpolation_estimate(Z, Z_constraint,
                           lower=0.5,
                           upper=4,
                           npts=30,
                           ndraw=5000,
                           burnin=1000,
                           estimator='truncated'):
    """
    Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$
    where $C$ is the convex set encoded by `Z_constraint`

    .. math::

       C = \left\{z: Az+b \geq 0 \right\}

    with $(A,b)$ being `(Z_constraints.inequality, 
    Z_constraints.inequality_offset)`.

    The algorithm proceeds by estimating $E[\|Z\|^2_2]$
    by Monte Carlo for a range of `npts` values of $\sigma$, starting from
    `lower*np.linalg.norm(Z)/np.sqrt(n)` to
    `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`.

    These values are then used to compute the GCM
    (Greatest Convex Minorant), which is interpolated and solved
    for an argument at which the expected value matches the observed
    value `(Z**2).sum()`.

    Parameters
    ----------

    Z : `np.float`
        Observed data to be used to estimate $\sigma$. Should be in
        the cone specified by `Z_constraints`.

    Z_constraint : `constraints`
        Constraints under which we observe $Z$.

    lower : float
        Multiple of naive estimate to use as lower endpoint.

    upper : float
        Multiple of naive estimate to use as upper endpoint.

    npts : int
        Number of points in interpolation grid.

    ndraw : int
        Number of Gibbs steps to use for estimating
        each expectation.

    burnin : int
        How many Gibbs steps to use for burning in.

    Returns
    -------

    sigma_hat : float
        The root of the interpolant derived from GCM values.

    interpolant : `interp1d`
        The interpolant, to be used for plotting or other 
        diagnostics.

    WARNING
    -------

    * It is assumed that `Z_constraints.equality` is `None`.
    
    * The GCM step is approximated here with `IsotonicRegression`; the
      original `rpy2`/`fdrtool` code path is commented out below.

    """

    initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0])

    Svalues = np.linspace(lower*initial,upper*initial, npts)
    Evalues = []

    n = Z.shape[0]
    L, V, U, S = quadratic_bounds(Z, np.identity(n), Z_constraint)

    if estimator == 'truncated':
        def _estimator(S, Z, Z_constraint):
            L, V, U, _ = quadratic_bounds(Z, np.identity(n), Z_constraint)
            num = mpquad(lambda x: mpexp(-x**2/(2*S**2) -L*x / S**2 + (n-1) * mplog((x+L)/S) + 2 * mplog(x+L)),
                       [0, U-L])
            den = mpquad(lambda x: mpexp(-x**2/(2*S**2) -L*x / S**2 + (n-1) * mplog((x+L)/S)),
                       [0, U-L])
            print(num / den, V**2, S, (L, U))
            return num / den
    elif estimator == 'simulate':
        
        state = Z.copy()
        rpy.r.assign('state', state)
        def _estimator(S, state, Z_constraint):
            Z_constraint.covariance = S**2 * np.identity(Z.shape[0])
            e, v, _state = expected_norm_squared(state, 
                                               Z_constraint, ndraw=ndraw,
                                               burnin=burnin)            
            state[:] = _state
            return e

    state = Z.copy()
    for S in Svalues:
        Evalues.append(_estimator(S, state, Z_constraint))
    ir = IsotonicRegression()
    if DEBUG:
        print(Svalues, Evalues)
    Eiso = ir.fit_transform(Svalues, Evalues)
    Sinterp, Einterp = Svalues, Eiso
#     rpy.r.assign('S', Svalues)
#     rpy.r.assign('E', np.array(Evalues))
#     rpy.r('''
#     library(fdrtool);
#     G = gcmlcm(S, E, 'gcm');
#     Sgcm = G$x.knots;
#     Egcm = G$y.knots;
#     ''')
#     Sgcm = np.asarray(rpy.r('Sgcm'))
#     Egcm = np.asarray(rpy.r('Egcm'))
#     interpolant = interp1d(Sgcm, Egcm - (Z**2).sum())

    interpolant = interp1d(Sinterp, Einterp - (Z**2).sum())
    try:
        sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max())
    except ValueError:
        raise ValueError('''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)''' % ((Z**2).sum(), Einterp.min(), Einterp.max()))
    return sigma_hat, interpolant
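
# Hedged illustration (not part of interpolation_estimate): the pattern above
# (evaluate E[||Z||^2] on a grid of candidate sigmas, enforce monotonicity with
# isotonic regression, interpolate, then root-find where the curve crosses the
# observed value) shown on a synthetic monotone-but-noisy curve. All values
# below are made up for illustration.
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import bisect
from sklearn.isotonic import IsotonicRegression

_rng = np.random.RandomState(0)
_S_grid = np.linspace(0.5, 4.0, 30)
_E_grid = _S_grid ** 2 + 0.05 * _rng.randn(30)   # noisy stand-in for E[||Z||^2]
_observed = 4.0                                  # stands in for (Z ** 2).sum()

_E_iso = IsotonicRegression().fit_transform(_S_grid, _E_grid)
_interp = interp1d(_S_grid, _E_iso - _observed)
_sigma_hat = bisect(_interp, _S_grid.min(), _S_grid.max())
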
def truncated_estimate(Z, Z_constraint,
                      lower=0.5,
                      upper=2,
                      npts=15):
    """
    Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$
    where $C$ is the convex set encoded by `Z_constraints`

    .. math::

       C = \left\{z: Az+b \geq 0 \right\}

    with $(A,b)$ being `(Z_constraints.inequality, 
    Z_constraints.inequality_offset)`.

    The algorithm proceeds by estimating $E[\|Z\|^2_2]$
    via a truncated-$\chi$ integral (`scipy.integrate.quad`) for a range of
    `npts` values of $\sigma$, starting from
    `lower*np.linalg.norm(Z)/np.sqrt(n)` to
    `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`.

    These values are then used to compute the GCM
    (Greatest Convex Minorant), which is interpolated and solved
    for an argument at which the expected value matches the observed
    value `(Z**2).sum()`.

    Parameters
    ----------

    Z : `np.float`
        Observed data to be used to estimate $\sigma$. Should be in
        the cone specified by `Z_constraints`.

    Z_constraint : `constraints`
        Constraints under which we observe $Z$.

    lower : float
        Multiple of naive estimate to use as lower endpoint.

    upper : float
        Multiple of naive estimate to use as upper endpoint.

    npts : int
        Number of points in interpolation grid.

    Returns
    -------

    sigma_hat : float
        The root of the interpolant derived from GCM values.

    interpolant : `interp1d`
        The interpolant, to be used for plotting or other 
        diagnostics.

    WARNING
    -------

    * It is assumed that `Z_constraints.equality` is `None`.
    
    * The GCM step is approximated with `IsotonicRegression`; no `rpy2`/`fdrtool`
      call is made here.

    """

    initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0])

    Svalues = np.linspace(lower*initial,upper*initial, npts)
    Evalues = []

    # use truncated chi to estimate integral
    # with scipy.integrate.quad
    n = Z.shape[0]
    operator = np.identity(n)
    L, V, U, S = quadratic_bounds(Z, operator, Z_constraint)

    for S in Svalues:
        num = quad(lambda x: np.exp(-x**2/(2*S**2) + (n+1) * np.log(x)),
                   L, U)
        den = quad(lambda x: np.exp(-x**2/(2*S**2) + (n-1) * np.log(x)),
                   L, U)
        Evalues.append(num[0] / den[0])
        print(num, den)

    ir = IsotonicRegression()
    if DEBUG:
        print(Svalues, Evalues)
    Eiso = ir.fit_transform(Svalues, Evalues)
    Sinterp, Einterp = Svalues, Eiso


    interpolant = interp1d(Sinterp, Einterp - (Z**2).sum())
    try:
        sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max())
    except ValueError:
        raise ValueError('''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)''' % ((Z**2).sum(), Einterp.min(), Einterp.max()))
    return sigma_hat, interpolant


    print(L, V, U, S)