def FitToDelayData(
        DelayValues, timerange=1000, GenerateImages=True, verbose=0):
    '''
    Fit a Gaussian to a histogram of delay values and bootstrap the errors.

    The values are histogrammed over ``(-timerange, timerange)``, empty
    bins are discarded, and ``normfit`` is run on the bin centres with
    ``sqrt(freq)`` as the per-bin uncertainty.  The location and scale
    errors are then derived from ``btp.ci`` applied to the raw values
    whose magnitude is below 500.

    Parameters
    ----------
    DelayValues : sequence of numbers
        Raw delay values.
    timerange : number
        Half-width of the histogram range.
    GenerateImages : bool
        Accepted for interface compatibility; not used in this function.
    verbose : int
        Forwarded to ``normfit``.

    Returns
    -------
    tuple
        ``(param, (locerr, scaleerr, p3err))`` where ``param`` is what
        ``normfit`` returned and ``p3err`` is the third diagonal entry of
        the matrix ``normfit`` returned alongside it.
    '''
    # The bin count has to be an integer.  Plain ``/`` yields a float on
    # Python 3, so use floor division (identical result on Python 2).
    nbins = int(2 * timerange // 25) + 1
    freq, binedges = histogram(
        DelayValues, bins=nbins, range=(-timerange, timerange))
    centres = 0.5 * (binedges[1:] + binedges[:-1])

    # Empty bins would carry a zero sqrt(freq) uncertainty into the fit.
    populated = freq > 0
    centres = centres[populated]
    freq = freq[populated]

    (param, err), chival = normfit(centres, freq, yerr=sqrt(freq),
                                   ScaleGuess=100, verbose=verbose)  # fit to CTR peak

    DelayValues = array(DelayValues)
    fRawData = DelayValues[abs(DelayValues) < 500]

    # Upper confidence bound minus the point estimate, divided by 1.96.
    _, CIUpper = btp.ci(fRawData, std)
    scaleerr = (CIUpper - std(fRawData)) / 1.96

    _, CIUpper = btp.ci(fRawData, mean)
    locerr = (CIUpper - mean(fRawData)) / 1.96

    # Only the third diagonal term is reported; the first two are replaced
    # by the bootstrap estimates above.
    _, _, p3err = err.diagonal()

    return param, (locerr, scaleerr, p3err)
def ScikitsBootstrap(fdf, loc=0, scale=100,
                     leftsigma=5, rightsigma=5, minsamples=100, verbose=1):
    '''
    Bootstrap the location and scale errors of ``fdf.Ampl``.

    The amplitudes are first clipped to the open window
    ``(loc - leftsigma * scale, loc + rightsigma * scale)``.  If fewer
    than ``minsamples`` values survive, ``(1e12, 1e12, 1e12)`` is returned
    as a sentinel.  Otherwise ``btp.ci`` is run with ``std`` and then
    ``mean`` as the statistic, and each error is the upper bound minus the
    point estimate, divided by 1.96.

    Returns ``(locerr, scaleerr, amperr)``; ``amperr`` is always 0.
    '''
    lower_cut = loc - leftsigma * scale
    upper_cut = loc + rightsigma * scale
    fRawData = fdf.Ampl[(fdf.Ampl > lower_cut) & (fdf.Ampl < upper_cut)]

    chatty = verbose > 0
    if chatty:
        print("number of samples", len(fRawData))

    if len(fRawData) < minsamples:
        if chatty:
            print("insufficient data")
        return (1e12, 1e12, 1e12)

    # Scale first, then location: keeps the bootstrap draw order stable.
    _, std_upper = btp.ci(fRawData, std)
    scaleerr = (std_upper - std(fRawData)) / 1.96

    _, mean_upper = btp.ci(fRawData, mean)
    locerr = (mean_upper - mean(fRawData)) / 1.96

    amperr = 0  # currently ignored
    return (locerr, scaleerr, amperr)
Пример #3
0
def bootstrap_error( data, n_samples=None ):
    """Return a confidence interval for the mean of ``data``.

    ``bootstrap.ci`` is tried first (with ``n_samples`` when given).  If it
    raises ``ValueError``, a manual fallback draws 1000 resamples with
    replacement, takes the mean of each, and returns a 95% normal interval
    built from those means.

    Parameters
    ----------
    data : array-like
        Sample values.
    n_samples : int, optional
        Number of bootstrap resamples passed to ``bootstrap.ci``; the
        library default is used when falsy.

    Returns
    -------
    The interval returned by ``bootstrap.ci``, or the ``(low, high)`` tuple
    from ``stats.norm.interval`` in the fallback path.
    """
    x = np.array(data)

    try:
        # np.mean replaces the deprecated scipy.mean alias (same function).
        if n_samples:
            return bootstrap.ci(data, np.mean, n_samples=n_samples)
        return bootstrap.ci(data, np.mean)
    except ValueError:
        pass  # fall through to the manual resampling below

    size = len(x)
    resampled_means = [
        np.mean(x[np.random.randint(size, size=size)])
        for _ in range(1000)  ## do this 1000 times
    ]
    mean_X = np.mean(resampled_means)
    # NOTE(review): the scale is the standard error *of the resampled
    # means* (their std / sqrt(1000)), which looks much narrower than the
    # spread of the bootstrap distribution itself -- confirm this is
    # intended before relying on the fallback interval.
    return stats.norm.interval(0.95, loc=mean_X,
                               scale=stats.sem(resampled_means))
Пример #4
0
def _test_bootci(n_samples=10000, method='bca'):
    """Print and time ``bootci_nb`` against ``scikits.bootstrap.ci``.

    A seeded 1000x5 normal sample is generated.  ``bootci_nb`` is run once
    with a jitted statistic returning the mean of column 0, the median of
    column 1 and the maximum of column 2; the same three statistics are
    then computed one at a time with ``scikits.bootstrap``.  Results and
    wall-clock times are printed; nothing is returned or asserted.
    """
    import scikits.bootstrap as boot
    import time

    np.random.seed(110820)
    dat = np.random.randn(1000, 5)

    @jit(nopython=True)
    def func(d):
        return np.array([np.mean(d[:, 0]), np.median(d[:, 1]), np.max(d[:, 2])])

    st = time.time()
    res = bootci_nb(dat, func, alpha=0.05, n_samples=n_samples, method=method)
    et = (time.time() - st)
    print(res)
    print('Time: %1.2f sec' % et)

    st = time.time()
    a = boot.ci(dat[:, 0], statfunction=np.mean, n_samples=n_samples, method=method)
    b = boot.ci(dat[:, 1], statfunction=np.median, n_samples=n_samples, method=method)
    c = boot.ci(dat[:, 2], statfunction=np.max, n_samples=n_samples, method=method)
    et = (time.time() - st)

    print('Mean_0', a)
    print('Median_1', b)
    # The third statistic is np.max, so label it as such.
    print('Max_2', c)
    print('Time: %1.2f sec' % et)
Пример #5
0
def _test_bootci_pd(n_samples=10000, method='bca'):
    """Print and time ``bootci_pd`` against ``scikits.bootstrap.ci``.

    A 100x5 normal DataFrame is generated (unseeded).  ``bootci_pd`` is run
    with a statistic returning the mean of column 0 and the median of
    column 1; the same two statistics are then computed separately with
    ``scikits.bootstrap``.  Results and wall-clock times are printed;
    nothing is returned or asserted.
    """
    import scikits.bootstrap as boot
    import time

    df = pd.DataFrame(np.random.randn(100, 5))

    def func(d):
        return {'MeanA': d[0].mean(), 'MedianB': np.median(d[1])}

    started = time.time()
    res = bootci_pd(df, func, alpha=0.05, n_samples=n_samples, method=method)
    elapsed = time.time() - started
    print(res)
    print('Time: %1.2f sec' % elapsed)

    shared = dict(n_samples=n_samples, method=method)
    started = time.time()
    a = boot.ci(df[0].values, statfunction=np.mean, **shared)
    b = boot.ci(df[1].values, statfunction=np.median, **shared)
    elapsed = time.time() - started

    print('MeanA', a)
    print('MedianB', b)
    print('Time: %1.2f sec' % elapsed)
def fit_learning_curve(data, length=10, user_length=None, context_answer_limit=100, reverse=False, bootstrap_samples=100):
    """Fit a power-law learning curve and bootstrap its spread per attempt.

    ``reference_series`` builds the input series; ``bootstrap.ci`` (method
    ``'pi'``) then calls the fitting routine on each resample.  Each fit
    appends its predicted value for every attempt to a per-attempt list,
    and those lists are finally summarised.

    Returns
    -------
    list of dict
        One dict per attempt with ``'value'`` (median of the fitted
        values) and ``'confidence_interval_min'`` / ``'..._max'`` (2nd and
        98th percentiles).  An empty list is returned if anything in the
        bootstrap or the fit raises.
    """
    confidence_vals = [[] for _ in range(length)]

    def _learn_fun(attempt, a, k):
        return a * (1.0 / (attempt + 1) ** k)

    def _fit_learning_curve(series):
        # Real lists are required: ``map`` objects support neither len()
        # nor indexing on Python 3.
        references_by_attempt = [
            [r for r in references if r is not None]
            for references in zip(*series)
        ]
        learning_curve = [(numpy.mean(xs), len(xs)) for xs in references_by_attempt]

        opt, _ = curve_fit(
            _learn_fun,
            numpy.arange(len(learning_curve)),
            numpy.array([avg for avg, _count in learning_curve]),
            # Attempts with more observations get a smaller sigma.
            sigma=numpy.array([1.0 / numpy.sqrt(count + 1) for _avg, count in learning_curve])
        )
        fit = [_learn_fun(attempt, opt[0], opt[1]) for attempt in range(len(learning_curve))]
        for i, r in enumerate(fit):
            confidence_vals[i].append(r)
        return fit[-1]

    def _aggr(rs):
        return {
            'value': numpy.median(rs),
            'confidence_interval_min': numpy.percentile(rs, 2),
            'confidence_interval_max': numpy.percentile(rs, 98),
        }

    series = reference_series(data, length=length, user_length=user_length,
        context_answer_limit=context_answer_limit, reverse=reverse)
    try:
        # The interval itself is discarded; the call is made for its side
        # effect of filling ``confidence_vals``.
        bootstrap.ci(series, _fit_learning_curve, method='pi', n_samples=bootstrap_samples)
        return [_aggr(rs) for rs in confidence_vals]
    except Exception:
        # Best effort: any failure yields an empty result, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return []
Пример #7
0
 def test_bca_errorbar_output_simple(self):
     """'errorbar' output must equal |average - default interval|."""
     seed = 1234567890

     np.random.seed(seed)
     lowhigh = boot.ci(self.data)

     # Re-seed so both calls see the same resamples.
     np.random.seed(seed)
     errorbar = boot.ci(self.data, output='errorbar')

     centre = np.average(self.data)
     expected = abs(centre - lowhigh)[np.newaxis]
     np.testing.assert_array_almost_equal(errorbar.T, expected)
Пример #8
0
def ScikitsBootstrap(fdf):
    """Bootstrap the location and scale errors of ``fdf.counts``.

    ``btp.ci`` is run with ``std`` and then ``mean`` as the statistic; each
    error is the upper bound minus the point estimate, divided by 1.96.

    Returns ``(locerr, scaleerr, amperr)``; ``amperr`` is always 0.
    """
    counts = fdf.counts

    _, std_upper = btp.ci(counts, std)
    scaleerr = (std_upper - std(counts)) / 1.96

    _, mean_upper = btp.ci(counts, mean)
    locerr = (mean_upper - mean(counts)) / 1.96

    amperr = 0  # currently ignored
    return (locerr, scaleerr, amperr)
Пример #9
0
 def test_pi_multi_2dout_multialpha(self):
     """Multi-output linregress intervals must match per-output runs."""
     seed = 1234567890
     options = dict(alpha=(0.1,0.2,0.8,0.9), n_samples=2000, method='pi')

     def first_output(a):
         return stats.linregress(a)[0]

     def second_output(a):
         return stats.linregress(a)[1]

     np.random.seed(seed)
     joint = boot.ci((self.x,self.y), stats.linregress, **options)
     np.random.seed(seed)
     only_first = boot.ci(np.vstack((self.x,self.y)).T, first_output, **options)
     np.random.seed(seed)
     only_second = boot.ci(np.vstack((self.x,self.y)).T, second_output, **options)

     np.testing.assert_array_almost_equal(joint[:,0], only_first)
     np.testing.assert_array_almost_equal(joint[:,1], only_second)
Пример #10
0
    def print_switches_per_model(self,
                                 models=('early', 'enes', 'esen'),
                                 l2_epoch=25,
                                 print_total=True,
                                 n_sample=10000,
                                 print_per_lang=True,
                                 switch_type=('alternational', 'insertional',
                                              'ambiguous')):
        """Print mean code-switch percentages with bootstrap intervals.

        For every model the performance CSV(s) under
        ``{results_dir}/{model}{fname_suffix}/`` are read and restricted to
        a single epoch: ``l2_epoch`` for the ``'early'`` model, otherwise
        the largest epoch in the file.  The summed percentage and each
        entry of ``switch_type`` are printed with the interval returned by
        ``boot.ci`` (``n_sample`` resamples).  With ``print_per_lang`` the
        same report is repeated per entry of ``self.languages``, filtered
        on the ``switch_from`` column.
        """

        def load_selected_epoch(model, csv_name):
            # Read one results file and keep only the epoch of interest.
            frame = pd.read_csv(
                f'{self.results_dir}/{model}{self.fname_suffix}/{csv_name}',
                index_col=None,
                header=0,
                skipinitialspace=True,
                dtype={'epoch': int})
            wanted = l2_epoch if model == 'early' else frame.epoch.max()
            return frame[frame.epoch == wanted]

        def report(frame, total_label):
            # The total always sums the three fixed percentage columns,
            # independent of ``switch_type``.
            combined = (frame['alternational_percentage'] +
                        frame['insertional_percentage'] +
                        frame['ambiguous_percentage'])
            low, high = boot.ci(combined, n_samples=n_sample)
            print(total_label, round(combined.mean(), 1), 'CI:',
                  round(low, 1), round(high, 1))
            for stype in switch_type:
                column = frame[f'{stype}_percentage']
                low, high = boot.ci(column, n_samples=n_sample)
                print(stype, round(column.mean(), 1), 'CI:',
                      round(low, 1), round(high, 1))

        for m in models:
            print(m)
            if print_total:
                totals = load_selected_epoch(m, 'performance.csv')
                print(totals.epoch.max())
                report(totals, 'TOTAL')

            if print_per_lang:
                by_lang = load_selected_epoch(m, 'performance_per_lang.csv')
                for lang in self.languages:
                    print('per lang:', lang)
                    report(by_lang[by_lang.switch_from == lang],
                           'TOTAL per lang')
Пример #11
0
 def test_pi_multi_2dout_multialpha(self):
     """Multi-output polyfit intervals must match per-coefficient runs."""
     seed = 1234567890
     options = dict(alpha=(0.1,0.2,0.8,0.9), n_samples=2000, method='pi')

     def both_coefficients(a, b):
         return np.polyfit(a, b, 1)

     def first_coefficient(a):
         return np.polyfit(a[:,0], a[:,1], 1)[0]

     def second_coefficient(a):
         return np.polyfit(a[:,0], a[:,1], 1)[1]

     np.random.seed(seed)
     joint = boot.ci((self.x,self.y), both_coefficients, **options)
     np.random.seed(seed)
     only_first = boot.ci(np.vstack((self.x,self.y)).T, first_coefficient, **options)
     np.random.seed(seed)
     only_second = boot.ci(np.vstack((self.x,self.y)).T, second_coefficient, **options)

     np.testing.assert_array_almost_equal(joint[:,0], only_first)
     np.testing.assert_array_almost_equal(joint[:,1], only_second)
Пример #12
0
 def test_bca_multi_multialpha(self):
     """Tuple input and stacked 2-D input must give the same intervals."""
     seed = 1234567890
     options = dict(alpha=(0.1, 0.2, 0.8, 0.9), n_samples=1000)

     def from_pair(a, b):
         return stats.linregress(a, b)[1]

     def from_stack(a):
         return stats.linregress(a)[1]

     np.random.seed(seed)
     paired = boot.ci((self.x, self.y), from_pair, **options)

     np.random.seed(seed)
     stacked = boot.ci(np.vstack((self.x, self.y)).T, from_stack, **options)

     np.testing.assert_array_almost_equal(paired, stacked)
Пример #13
0
 def test_bca_multi_multialpha(self):
     """Tuple input and stacked 2-D input must give the same polyfit CIs."""
     seed = 1234567890
     options = dict(alpha=(0.1, 0.2, 0.8, 0.9), n_samples=1000)

     def from_pair(a, b):
         return np.polyfit(a, b, 1)

     def from_stack(a):
         return np.polyfit(a[:, 0], a[:, 1], 1)

     np.random.seed(seed)
     paired = boot.ci((self.x, self.y), from_pair, **options)

     np.random.seed(seed)
     stacked = boot.ci(np.vstack((self.x, self.y)).T, from_stack, **options)

     np.testing.assert_array_almost_equal(paired, stacked)
Пример #14
0
def esci_indep_cohens_d(data1, data2, n_boot=5000, has_preds=False):
    '''Compute Cohen's d and a bootstrap confidence interval for it.

    Parameters
    ----------
    data1 : np.ndarray
        Values for the "high" group (for example diagnosed participants).
    data2 : np.ndarray
        Values for the "low" group (for example healthy controls).
    n_boot : int
        Number of bootstrap resamples.
    has_preds : bool
        Whether predictors are included in the data.  If so, the first
        column of ``data1`` / ``data2`` holds the group values and the
        remaining columns are regression predictors, the last one being
        the predictor of interest (group membership) and the others
        confounds.

    Returns
    -------
    stats : dict
        * ``stats['es']`` - effect size.
        * ``stats['ci']`` - confidence interval for the effect size.
        * ``stats['bootstraps']`` - bootstrap effect size values.

    Notes
    -----
    Without predictors the computation is delegated to ``dabest`` and the
    BCa bounds are read from its results; with predictors the effect size
    is derived from a regression t value and bootstrapped with
    ``scikits.bootstrap``.
    '''
    if has_preds:
        from borsar.stats import compute_regression_t
        import scikits.bootstrap as boot

        def regression_Cohens_d(data1, data2):
            # Stack both groups, regress column 0 on the rest, and convert
            # the t value of the last predictor to d.
            stacked = np.concatenate([data1, data2], axis=0)
            predictors = stacked[:, 1:]
            tvals = compute_regression_t(stacked[:, [0]], predictors)
            return d_from_t_categorical(tvals[-1, 0], predictors)

        cohen_d = regression_Cohens_d(data1, data2)
        cohen_d_ci, bootstraps = boot.ci((data1, data2),
                                         regression_Cohens_d,
                                         multi='independent',
                                         n_samples=n_boot,
                                         return_dist=True)
        return dict(es=cohen_d, ci=cohen_d_ci, bootstraps=bootstraps)

    assert data2 is not None
    import dabest
    frame = utils.psd_to_df(data1, data2)
    loaded = dabest.load(frame,
                         idx=("controls", "diagnosed"),
                         x="group",
                         y="FAA",
                         resamples=n_boot)
    results = loaded.cohens_d.results
    return dict(
        es=results.difference.values[0],
        ci=(results.bca_low.values[0], results.bca_high.values[0]),
        bootstraps=results.bootstraps[0],
    )
Пример #15
0
def rci_boot(x, y, alpha=0.95, verbose=True, n_samples=10000, method='bca'):
	"""
	Calculate a confidence interval for the Pearson correlation coefficient r
	between two series 'x' and 'y' using the bootstrap method. It is helpful to
	compare the bootstrapped Confidence Intervals (CIs) for the Pearson correlation
	coefficient r with the CIs obtained with the more standard Fisher's transformation
	method, as suggested by Cox (2008).

	Parameters
	----------
	x, y : array-like
		The two series; converted with ``np.asanyarray``.
	alpha : float
		Confidence level (e.g. 0.95). Note that ``1 - alpha`` is what gets
		passed to ``bootstrap.ci`` as its ``alpha``.
	verbose : bool
		Print the interval when true.
	n_samples : int
		Number of bootstrap resamples.
	method : str
		Bootstrap method forwarded to ``bootstrap.ci``.

	Returns
	-------
	tuple
		``(xl, xu)``, the lower and upper bounds.

	References
	----------
	Cox (2008): Speaking Stata: Correlation with confidence, or Fisher's z revisited.
	The Stata Journal (2008) 8, Number 3, pp. 413-439.
	"""
	x,y = map(np.asanyarray, (x,y))
	## Bootstrapped confidence intervals.
	xl, xu = bootstrap.ci((x, y), statfunction=rcoeff, alpha=(1-alpha), n_samples=n_samples, method=method, multi=True)

	if verbose:
		# Call-style print with a single argument behaves the same on
		# Python 2 and Python 3.
		print("")
		print("Bootstrapped CI (xl,xu): (%.3f,%.3f)"%(xl, xu))
		print("")

	return (xl,xu)
Пример #16
0
def forcedChoicePlot(listenerAccuracies, listenerScores, mturkAccuracies, mturkScores, outFile, title, errorBars=False):
  """Plot listener accuracy per training iteration, one line per level.

  listenerAccuracies is an array of accuracy arrays, one per problem level.
  listenerScores holds, per level, one collection of scores per iteration;
  it is only read when errorBars is true, in which case the half-width of
  the ``boot.ci`` interval of each collection is used as the error bar.
  mturkAccuracies and mturkScores are accepted but not used here.
  The figure is saved to outFile as PDF and then shown.
  """
  matplotlib.rcParams.update({'font.size' : 20})
  lw = 4
  # plt.hold(True) dropped: it was removed from matplotlib, and drawing
  # onto the current axes is the default behaviour anyway.
  nListeners = len(listenerAccuracies)
  nIterations = len(listenerAccuracies[0]) - 1
  plt.axis([0, nIterations, 0, 1])
  plt.ylabel('Listener Accuracy')
  plt.xlabel('Training Iterations')
  for levelAccuracies, levelScores, lineColor in zip(listenerAccuracies, listenerScores, colors):
    if errorBars:
      yerrs = []
      for scores in levelScores:
        # NOTE(review): this skips the bootstrap whenever every score is
        # non-zero, not only when the scores are constant -- presumably
        # scores are 0/1; confirm.
        if np.array(scores).all():
          yerrs.append(0)
        else:
          interval = boot.ci(np.array(scores), np.average)
          yerrs.append((interval[1] - interval[0]) / 2.0)
      plt.errorbar(range(len(levelAccuracies)), levelAccuracies, yerr=yerrs, linewidth=lw, color=lineColor)
      # Call-style print works on both Python 2 and Python 3.
      print(lineColor)
      print(levelAccuracies)
    else:
      plt.plot(levelAccuracies, linewidth=lw, marker='o', color=lineColor)
  listenerTitles = ['Level %d' % level for level in range(nListeners)]
  plt.legend(listenerTitles, loc='lower right')
  plt.title(title)
  plt.savefig(outFile, format='pdf')
  plt.show()
Пример #17
0
def stats_per_group(x):
    """Summarise the per-``sid`` mean of the ``value`` column.

    ``x`` is grouped by ``sid`` and averaged, then the ``value`` column of
    the result is summarised.

    Returns
    -------
    pd.Series
        With entries ``median``, ``qtile`` (upper, lower quartile),
        ``mean``, ``whisk`` (lower, upper whisker at 1.5 IQR), ``err``
        (absolute whisker distances from the median) and ``ci`` (interval
        from ``bootstrap.ci`` with ``BOOTSTRAP_NUM`` resamples).
    """
    # Call-style print works on both Python 2 and Python 3.
    print('stats-per-group')

    x = x.groupby(['sid']).mean()
    x = x.value

    print(len(x))

    medians = np.median(x)
    lower_quartile, upper_quartile = np.percentile(x, [25, 75])
    iqr = upper_quartile - lower_quartile
    # Whiskers are the most extreme observations within 1.5 IQR.
    upper_whisker = x[x <= upper_quartile + 1.5 * iqr].max()
    lower_whisker = x[x >= lower_quartile - 1.5 * iqr].min()

    # Key order is kept as before so the resulting Series index is stable.
    res = {
        'median': medians,
        # NOTE(review): (upper, lower) here but (lower, upper) for
        # 'whisk' -- kept as is because callers may rely on it.
        'qtile': (upper_quartile, lower_quartile),
        'mean': np.average(x),
        'whisk': (lower_whisker, upper_whisker),
        'err': (np.abs(lower_whisker - medians),
                np.abs(upper_whisker - medians)),
        'ci': bootstrap.ci(x, n_samples=BOOTSTRAP_NUM),
    }

    return pd.Series(res)
Пример #18
0
	def totalNspks(self):
		"""
		Compute statistical comparisons of total nosepokes in no inhibition versus inhibition session of NpHR subjects

		Reads ``self.datadict['totalNspksControl']['NoInhib']`` and
		``self.datadict['totalNspksInhibited']['Inhibited']``.

		Return dictionary with keys ``controlMean``, ``controlSEM``,
		``controlCI``, ``inhibMean``, ``inhibSEM``, ``inhibCI`` (intervals from
		``bootstrap.ci`` on the mean) and ``p`` (the full result of
		``scipy.stats.ttest_rel`` on the two series).
		"""
		# numpy.mean replaces the deprecated scipy.mean alias (same function).
		import numpy as np

		control = self.datadict['totalNspksControl']['NoInhib']
		inhibited = self.datadict['totalNspksInhibited']['Inhibited']

		# Local name differs from the method name to avoid shadowing it.
		summary = {}
		summary['controlMean'] = control.mean()
		summary['controlSEM'] = control.sem()
		summary['controlCI'] = bootstrap.ci(data=control, statfunction=np.mean)
		summary['inhibMean'] = inhibited.mean()
		summary['inhibSEM'] = inhibited.sem()
		summary['inhibCI'] = bootstrap.ci(data=inhibited, statfunction=np.mean)
		summary['p'] = scipy.stats.ttest_rel(control, inhibited)

		return summary
Пример #19
0
	def meanNspksInhib(self):
		"""
		Compute statistical comparisons of mean nosepokes in laser versus simlaser in inhibition session

		Reads the ``simLaser`` and ``Laser`` entries of
		``self.datadict['meanNspksInhibited']``.

		Return dictionary with keys ``simMean``, ``simSEM``, ``simCI``,
		``laserMean``, ``laserSEM``, ``laserCI`` (intervals from
		``bootstrap.ci`` on the mean) and ``p`` (the full result of
		``scipy.stats.ttest_rel`` on the two series).
		"""
		# numpy.mean replaces the deprecated scipy.mean alias (same function).
		import numpy as np

		sim = self.datadict['meanNspksInhibited']['simLaser']
		laser = self.datadict['meanNspksInhibited']['Laser']

		# Local name differs from the method name to avoid shadowing it.
		summary = {}
		summary['simMean'] = sim.mean()
		summary['simSEM'] = sim.sem()
		summary['simCI'] = bootstrap.ci(data=sim, statfunction=np.mean)
		summary['laserMean'] = laser.mean()
		summary['laserSEM'] = laser.sem()
		summary['laserCI'] = bootstrap.ci(data=laser, statfunction=np.mean)
		summary['p'] = scipy.stats.ttest_rel(sim, laser)

		return summary
Пример #20
0
 def run_for_all(self, bound_response, sfs, fff, blank, n_samples=500):
     """Bootstrap ``BootFit.stat_for_all`` over ``bound_response``.

     A ``BootFit`` is configured with ``sfs`` and the ``mean`` attributes
     of ``fff`` / ``blank`` (``None`` when those are falsy).  On success
     ``self.interval`` holds the result of ``bootstrap.ci``; if the
     bootstrap raises, the message is written to stderr and
     ``self.interval`` is left untouched.  Either way ``self.mean`` and
     ``self.std`` are the nan-aware mean and standard deviation of the
     first ``n_samples`` entries of ``bf.rvs``.

     Returns ``self``.
     """
     bf = BootFit()
     bf.sfs = sfs
     bf.fff = fff.mean if fff else None
     bf.blank = blank.mean if blank else None
     self.n_samples = n_samples
     # Call-style print works on both Python 2 and Python 3.
     print('Performing {} samples... '.format(self.n_samples))

     failure = None
     try:
         self.interval = bootstrap.ci(
             data = bound_response,
             statfunction = bf.stat_for_all,
             n_samples = self.n_samples
         )
     except Exception as e:
         failure = e

     # Same summary whether or not the bootstrap succeeded: whatever
     # values were collected in bf.rvs are still used.
     stats = bf.rvs[:self.n_samples]
     self.mean, self.std = np.nanmean(stats), np.nanstd(stats)

     if failure is not None:
         sys.stderr.write(str(failure))
         sys.stderr.flush()

     print('{} unique preferred SF were made.'.format(len(set(stats))))
     return self
Пример #21
0
def confidence_intervals(system_scores, baseline_scores, gold_scores):
    """
    Bootstrap (BCa) the difference in Pearson correlation with the gold
    scores between a system and a baseline.

    :param system_scores: list of system's scores
    :param baseline_scores: list of baseline method's scores
    :param gold_scores: list of gold scores
    :return: dict with keys ``system`` and ``baseline`` (Pearson r of each
        against gold), ``delta`` (system minus baseline) and ``conf_int``
        (the bootstrap interval of that delta, as a list)
    """
    def correlation_gap(rows):
        # Columns: 0 = gold, 1 = system, 2 = baseline.
        gold = rows[:, 0]
        r_system = pearsonr(gold, rows[:, 1])[0]
        r_baseline = pearsonr(gold, rows[:, 2])[0]
        return r_system - r_baseline

    system_prs = pearsonr(gold_scores, system_scores)[0]
    baseline_prs = pearsonr(gold_scores, baseline_scores)[0]

    # One (gold, system, baseline) triple per item, so that resampling
    # keeps the three scores of an item together.
    triples = list(zip(gold_scores, system_scores, baseline_scores))
    interval = bootstrap.ci(triples, statfunction=correlation_gap, method='bca')

    result = {}
    result['system'] = system_prs
    result['baseline'] = baseline_prs
    result['delta'] = system_prs - baseline_prs
    result['conf_int'] = list(interval)
    return result
Пример #22
0
 def _metrics_stats_fn(preds_and_labels):
     """Return the mean of the metric values and its bootstrap interval.

     ``metric_fn`` and ``ci`` come from the enclosing scope; ``ci`` is a
     confidence level in percent and is converted to ``1 - ci / 100``
     before being passed to ``boot.ci`` as ``alpha``.
     """
     values = np.asarray(metric_fn(preds_and_labels))
     average = np.mean(values)
     significance = 1.0 - ci / 100.0
     interval = boot.ci(values, np.mean, alpha=significance)
     return average, interval
Пример #23
0
def flag_outlier(in_vec, thresh_percentage=95):
    """
    Flag the most extreme element if it falls outside a bootstrap interval.

    The candidate is the element with the largest squared distance from
    the mean of all *other* elements.  It is reported only if it lies
    outside the ``thresh_percentage`` percent interval that
    ``bootstrap.ci`` computes for the mean of the data.

    :param in_vec: sequence of numbers
    :param thresh_percentage: confidence level of the interval, in percent
    :return: index of the outlier, or None if there is none (or the input
        is empty)
    """
    in_vec = np.array(in_vec)
    if in_vec.shape[0] == 0:
        return None

    # find largest outlier: keep a running maximum of the leave-one-out
    # residual (comparing against the previous element's residual would
    # pick the last local increase instead of the maximum).
    outlier_ind = 0
    largest_resid = 0
    mask = np.ones(len(in_vec), dtype=bool)
    for i in range(in_vec.shape[0]):
        mask[i] = False
        l2_resid = (in_vec[i] - np.mean(in_vec[mask]))**2
        mask[i] = True

        if l2_resid > largest_resid:
            outlier_ind = i
            largest_resid = l2_resid

    # check if outlier is outside threshold percentage
    # bootstrap a ci from data at the requested level
    a_lvl = 1 - (thresh_percentage / 100.)
    CIs = bootstrap.ci(data=in_vec, statfunction=np.mean, alpha=a_lvl)
    if in_vec[outlier_ind] < CIs[0] or in_vec[outlier_ind] > CIs[1]:
        return outlier_ind
    return None
Пример #24
0
def rci_boot(x, y, alpha=0.95, verbose=True, n_samples=10000, method='bca'):
	"""
	Bootstrap a confidence interval for the Pearson correlation coefficient r
	of two series 'x' and 'y'. Comparing the bootstrapped interval with the
	one from the more standard Fisher's transformation is suggested by
	Cox (2008).

	'alpha' is the confidence level (e.g. 0.95); 1 - alpha is what
	bootstrap.ci receives as its own 'alpha'. The bounds are printed when
	'verbose' is true and returned as (xl, xu).

	References
	----------
	Cox (2008): Speaking Stata: Correlation with confidence, or Fisher's z revisited.
	The Stata Journal (2008) 8, Number 3, pp. 413-439.
	"""
	x = np.asanyarray(x)
	y = np.asanyarray(y)

	## Bootstrapped confidence intervals.
	bounds = bootstrap.ci(
		(x, y),
		statfunction=rcoeff,
		alpha=(1-alpha),
		n_samples=n_samples,
		method=method,
		multi=True,
	)
	xl, xu = bounds

	if not verbose:
		return (xl,xu)

	print("")
	print("Bootstrapped CI (xl,xu): (%.3f,%.3f)"%(xl, xu))
	print("")
	return (xl,xu)
Пример #25
0
def plot_serial(all_s, color, label=None, xk=None, nan=False):
    """Plot the column-wise mean of ``all_s`` with a shaded interval band.

    For each column of ``all_s`` a percentile interval (``alpha = 1 -
    0.68``) of the mean is computed with ``ci``; the band and the mean are
    drawn in degrees against ``xk`` (or the module-level ``xxx2`` when
    ``xk`` is None).  ``nan=True`` switches the statistic to ``nanmean``.
    The x label depends on the module-level ``type_ori`` flag.
    """
    average = nanmean if nan else np.mean
    xx = xxx2 if xk is None else xk

    stderr = array([
        ci(sb, statfunction=average, alpha=1 - 0.68, method="pi")
        for sb in (all_s).T
    ])

    # Only pass a label when one was given.
    band_options = dict(color=color, alpha=0.2)
    if label:
        band_options["label"] = label
    fill_between(xx, degrees(stderr[:, 0]), degrees(stderr[:, 1]),
                 **band_options)

    plot(xx, degrees(average(all_s, 0)), color=color)
    plot(xx, zeros(len(xx)), "k--", alpha=0.5)

    if type_ori:
        xlabel(r"relative orientation of previous trial ($^\circ$)")
    else:
        xlabel(r"relative color of previous trial ($^\circ$)")
    ylabel(r"error on current trial ($^\circ$)")
    #legend()
    sns.despine()
    ylim(-2, 3)
Пример #26
0
def bootstrapCI(data, statFunc=None, alpha=0.05, nPerms=10000, output='lowhigh', method='pi'):
    """Wrapper around a function in the scikits_bootstrap module:
        https://pypi.python.org/pypi/scikits.bootstrap

    Parameters
    ----------
    data : np.ndarray
        Data for computing the confidence interval.
    statFunc : function
        Should take data and operate along axis=0.
        Defaults to ``np.nanmean`` along axis 0.
    alpha : float
        Returns the [alpha/2, 1-alpha/2] percentile confidence intervals.
    nPerms : int
        Number of bootstrap resamples.
    output : str
        Use 'lowhigh' or 'errorbar', for matplotlib errorbars
    method : str
        Bootstrap method forwarded to ``ci``.

    Returns
    -------
    np.ndarray
        First axis of length 2: the low/high bounds for 'lowhigh', or
        their non-negative distances from ``statFunc(data)`` for
        'errorbar'.  Filled with NaN if ``ci`` raises ``IndexError``."""
    if statFunc is None:
        statFunc = partial(np.nanmean, axis=0)
    try:
        out = ci(data=data, statfunction=statFunc, alpha=alpha, n_samples=nPerms, output='lowhigh', method=method)
    except IndexError:
        # Same shape as a successful result: data's shape with the first
        # axis replaced by (low, high).
        shp = list(data.shape)
        shp[0] = 2
        out = np.nan * np.ones(shp)

    if output == 'errorbar':
        # matplotlib expects non-negative offsets from the centre; this
        # matches scikits.bootstrap's own 'errorbar' output and also works
        # when ``out`` is one-dimensional.
        mu = statFunc(data)
        out = np.abs(out - mu)
    return out
Пример #27
0
def plot(data_arr, data_shuf_arr, data_err_arr, file_desc='sample'):
    """Plot mean accuracy with bootstrap bands for three conditions.

    Each argument is a 2-D array whose first five rows are the plotted
    bins.  For every row the ``boot.ci`` interval of the mean (1000
    resamples) forms the shaded band and the row mean the line: red for
    ``data_arr``, black for ``data_shuf_arr``, blue for ``data_err_arr``.
    The figure is written to ``4su_stp_decoding_{file_desc}.pdf`` and
    returned.
    """
    rcParams.update({}) if False else None  # no-op placeholder removed below
    rcParams['pdf.fonttype'] = 42
    rcParams['ps.fonttype'] = 42
    rcParams['font.family'] = 'sans-serif'
    rcParams['font.sans-serif'] = ['Arial']

    conditions = (
        (data_arr, 'r'),
        (data_shuf_arr, 'k'),
        (data_err_arr, 'b'),
    )
    row_means = [np.mean(values, axis=1) for values, _ in conditions]

    # All intervals are computed before any drawing, condition by
    # condition, so the random draws happen in a fixed order.
    bands = [
        [boot.ci(values[b, :], np.mean, n_samples=1000) for b in range(5)]
        for values, _ in conditions
    ]

    x_pos = np.arange(1, 6) + 0.5
    (fh, ax) = plt.subplots(1, 1, figsize=(5 / 2.54, 5 / 2.54), dpi=300)
    for (_, colour), band in zip(conditions, bands):
        ax.fill_between(x_pos,
                        [bounds[0] for bounds in band],
                        [bounds[1] for bounds in band],
                        color=colour,
                        alpha=0.2)

    for (_, colour), means in zip(conditions, row_means):
        ax.plot(x_pos, means, '-' + colour)

    ax.set_xlim((1, 6))
    ax.set_ylim((0.3, 0.8))
    ax.set_yticks((0.4, 0.6, 0.8))
    ax.set_xticks((1, 6))
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Classification accuracy')
    fh.savefig('4su_stp_decoding_{}.pdf'.format(file_desc),
               bbox_inches='tight')
    return fh
Пример #28
0
 def test_bca_n_samples(self):
     """Default-method intervals with n_samples=500 match stored values."""
     expected = np.array([0.40027628, 0.5063184, 0.94082515, 1.05653929])
     levels = (0.1, 0.2, 0.8, 0.9)

     np.random.seed(1234567890)
     observed = boot.ci(self.data, np.average, alpha=levels, n_samples=500)

     np.testing.assert_array_almost_equal(observed, expected)
Пример #29
0
 def test_abc_multialpha_unified(self):
     """ABC intervals for a weighted average match stored values."""
     def weighted_average(x, weights):
         return np.average(x, weights=weights)

     expected = np.array([0.39472915, 0.51161304, 0.93789723,
                          1.04407254])
     observed = boot.ci(self.data,
                        weighted_average,
                        alpha=(0.1, 0.2, 0.8, 0.9),
                        method='abc')
     np.testing.assert_array_almost_equal(observed, expected)
Пример #30
0
def calc_bootstrap(data):
    """Print and return the bootstrap confidence interval of the mean.

    ``bootstrap.ci`` is called with its default settings; the two bounds
    it returns are printed and the interval is returned unchanged.
    """
    # numpy.mean replaces the deprecated scipy (``sp``) alias of the same
    # function.
    import numpy as np

    # Calculate the bootstrap
    CIs = bootstrap.ci(data=data, statfunction=np.mean)

    # Print the data: the "*" unpacks the two bounds into the placeholders
    print('The confidence intervals for the mean are: {0} - {1}'.format(*CIs))

    return CIs
Пример #31
0
def calc_bootstrap(data):
    """Print and return the bootstrap confidence interval of the mean.

    ``bootstrap.ci`` is called with its default settings; the two bounds
    it returns are printed and the interval is returned unchanged.
    """
    # numpy.mean replaces the deprecated scipy (``sp``) alias of the same
    # function.
    import numpy as np

    # Calculate the bootstrap
    CIs = bootstrap.ci(data=data, statfunction=np.mean)

    # Print the data: the "*" unpacks the two bounds into the placeholders
    print('The confidence intervals for the mean are: {0} - {1}'.format(*CIs))

    return CIs
 def get_ci(data, ci):
     """Return the bootstrap interval of ``print_class`` over ``data``.

     ``ci`` is passed to ``bootstrap.ci`` as ``alpha`` and only 10
     resamples are drawn.  If the bootstrap fails for any reason the
     placeholder interval ``[-1.0, 1.0]`` is returned instead.
     """
     try:
         ci_vals = bootstrap.ci(data=data, alpha = ci,
                                statfunction=print_class,
                                n_samples = 10)
     except Exception:
         # Best effort, but let KeyboardInterrupt / SystemExit through.
         ci_vals = [-1.0,1.0]
     return ci_vals
Пример #33
0
def summSC(scfile):
    """Summarise and plot per-label error between observed and predicted cover.

    Builds a table of observed vs. predicted labels per image, converts the
    counts to percentages per sample unit and method, maps labels to their
    ``tier3_name``, and plots the mean absolute difference per label as a
    bar chart with bootstrap bounds.

    ``scfile`` is not used in the body; the inputs (``imlist``, ``gtlist``,
    ``estlist``, ``labelset``, ``sunits``, ``lsmap``) are read from the
    enclosing scope.  Nothing is returned.
    """

    #Create merged and summarised dataframe
    # ``pd.DataFrame.from(`` was a syntax error (``from`` is a keyword);
    # the plain constructor builds the same rows before transposing.  The
    # map is materialised so it also works on Python 3.
    df=pd.DataFrame([list(map(osp.basename, imlist)),
                     [labelset[i] for i in gtlist], [labelset[i] for i in estlist]]).T
    df.columns=['image','observed','predicted']
    df=df.merge(sunits,left_on='image',right_on='image', how='left')
    df=pd.melt(df, id_vars=['sampleunit','image'], 
               value_vars=['observed','predicted'],
               var_name='method', value_name='label')
    df=df.groupby(['sampleunit','method','label']).size().reset_index(name='count')
    df=df.groupby(['sampleunit','method','label']).agg({'count': 'sum'})
    # Convert counts to percentages within each (sampleunit, method).
    df=df.groupby(level=['sampleunit','method']).apply(lambda x: 100 * x / float(x.sum())).reset_index()
    df=df.merge(lsmap, on='label', how='left')

    # NOTE(review): dict-style renaming in a Series ``agg`` (here and
    # below) is rejected by recent pandas releases -- confirm the pandas
    # version this runs against.
    df=df.groupby(['sampleunit','method','tier3_name'])['count'].agg({'count':np.sum}).reset_index()
    df=df.rename(index=str, columns={"tier3_name": "label"})
    df=df.pivot_table(index=['sampleunit','label'], columns='method',
                      values='count').reset_index().fillna(value=0)

    df['error']=abs(df['observed']-df['predicted'])
    # NOTE(review): the bootstrap runs twice per label (once per bound), so
    # the low and high bounds come from different resamples.
    df=df.groupby('label')['error'].agg({'mean': np.mean, 
                                     'std': np.std, 
                                     'cilow': lambda x: bootstrap.ci(x, statfunction=scipy.mean)[0],
                                     'cimax':lambda x: bootstrap.ci(x, statfunction=scipy.mean)[1],
                                   }).reset_index()
    
    #Plot Mean Absolute Error as the absolute diference between machine predictions and manual observations from test images.
    # NOTE(review): ``yerr`` is given the absolute interval bounds, whereas
    # error bars are normally offsets from the mean -- confirm intent.
    cierror=[df['cilow'],df['cimax']]
    plot = df.plot(kind='bar',
               y='mean',
               x='label',
               yerr=cierror,
               color='DarkGreen',
               edgecolor='black',
               grid=False,
               figsize=(8,2),
               position=0.45,
               error_kw=dict(ecolor='black',elinewidth=0.5),
               width=0.8,
              legend=False,
               rot=90,
              fontsize=9)
    plot.set_xlabel('Labels', fontsize=12)
    plot.set_ylabel('Mean Absolute Error (%)', fontsize=12)
    # NOTE(review): 'labelcenter' is passed positionally here; verify it is
    # an accepted value for this call.
    plot.xaxis.set_tick_params('labelcenter')
Пример #34
0
 def test_pi_multialpha(self):
     """Percentile-interval bounds at four alphas match stored values."""
     expected = np.array([0.40351601, 0.51723236, 0.94547054,
                          1.05749207])
     levels = (0.1, 0.2, 0.8, 0.9)

     np.random.seed(1234567890)
     observed = boot.ci(self.data, np.average, method='pi', alpha=levels)

     np.testing.assert_array_almost_equal(observed, expected)
Пример #35
0
def quick_ci(g, x, fun=scipy.mean, alpha=0.05, n=200):
    """Summarise one group as a single-row DataFrame with a bootstrap CI.

    ``g`` must expose ``.name`` and ``.mean()`` (presumably a pandas
    group -- confirm against the caller).  The returned frame is indexed
    by ``g.name`` and has the columns ``x`` (holding ``g.name``),
    ``'mean'``, ``'low'`` and ``'high'``.

    Note: installs an "ignore" filter for ``bootstrap.InstabilityWarning``
    that stays active after the call returns.
    """
    import warnings
    warnings.simplefilter("ignore", bootstrap.InstabilityWarning)
    lower, upper = bootstrap.ci(
        data=g, statfunction=fun, alpha=alpha, n_samples=n)
    row = {x: g.name, 'mean': g.mean(), 'low': lower, 'high': upper}
    return pandas.DataFrame(row, index=[g.name])
Пример #36
0
def esci_regression_r(x, y, n_boot=5000):
    '''Pearson's r effect size with a bootstrap confidence interval.

    Parameters
    ----------
    x : np.ndarray
        Predictors, one- or two-dimensional. For one-dimensional input a
        plain Pearson correlation with ``y`` is used. Otherwise the
        regression t value of the last predictor column is converted to
        r via ``r_from_t`` (the remaining columns act as confounds).
    y : np.ndarray
        Dependent variable, one-dimensional.
    n_boot : int
        Number of bootstrap resamples passed to ``boot.ci``.

    Returns
    -------
    stats : dict
        Dictionary of results.
        * ``stats['es']`` contains effect size.
        * ``stats['ci']`` contains the confidence interval returned by
          ``boot.ci`` (called with its default alpha and method).
        * ``stats['bootstraps']`` contains bootstrap effect size values.
    '''
    from scipy.stats import pearsonr
    import scikits.bootstrap as boot

    if x.ndim == 1:
        # single predictor: ordinary correlation coefficient
        def effect_size(x, y):
            return pearsonr(x, y)[0]
    else:
        from borsar.stats import compute_regression_t

        # several predictors: regression t value of the last column,
        # converted to r
        def effect_size(x, y):
            tvals = compute_regression_t(y[:, np.newaxis], x)
            return r_from_t(tvals[-1, 0], x)

    r = effect_size(x, y)
    # NOTE: ``return_dist`` requires a scikits-bootstrap version that
    # supports it (the original author used their own branch).
    r_ci, bootstraps = boot.ci(
        (x, y), effect_size, multi=True, n_samples=n_boot,
        return_dist=True)
    return dict(bootstraps=bootstraps, es=r, ci=r_ci)
 def bootstrap(self):
     """
     Bootstrap the f1 measure over ``self.algorithm_results`` and print
     the resulting interval.

     The (key, value) pairs of ``self.algorithm_results`` are resampled
     10000 times with ``self.f1_bootstrap`` as the statistic. The first
     and second entries of the result are printed as the low and high
     bounds. A narrow interval suggests the sample size is sufficient.
     :return: None
     """
     samples = list(self.algorithm_results.items())
     interval = bootstrap.ci(
         data=samples, statfunction=self.f1_bootstrap, n_samples=10000)
     print(self.algorithm_name)
     low, high = interval[0], interval[1]
     print("Bootstrapped 95% confidence intervals for f1 \nLow:", low, "\nHigh:", high)
Пример #38
0
 def calculate(self, questionsNAnswers):
     """Bootstrap CI of the yes-answer rate, scaled by the corpus size.

     Raises ValueError if any question's ``answer()`` is None. Answers
     are mapped to 1.0 (truthy) or 0.0 (falsy). An all-zero sample
     short-circuits to ``[0., 0.]``; otherwise each bound of the
     bootstrap interval of the mean is multiplied by the number of words
     in the corpus.
     """
     answers = [q.answer() for q in questionsNAnswers]
     if None in answers:
         raise ValueError()
     sample = [1. if q.answer() else 0. for q in questionsNAnswers]
     all_zero = [0.] * len(questionsNAnswers)
     if sample == all_zero:
         # bootstrap fails if we pass all zeroes
         return [0., 0.]
     print(sample)
     percentageCI = bootstrap.ci(data=sample, statfunction=scipy.mean)
     return [bound * len(self.__corpora.words()) for bound in percentageCI]
Пример #39
0
def ci_eval(samples):
    """Return a single symmetric error estimate for mean(|samples|).

    ``boot.ci`` is asked (BCa method, 5000 resamples, errorbar output)
    for lower and upper error bars at a 1-sigma confidence level. The
    two are assumed to be nearly equal, so their plain average is
    returned.
    """
    def mean_abs(values):
        return np.average(np.abs(values))

    # alpha chosen so the interval covers 68.27%, i.e. 1 sigma
    one_sigma_alpha = 1. - 0.6827
    errorbars = boot.ci(samples,
                        statfunction=mean_abs,
                        alpha=one_sigma_alpha,
                        n_samples=5000,
                        method='bca',
                        output='errorbar')
    return np.average(errorbars)
def calc_bootstrap(data):
    ''' Bootstrap a confidence interval for the mean of *data*, print it and return it. '''

    # --- >>> START stats <<< ---
    # Resample the data, using the mean as the statistic
    interval = bootstrap.ci(data=data, statfunction=sp.mean)
    # --- >>> STOP stats <<< ---

    # Report the first two entries of the result as the interval bounds
    lower, upper = interval[0], interval[1]
    print(('The conficence intervals for the mean are: {0} - {1}'.format(lower, upper)))

    return interval
Пример #41
0
def calc_bootstrap(data):
    """Bootstrap a confidence interval for the mean of *data*.

    The interval is printed and then returned unchanged.
    """

    # --- >>> START stats <<< ---
    # Resample the data, using the mean as the statistic
    CIs = bootstrap.ci(statfunction=sp.mean, data=data)
    # --- >>> STOP stats <<< ---

    # Show the lower and upper bound (entries 0 and 1 of the result)
    message = f'The conficence intervals for the mean are: {CIs[0]} - {CIs[1]}'
    print(message)

    return CIs
Пример #42
0
def boot_bin_stack(data_bin, n_samples=3000):
    """Return ``(mean, ci, count)`` for one bin of data.

    ``count`` is ``data_bin.shape[0]``. With at most one entry both the
    mean and the interval are NaN. With more entries the mean is
    ``np.average(data_bin)`` and the interval comes from ``ci`` using
    ``n_samples`` resamples; passing ``n_samples=None`` skips the
    bootstrap and leaves the interval as ``[nan, nan]``.

    Note: every call installs a global "ignore" filter for all warnings.
    """
    warnings.filterwarnings("ignore")
    count = data_bin.shape[0]
    undefined = np.array([np.nan, np.nan])
    if count <= 1:
        return np.nan, undefined, count
    if n_samples is None:
        interval = undefined
    else:
        interval = ci(data_bin, n_samples=n_samples)
    return np.average(data_bin), interval, count
def calc_bootstrap(data):
    ''' Return (and print) a bootstrap confidence interval for the mean of *data*. '''

    # --- >>> START stats <<< ---
    # Resample the data, using the mean as the statistic
    bounds = bootstrap.ci(data=data, statfunction=sp.mean)
    # --- >>> STOP stats <<< ---

    # Fill the two placeholders with the lower and upper bound
    report = 'The conficence intervals for the mean are: {0} - {1}'.format(
        bounds[0], bounds[1])
    print(report)

    return bounds
Пример #44
0
def write_data(fn,data):
   """Compute descriptive statistics for *data* and write them to file *fn*.

   The report has two parts: point estimates (MUE and MSE with their
   standard errors scaled by 1.96, R^2 and Kendall tau taken from
   ``correls``), followed by bootstrap confidence intervals from
   ``boot.ci`` for MUE, MSE and the correlation statistics.

   ``fn`` is opened for writing (truncating any existing file). The file
   is managed by a ``with`` block so it is closed even when one of the
   statistics or bootstrap calls raises.
   """
   with open(fn,'w') as f:
      mue,muese = MUE(data)
      f.write("Errors are 95% CIs\n")
      # standard error * 1.96 gives the half-width of a normal 95% interval
      f.write("MUE = %5.3f +/- %5.3f\n" % (mue,muese*1.96))
      mse,msese = MSE(data)
      f.write("MSE = %5.3f +/- %5.3f\n" % (mse,msese*1.96))
      correldict = correls(data)
      f.write("R^2 = %3.2f\n" % correldict['r_value']**2)
      f.write("K-Tau = %3.2f\n\n" % correldict['tau'])
      f.write("BOOTSTRAPPED RESULTS (10k resamples, 95% CIs)\n")
      # CIs[0] holds the lower bounds and CIs[1] the upper bounds; the second
      # index picks one component of the statistic's return value.
      CIs = boot.ci(data,MUE)
      f.write("MUE = %5.3f < %5.3f < %5.3f\n" % (CIs[0][0],mue,CIs[1][0]))
      CIs = boot.ci(data,MSE)
      f.write("MSE = %5.3f < %5.3f < %5.3f\n" % (CIs[0][0],mse,CIs[1][0]))
      # NOTE(review): positions 2, 3 and 6 are assumed to be r, r^2 and tau
      # in the output of correls_for_bootstrap -- confirm against that helper.
      CIs = boot.ci(data,correls_for_bootstrap)
      f.write("Pearson's R = %3.2f < %3.2f < %3.2f\n" % (CIs[0][2],correldict['r_value'],CIs[1][2]))
      f.write("R^2 = %3.2f < %3.2f < %3.2f\n" % (CIs[0][3],correldict['r_value']**2,CIs[1][3]))
      f.write("K-Tau = %3.2f < %3.2f < %3.2f\n\n" % (CIs[0][6],correldict['tau'],CIs[1][6]))
Пример #45
0
def scalesHiddenPlot(name='scales'):
  matplotlib.rcParams.update({'font.size' : 20})
  lw = 3
  plt.hold(True)
  if name == 'scalesPlus':
    experimentName = 'Complex'
    nLevels = 3
    leveledFcData = turk.readScalesProblems('../../data/scale_plus_6stimuli_3levels_no_fam_24_january_SCAL.csv', name)
  elif name == 'scales':
    experimentName = 'Simple'
    nLevels = 2
    leveledFcData = turk.readScalesProblems('../../data/scales_6stimuli_3levels_no_fam_25_january_OSCA.csv', name)
  else:
    print '[forcedChoiceExperiments] Unknown experiment name: ', name
  sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
  nModels = 10 # numbered 1 to 10
  agents = [] # will be an array of arrays, one entry per hidden node, one entry per training iteration
  # load the agents
  for size in sizes:
    sizeAgents = []
    for agentNum in range(1,nModels + 1):
      (listeners, speakers) = loadAllAgents('../../data/cogsci/agents-%d-%d.pickle' % (size, agentNum))
      sizeAgents.append(listeners)
    agents.append(sizeAgents)
  for (levelProblems, lineColor) in zip(leveledFcData, colors):
    dataset = forcedChoiceProblemsToDataset(levelProblems)
    hiddenLayerAccuracies = []    
    hiddenLayerScores = []
    yerrs = []
    for (allListeners, size) in zip(agents, sizes): # for each # of hidden layers
      sizeAccuracies = [] # accuracies for each independent trial for this # of hidden nodes and this level of problem. will be averaged.
      sizeScores = []
      for listeners in allListeners:
        lastListener = listeners[3] 
        (correct, activations, scores) = evalListenerOnClassificationDataset(lastListener, dataset)
        sizeAccuracies.append(float(correct) / len(scores))
        sizeScores.append(scores)
      averageAccuracy = np.array(sizeAccuracies).mean()
      hiddenLayerAccuracies.append(averageAccuracy)
      hiddenLayerScores.append(sizeScores)
      interval = boot.ci(np.array(sizeScores), np.average) 
      err = (interval[1] - interval[0])/2.0
      yerrs.append(err)
    plt.errorbar(sizes, hiddenLayerAccuracies, yerr=yerrs, linewidth=lw, color=lineColor)
  plt.axis([0, sizes[-1], 0, 1])
  plt.title('ANN Accuracy on the %s Condition' % experimentName)
  plt.xlabel('Number of Hidden Nodes')
  plt.ylabel('Average Accuracy')
  plt.legend(['Level %d' % i for i in range(nLevels)], loc='lower right')
  plt.savefig('hidden%s.pdf' % name, format='pdf')
  plt.show()
Пример #46
0
def test_bootstrap():
    """Print the bootstrap error of a normal sample next to ``ci``'s result.

    Draws 1000 values from N(1, 1), prints their standard deviation, then
    prints the error obtained from the local ``bootstrap`` /
    ``calc_bootstrap_error`` helpers and the interval from
    ``scikits.bootstrap.ci`` for the median (third argument 0.32).
    """
    import numpy as np
    from scikits.bootstrap import ci

    data = np.random.normal(loc=1, scale=1, size=1000)
    print('std = %.2f' % data.std())

    resampled = bootstrap(data, 100)
    boot_error = calc_bootstrap_error(resampled, 0.32)
    boot_error_ci = ci(data, np.median, 0.32)

    for label, value in (('bootstrap error', boot_error),
                         ('bootstrap error ci', boot_error_ci)):
        print(label, value)
Пример #47
0
def test_bootstrap():
    """Compare the local bootstrap error estimate with ``scikits.bootstrap.ci``.

    A sample of 1000 values from N(1, 1) is generated and its standard
    deviation printed. The error from ``calc_bootstrap_error`` (on 100
    resamples from the local ``bootstrap`` helper) and the ``ci`` result
    for the median are then printed for visual comparison.
    """
    import numpy as np
    from scikits.bootstrap import ci

    data = np.random.normal(loc=1, scale=1, size=1000)
    print('std = %.2f' % data.std())

    boot_error = calc_bootstrap_error(bootstrap(data, 100), 0.32)
    boot_error_ci = ci(data, np.median, 0.32)

    print('bootstrap error', boot_error)
    print('bootstrap error ci', boot_error_ci)
 def bootstrap(self):
     """
     Print a bootstrap interval for the f1 measure of this algorithm.

     The items of ``self.algorithm_results`` are resampled 10000 times with
     ``self.f1_bootstrap`` as the statistic. The algorithm name is printed,
     followed by the first (low) and second (high) entry of the result.
     A narrow interval is more indicative of a sufficient sample size.
     :return: None
     """
     result_items = list(self.algorithm_results.items())
     bounds = bootstrap.ci(
         data=result_items,
         statfunction=self.f1_bootstrap,
         n_samples=10000,
     )
     print(self.algorithm_name)
     print("Bootstrapped 95% confidence intervals for f1 \nLow:", bounds[0],
           "\nHigh:", bounds[1])
Пример #49
0
def compute_ci(scores, exclude=()):
    """Bootstrap a CI for every pair of similarity measures per dataset.

    ``scores`` maps dataset -> {measure name -> (human_scores,
    sim_scores)}. Datasets listed in ``exclude`` are skipped entirely.
    For each unordered pair of measures, the human scores stored with the
    first measure and both measures' scores are zipped into triples and
    passed to ``bstrap.ci`` (BCa method) with ``statistic``.

    Returns a dict: dataset -> {(measure1, measure2) -> interval}.
    """
    cfs = {}
    for ds, items in scores.items():
        if ds not in exclude:
            pair_intervals = {}
            cfs[ds] = pair_intervals
            for sim1, sim2 in combinations(items, 2):
                print(f'Computing CI for {ds} - {sim1} : {sim2}')
                human_scores, sim_scores1 = items[sim1]
                _, sim_scores2 = items[sim2]
                triples = list(zip(human_scores, sim_scores1, sim_scores2))
                pair_intervals[(sim1, sim2)] = bstrap.ci(
                    triples, statfunction=statistic, method='bca')
    return cfs
Пример #50
0
def diffusion_tensor_ci(positions, orientations, lagtime=1, fps=1., ndim=3, **kwargs):
    """Bootstrap confidence bounds for the diffusion tensor.

    Displacements come from ``_compute_displ``. The bootstrapped statistic
    is the mean outer product of the displacement vectors, times 0.5,
    divided by the lag time returned by ``_compute_displ``. For
    ``ndim == 2`` only columns 0, 1 and 5 (x, y translation and z
    rotation) are kept. Extra keyword arguments go to ``bootstrap.ci``.

    Returns the ``bootstrap.ci`` result reshaped to ``(2, 3, 3)`` for
    ``ndim == 2`` and to ``(2, 6, 6)`` otherwise.
    """
    from scikits import bootstrap

    delta_tjn, all_xjn = _compute_displ(positions, orientations, lagtime, fps)
    planar = ndim == 2
    if planar:
        all_xjn = all_xjn[:, [0, 1, 5]]  # only x, y transl and z rot

    def tensor_estimate(x):
        outer = x[:, :, np.newaxis] * x[:, np.newaxis, :]
        return outer.mean(0).ravel() * 0.5 / delta_tjn

    bounds = bootstrap.ci(all_xjn, tensor_estimate, **kwargs)
    side = 3 if planar else 6
    return bounds.reshape((2, side, side))
Пример #51
0
def syntheticHiddenPlot():
  """Plot listener accuracy on the synthetic faces data by hidden-layer size.

  For each of the three instance levels (0, 1, 2) and each hidden-layer
  size, the listener at index 3 of each of the 10 stored agents is
  evaluated on the gold listener dataset. The mean accuracy is drawn with
  an error bar equal to half the width of the bootstrap interval of the
  scores. The figure is written to ``hiddenSynthetic.pdf`` and shown.
  """
  matplotlib.rcParams.update({'font.size' : 20})
  lw = 3
  plt.hold(True)
  levelInstances = [loadFacesInstances('../../data/facesInstances-%d.csv' % level) for level in [0,1,2]]
  sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
  nModels = 10 # agent files are numbered 1 to 10
  # agents[i] holds, for sizes[i], the listeners of every trained model
  agents = []
  for size in sizes:
    listenersForSize = []
    for agentNum in range(1, nModels + 1):
      picklePath = '../../data/cogsci/agents-%d-%d.pickle' % (size, agentNum)
      listeners, _speakers = loadAllAgents(picklePath)
      listenersForSize.append(listeners)
    agents.append(listenersForSize)
  # one curve per level; one point per hidden-layer size
  for instances, lineColor in zip(levelInstances, colors):
    dataset = goldListenerTrainingExamplesFromInstances(instances)
    meanAccuracies = []
    yerrs = []
    for allListeners in agents:
      trialAccuracies = [] # one accuracy per independently trained model
      trialScores = []
      for listeners in allListeners:
        correct, _activations, scores = evalListenerOnClassificationDataset(listeners[3], dataset)
        trialAccuracies.append(float(correct) / len(scores))
        trialScores.append(scores)
      meanAccuracies.append(np.array(trialAccuracies).mean())
      interval = boot.ci(np.array(trialScores), np.average)
      yerrs.append((interval[1] - interval[0])/2.0)
    plt.errorbar(sizes, meanAccuracies, yerr=yerrs, linewidth=lw, color=lineColor)
  plt.title('ANN Accuracy by Size of Hidden Layer')
  plt.axis([0, sizes[-1], 0, 1])
  plt.xlabel('Number of Hidden Nodes')
  plt.ylabel('Listener Accuracy')
  plt.legend(['Level 0', 'Level 1', 'Level 2'], loc='lower right')
  plt.savefig('hiddenSynthetic.pdf', format='pdf')
  plt.show()
Пример #52
0
def bootstrap_pce_regression(pts_filename, vals_filename,rv_trans,alpha=0.05,n_samples=3000):
    """Bootstrap the ``get_tsi`` output of a least-squares PCE fit.

    Points are read from the comma-separated file ``pts_filename``
    (expected as a num_dims x num_pts matrix) and values from
    ``vals_filename`` (reshaped to a num_pts x 1 column). Each bootstrap
    resample fits a polynomial chaos expansion by solving the normal
    equations and evaluates ``get_tsi(pce, qoi=0)``. The result of
    ``bootstrap.ci`` over that statistic is returned.
    """
    # Must be a ( num_dims x num_pts ) matrix
    pts = numpy.loadtxt( pts_filename, delimiter = ',' )
    # must be a ( num_pts x 1 ) vector
    vals = numpy.loadtxt( vals_filename, delimiter = ',' )
    vals = vals.reshape( vals.shape[0], 1 )

    def bootstrappable_pce_regression(pts,vals):
        # bootstrap hands over arrays of shape (N,...), whereas the PCE
        # code wants the sample axis last, hence the transpose
        pts = pts.transpose()
        num_dims, num_pts = pts.shape

        # largest degree whose basis size does not exceed the number of
        # points: step up from 2 until it is too big, then step back once
        degree = 2
        while nchoosek( degree + num_dims, num_dims ) <= num_pts:
            degree += 1
        degree -= 1

        # configure the expansion
        pce = PolynomialChaosExpansion()
        pce.set_random_variable_transformation( rv_trans )
        pce.define_isotropic_expansion( degree, 1. )

        # assemble the linear system for the normal equations
        V, build_vals = pce.build_linear_system( pts, vals, False )
        assert V.shape[1] <= V.shape[0]

        # least-squares coefficients via (V^T V) c = V^T b
        gram = numpy.dot( V.T, V )
        rhs = numpy.dot( V.T, build_vals )
        coeff = numpy.linalg.solve( gram, rhs )
        pce.set_coefficients( coeff.reshape( coeff.shape[0], 1 ) )
        return get_tsi(pce,qoi=0)

    samples = (pts.transpose(), vals)
    return bootstrap.ci(samples,bootstrappable_pce_regression,alpha=alpha,n_samples=n_samples,multi=True)
Пример #53
0
def scores_table(row_preds, row_names, y_true, score_func=matthews_corrcoef,
                 alpha=0.05):
    """Print one ``||``-delimited table row of scores per prediction set.

    For every ``(preds, name)`` pair, the entry chosen by ``best_setting``
    (using ``score_func``) is reported with accuracy, F1 and Matthews
    correlation, each as ``score (low-high)``. The bounds come from a BCa
    bootstrap with 5000 resamples at the given ``alpha``. A score of
    exactly 0 skips the bootstrap and is reported with bounds 0-0.
    """
    reported_scores = (accuracy_score, f1_score, matthews_corrcoef)
    for preds, name in zip(row_preds, row_names):
        best_idx = best_setting(preds, y_true, score_func)
        _settings, y_pred = preds[best_idx]
        cells = ["|| {} ||".format(name)]
        for report_score_func in reported_scores:
            score = report_score_func(y_true, y_pred)
            if score == 0:
                score_low, score_hi = 0, 0
            else:
                score_low, score_hi = ci((y_true, y_pred), report_score_func,
                                          alpha=alpha, n_samples=5000,
                                          method='bca')
            cells.append(
                "{:.2f} ({:.2f}-{:.2f}) ||".format(score, score_low, score_hi))
        print("".join(cells))
Пример #54
0
def syntheticPlot(allAccuracies, allScores, outFile, title, errorBars=False, overall=False):
  """Draw one accuracy curve per problem level and save it as a PDF.

     allAccuracies holds one sequence per level, with one accuracy per
     listener model (index 0 is the literal listener). allScores has the
     same layout but holds the raw scores used for the error bars.
     With errorBars, each point gets half the width of the bootstrap
     interval of its scores, or 0 when every score is non-zero.
     With overall, the last curve is labelled 'Overall'; otherwise each
     curve is labelled with its level number.
  """
  matplotlib.rcParams.update({'font.size' : 20})
  lw = 3
  plt.hold(True)
  for levelAccuracies, levelScores, lineColor in zip(allAccuracies, allScores, colors):
    if not errorBars:
      plt.plot(levelAccuracies, linewidth=lw, marker='o', color=lineColor)
      continue
    yerrs = []
    for scores in levelScores: # one entry per listener model
      scoreArray = np.array(scores)
      if scoreArray.all():
        yerrs.append(0)
        continue
      interval = boot.ci(scoreArray, np.average)
      yerrs.append((interval[1] - interval[0])/2.0)
    plt.errorbar(range(len(levelAccuracies)), levelAccuracies, yerr=yerrs, linewidth=lw, color=lineColor)
  nListeners = len(allAccuracies[0]) # number of models
  nLevels = len(allAccuracies) # types of problems
  plt.axis([0, nListeners - 1, 0, 1])
  plt.ylabel('Listener Accuracy')
  plt.xlabel('Training Iterations')
  namedLevels = nLevels - 1 if overall else nLevels
  legendTitles = ['Level %d' % level for level in range(namedLevels)]
  if overall:
    legendTitles.append('Overall')

  plt.legend(legendTitles, loc='lower right')
  plt.title(title)
  plt.savefig(outFile, format='pdf')
  plt.show()
Пример #55
0
# X_LOSO = rest_data[subject_subset,:]

# expVar(beh_keysfn, X_LOSO, Y_LOSO, penalty)

# LOSO_loadings = SCCA_r(X_LOSO, Y_LOSO, n_components, penalty)
# SCCA_Output_Sheet('SCCA_LOSO', region_labels_fn, beh_keysfn, subject_subset, X, Y, LOSO_loadings)
# np.save('SCCAloading_LOSO_long',LOSO_loadings)

#################LOSO COMPLETE#################
# Bootstrap step of the SCCA analysis. X, Y, SCCA_r, SCCA_Output_Sheet,
# region_labels_fn, beh_keysfn and subject_subset are defined earlier in
# the script (not visible in this fragment).
import scikits.bootstrap as boot
data = (X, Y)
def SCCA_boot(X,Y):
    """Statistic handed to ``boot.ci``: runs SCCA_r with 6 components and
    penalty (0.3, 0.5).

    NOTE(review): the computed loadings are discarded and True is always
    returned, so the interval below carries no information about the
    loadings -- confirm whether this is intentional.
    """
    loadings = SCCA_r(X,Y, 6, (0.3,0.5))
    
    return True
ci_test = boot.ci(data, statfunction=SCCA_boot) 

# The loadings actually used below are read from a saved array, not taken
# from ci_test.
boot_loadings = np.load(expanduser('bootstrap_all_comp_long.npy'))


SCCA_Output_Sheet('SCCA_Bootstrap_long', region_labels_fn, beh_keysfn, subject_subset, X, Y, boot_loadings)


from numpy import genfromtxt
# Per-task behavioural table: column 0 is read as the subject id, columns
# 1-13 go to data_CRT and columns 14 onwards to data_WM.
data_by_task = genfromtxt('Behavioural\\mwq_byTask.csv', delimiter=',',skip_header=1)
data_CRT = data_by_task[:,1:14]
data_WM = data_by_task[:,14:]
subject_subset = data_by_task[:, 0].astype('i4')
# Second entry of the saved bootstrap output.
loadings = boot_loadings[1]
# One row per subject, two columns per column of ``loadings``.
comp = np.zeros((len(data_CRT), loadings.shape[1]*2))
for i in range(loadings.shape[1]): 
# Plot bootstrap error bars of getRho for balanced +1/-1 samples of
# increasing size. ``stride``, ``getRho``, ``skbootstrap``, ``np`` and
# ``plt`` must already be defined (not visible in this fragment).
# NOTE(review): this is Python 2 code (print statement). ``np.float`` and
# float-valued sizes passed to ``np.ones`` are rejected by current NumPy.
high = 1000

fig = plt.figure(1, facecolor='white', figsize=(7,5.6))

trueProb = 0.5
# Sample sizes stride, 2*stride, ... (upper limit high+stride, exclusive).
xs = np.arange(stride, high+stride, stride)
# Every point is drawn at the true probability.
ys = np.ones_like(xs)*trueProb
yerr_low = np.zeros_like(xs, dtype=np.float)
yerr_high = np.zeros_like(xs, dtype=np.float)
i = 0
for x in xs:
    # Split the x samples into +1 and -1 entries according to trueProb.
    crossPos = x*trueProb
    crossNeg = x-crossPos
    transitions = np.concatenate([np.ones(crossPos), np.ones(crossNeg)*-1],
            axis=0)
    # Percentile-interval bootstrap, returned as error-bar lengths.
    CI = skbootstrap.ci(data=transitions, statfunction=getRho, 
            output='errorbar', n_samples=10000, method='pi') 
    print x, CI
    # Row 0 is stored as the lower error, row 1 as the upper error.
    yerr_low[i] = CI[0,0]
    yerr_high[i] = CI[1,0]
    i+=1

ax1 = fig.add_subplot(111)
ax1.margins(0,0.05)
ax1.errorbar(xs, ys, yerr=[yerr_low, yerr_high], ecolor='r', 
        color='k', fmt='o', elinewidth=2, capthick=2)
ax1.set_ylabel(r'Probability')
ax1.set_xlabel(r'Number of Samples')
ax1.set_xlim([0,high])
ax1.set_ylim([0,1])

Пример #57
0
        continue
    if topic == 'all':
        break
    if measure not in sc:
        sc[measure] = {}
    sc[measure][topic] = float(score)

# For every measure, print the mean of the per-topic scores together with
# four confidence intervals (t, bootstrap-t, bootstrap percentile and
# scikits.bootstrap's default), plus each interval's shape and coverage as
# computed by the ``ci`` helper module.
for measure in measures:
    # The builtin ``float`` is the dtype here: the ``np.float`` alias was
    # removed from NumPy and raises AttributeError on current releases.
    values = np.fromiter(sc[measure].values(), float)
    mean = values.mean()

    # Student-t interval for the mean
    lo, hi = ci.t_ci_mean(values)
    shap = ci.shape(mean, lo, hi)
    cover = ci.coverage(values, ci.t_ci_mean)
    print('{} t {:.4f} [{:.4f},{:.4f}] {:.2f} {:.3f}'.format(measure, values.mean(), lo, hi, shap, cover))

    # bootstrap-t interval
    lo, hi = ci.bootstrap_t_ci_mean(values)
    shap = ci.shape(mean, lo, hi)
    cover = ci.coverage(values, ci.bootstrap_t_ci_mean)
    print('{} bootstrap_t {:.4f} [{:.4f},{:.4f}] {:.2f} {:.3f}'.format(measure, values.mean(), lo, hi, shap, cover))

    # bootstrap percentile interval
    lo, hi = ci.bootstrap_pct_ci_mean(values)
    shap = ci.shape(mean, lo, hi)
    cover = ci.coverage(values, ci.bootstrap_pct_ci_mean)
    print('{} bootstrap_pct {:.4f} [{:.4f},{:.4f}] {:.2f} {:.3f}'.format(measure, values.mean(), lo, hi, shap, cover))

    # scikits.bootstrap interval with 2000 resamples (library defaults otherwise)
    lo, hi = bootstrap.ci(values, n_samples=2000)
    shap = ci.shape(mean, lo, hi)
    cover = ci.coverage(values, lambda x: bootstrap.ci(x, n_samples=2000))
    print('{} sk.bs-bca {:.4f} [{:.4f},{:.4f}] {:.2f} {:.3f}'.format(measure, values.mean(), lo, hi, shap, cover))
def mean_confidence_interval(data, confidence=0.95):
    """Return ``(mean, lower, upper)`` of a Student-t interval for the mean.

    ``data`` is converted to a float array. The half-width is the standard
    error of the mean times the two-sided t quantile for ``confidence``
    with ``n - 1`` degrees of freedom.

    Uses the public ``scipy.stats.t.ppf`` through a single module alias;
    the earlier code mixed the ``scipy`` and ``sp`` aliases and called the
    private ``_ppf`` method.
    """
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    # (1 + confidence) / 2 is the upper-tail quantile of a two-sided interval
    h = se * scipy.stats.t.ppf((1+confidence)/2., n-1)
    return m, m-h, m+h

# Parse the first line of ``infile`` into a (queries, conditions, samples)
# array, bootstrap each (query, condition) cell and save the intervals as
# CSV. ``infile``, ``queries`` and ``conditions`` are defined elsewhere.
# NOTE(review): ``str.translate(None, '(')`` is Python 2 only.
rnm = infile
rg = open(rnm)
line = rg.readline()
rg.close()
# Drop the last four characters, then split into one chunk per query.
line1 = line[:-4].split(')) ')
res = []
# Number of space-separated samples in the first condition of the first query.
n_samples = len(line1[0].split(') (')[0].split(' '))
fnl = np.zeros((queries,conditions,n_samples))
for a,i in enumerate(line1): 
    for b,j in enumerate(i.split(') (')): # one chunk per condition
        fnl[a,b,:] = np.array(j.translate(None,'(').split(' ')) # strip '(' and split into samples


# NOTE(review): this array is replaced by the empty list on the next line.
fnlc = np.zeros((fnl.shape[0:2]))
fnlc = []

# organize as 1-6 Strength, 7-12 Luck
# One bootstrap interval per (query, condition), appended in row-major order.
for a in range(queries):
    for b in range(conditions):
        fnlc.append(bootstrap.ci(fnl[a,b]))

# Output name: input name minus its last 8 characters, plus '_95ci.csv'.
np.savetxt((rnm[:-8]+'_95ci.csv'), fnlc, delimiter=',')
def sliding_median_iqr(neighbors, random=None, compute_random=0, window=1000, p0=None):
    """
    Compute sliding median of spearmanr and size, interquartile range 
    and 95% CI of spearmanr of randomly paired genes

    Parameters
    ----------
    neighbors: neighboring gene pairs dataframe
    window: size of window for sliding median

    Returns
    -------
    rolling_median: sliding median of spearmanr and size with IQR for spearmanr
    median and 95% confidence interval of median from random pairs

    """
    #load dataframe if not provided yet
    if isinstance(neighbors , basestring): 
        neighbors  = pd.read_csv(neighbors)
    if compute_random and isinstance(random , basestring): 
        random  = pd.read_csv(random)

    # sort by size to do sliding window with increasing intergenic distance
    # nans cause error in sliding median
    neighbors  = neighbors.sort('size').dropna()

    print 'computing sliding median...'
    # compute rolling medians. 1000 looks good, less is unnecesserily heavy and noisy.
    rolling_median_spearmanr = pd.rolling_median(neighbors.spearmanr, window)

    print 'computing IQR...'
    # compute interquartile range (IQR). Top 75% and bottom 25%.
    rolling_spearmanr_q1 =  - pd.rolling_quantile(neighbors.spearmanr, window, 0.25) + \
            rolling_median_spearmanr 
    rolling_spearmanr_q3 = pd.rolling_quantile(neighbors.spearmanr, window, 0.75) - \
            rolling_median_spearmanr 
    rolling_median_size = pd.rolling_median(neighbors['size'], window)/1000

    # put it all together
    rolling_median_s = pd.DataFrame({'spearmanr': rolling_median_spearmanr, 
        'size':rolling_median_size, 'q1': rolling_spearmanr_q1, 'q3': rolling_spearmanr_q3})

    # drop all nans from sliding median (first 1000 because of window)
    rolling_median_s = rolling_median_s.dropna()

    # reindex is necessary
    rolling_median_s.index = np.arange(len(rolling_median_s))

    if compute_random:
        print 'computing random pairs median CI'
        # compute 95% confidence interval of median in random pairs
        ci_median = bs.ci(random.spearmanr.dropna().loc[:20000], np.median)
        rolling_median_s['random_lci'] = ci_median[0]
        rolling_median_s['random_hci'] = ci_median[1]

    print 'fitting to exp decay...'
    popt_s, pcov_s = curve_fit(exp_decay, rolling_median_s['size'], 
            rolling_median_s.spearmanr, p0=p0)

    rolling_median_s['popt1'] = popt_s[0]
    rolling_median_s['popt2'] = popt_s[1]
    rolling_median_s['popt3'] = popt_s[2]

    print 'done'
    return rolling_median_s