Пример #1
1
def histo_pdfs(x_test,y_test,x_train=None,y_train=None):
  """
  Plot the histograms of the training and test set superimposed with PDFs.

  For every feature (column of ``x_test``) the samples are split by class
  (``y_test.NumType``), drawn as normalised step-filled histograms, and a
  Gaussian kernel density estimate of each class is overlaid.  When a
  training set is supplied it is drawn the same way with dashed KDE lines.

  Parameters
  ----------
  x_test : pandas.DataFrame
      Test features, one column per feature.
  y_test : pandas.DataFrame
      Test labels with ``NumType`` (numeric class id) and ``Type`` (class
      name) columns, indexed like ``x_test``.
  x_train, y_train : pandas.DataFrame, optional
      Training features / labels with the same layout.  The training set is
      plotted only when ``y_train`` is given.

  Notes
  -----
  Class ids are used directly as indices into the two-entry colour tuples
  and into the class-name list, so they are assumed to be 0 and 1.
  Every feature is drawn on the single figure created here.
  """
  from scipy.stats import gaussian_kde

  # Explicit None test: the truth value of a DataFrame is ambiguous and raises.
  has_train = y_train is not None

  # Real lists (not lazy map objects) because they are indexed and re-iterated.
  num_t = [int(t) for t in np.unique(y_test.NumType.values.ravel())]
  str_t = [str(t) for t in np.unique(y_test.Type.values.ravel())]

  fig = plt.figure()
  fig.set_facecolor('white')

  # Kept separate so the histogram colours stay the same for every feature.
  hist_colors = ('k','w')
  line_colors = ('k','y')
  c_train = ('b','g')

  for feat in x_test.columns:
    hist, g = [], {}
    hist_train, g_train, lab_tr = [], {}, []

    if has_train:
      # Common range of both sets so histograms and PDFs share one x axis.
      mini = np.min([x_test[feat].min(),x_train[feat].min()])
      maxi = np.max([x_test[feat].max(),x_train[feat].max()])
    else:
      mini = x_test[feat].min()
      maxi = x_test[feat].max()

    # Same output on Python 2 and Python 3.
    print('%s %s' % (mini, maxi))
    bins_hist = np.linspace(mini,maxi,25)
    bins = np.linspace(mini,maxi,200)

    for i in num_t:
      index = y_test[y_test.NumType.values==i].index
      x_plot = x_test.reindex(columns=[feat],index=index).values
      hist.append(x_plot)
      g[i] = gaussian_kde(x_plot.ravel())(bins)
      if has_train:
        lab_tr.append('%s (train)'%str_t[i])
        index = y_train[y_train.NumType.values==i].index
        x_plot = x_train.reindex(columns=[feat],index=index).values
        hist_train.append(x_plot)
        g_train[i] = gaussian_kde(x_plot.ravel())(bins)

    plt.hist(hist,bins=bins_hist,color=hist_colors,normed=1,histtype='stepfilled',alpha=.2,label=str_t)
    if has_train:
      plt.hist(hist_train,bins=bins_hist,color=c_train,normed=1,histtype='stepfilled',alpha=.2,label=lab_tr)

    for key in sorted(g):
      plt.plot(bins,g[key],color=line_colors[key],lw=2.)
      if has_train:
        plt.plot(bins,g_train[key],color=c_train[key],lw=1.,ls='--')

    plt.legend(loc=2)
    plt.xlabel(feat)
Пример #2
0
def count_to_kde(D, KDE_SENSORS):
	"""Replace per-sensor value counts with a Gaussian KDE, in place.

	``D`` maps an id to a dict of sensors, each holding a ``{value: count}``
	dict.  For every sensor listed in ``KDE_SENSORS`` the counts are expanded
	into roughly 1000 weighted samples and a ``gaussian_kde`` is fitted.

	Args:
		D: nested dict ``{id: {sensor: {value: count}}}``; modified in place.
		KDE_SENSORS: container of sensor names to convert.

	Returns:
		The same ``D``.  Each converted sensor holds ``(density, maxval)``,
		where ``maxval`` is the highest density found at any observed value,
		or ``None`` when there was no usable training data.
	"""
	for sensors in D.values():
		for sensor in sensors:
			if sensor not in KDE_SENSORS:
				continue
			counts = sensors[sensor]
			totvals = float(sum(counts.values()))
			training = []
			if totvals > 0:  # all-zero counts would otherwise divide by zero
				for key, count in counts.items():
					# each value is repeated in proportion to its share of the total
					training += [key] * int(1000 * count / totvals)
			if not training:
				# we don't have sufficient training data
				sensors[sensor] = None
				continue
			try:
				density = kde.gaussian_kde(training)
			except Exception:
				# Most likely a singular matrix (e.g. [n, n, ..., n]).
				# Add a little 'noise' to break the singularity.
				training += [0.01]
				density = kde.gaussian_kde(training)
			maxval = None
			for key in counts:
				dk = density(key)[0]
				if maxval is None or maxval < dk:
					maxval = dk
			# Only existing keys are reassigned, so iterating `sensors` stays valid.
			sensors[sensor] = (density, maxval)
	return D
Пример #3
0
def graph_FWHM_data_range(start_date=datetime.datetime(2015,3,6),
                          end_date=datetime.datetime(2015,4,15),tenmin=True,
                          path='/home/douglas/Dropbox (Thacher)/Observatory/Seeing/Data/',
                          write=True,outpath='./'):
    """
    Description:
    ------------
    Plot a histogram of the FWHM values between two dates and annotate it
    with the mode, median, mean and the narrowest interval holding 68.27% of
    the kernel-density-estimated distribution.

    Parameters:
    -----------
    start_date, end_date : datetime.datetime
        Range passed on to get_FWHM_data_range.
    tenmin : bool
        Passed on to get_FWHM_data_range.
    path : str
        Data directory passed on to get_FWHM_data_range.
    write : bool
        When true, save the figure as 'Seeing_Cumulative.png' in ``outpath``.
    outpath : str
        Prefix (directory, with trailing separator) for the saved figure.
    """
    plot_params()
    fwhm = get_FWHM_data_range(start_date = start_date, end_date=end_date, path=path, tenmin=tenmin)

    # Basic stats
    med = np.median(fwhm)
    mean = np.mean(fwhm)

    # Get mode using kernel density estimation (KDE); the same PDF is reused
    # below for the cumulative distribution instead of being fitted twice.
    vals = np.linspace(0,30,1000)
    pdf = gaussian_kde(fwhm)(vals)
    mode = vals[np.argmax(pdf)]

    plt.ion()
    plt.figure(99)
    plt.clf()
    plt.hist(fwhm, color='darkgoldenrod',bins=35)
    plt.xlabel('FWHM (arcsec)',fontsize=16)
    plt.ylabel('Frequency',fontsize=16)
    plt.annotate('mode $=$ %.2f" ' % mode, [0.87,0.85],horizontalalignment='right',
                 xycoords='figure fraction',fontsize='large')
    plt.annotate('median $=$ %.2f" ' % med, [0.87,0.8],horizontalalignment='right',
                 xycoords='figure fraction',fontsize='large')
    plt.annotate('mean $=$ %.2f" ' % mean, [0.87,0.75],horizontalalignment='right',
                 xycoords='figure fraction',fontsize='large')

    # Inverse of the numerically integrated CDF: cumulative fraction -> FWHM.
    dist_c = np.cumsum(pdf)/np.sum(pdf)
    func = interp1d(dist_c,vals,kind='linear')

    # Slide a window covering 68.27% of the probability along the CDF and keep
    # the narrowest one.
    disthi = np.linspace(.684,.999,100)
    distlo = disthi-0.6827
    interval = np.min(func(disthi)-func(distlo))

    # Raw string so that "\s" is not treated as an escape sequence.
    plt.annotate(r'1 $\sigma$ int. $=$ %.2f" ' % interval, [0.87,0.70],horizontalalignment='right',
                 xycoords='figure fraction',fontsize='large')

    plt.rcdefaults()

    # Honour the `write` flag, which was previously accepted but ignored.
    if write:
        plt.savefig(outpath+'Seeing_Cumulative.png',dpi=300)

    return
Пример #4
0
def IndivisualDistributionsPlot():
    """Save one 2x2 figure per subject comparing training and baseline bands.

    For each group of the database (grouped by its index) four panels are
    drawn, one per frequency band.  Each panel shows normalised histograms of
    the training and baseline values, their Gaussian KDE curves, and, when
    the subject is present in the rest data, dashed vertical lines at the
    mean "Before" and "After" rest values.  The figure is written to a
    hard-coded directory as ``<name>.png``.
    """
    out_dir = '/Users/ryszardcetnarski/Desktop/Distributions/'
    plt.style.use('ggplot')
    database = LoadDatabase()
    rest = prep.Load_rest()
    bandwidth = 0.8
    band_names = ['all_spectrum', 'alpha', 'beta1', 'beta2']

    for name, subject in database.groupby(database.index):
        fig = plt.figure()
        fig.suptitle(name, fontweight ='bold')

        for idx, band in enumerate(band_names):
            panel = fig.add_subplot(2, 2, idx + 1)

            training = ExtractBands(subject, 'training', band)
            baseline = ExtractBands(subject.dropna(subset = ['baseline_bands']), 'baseline', band)

            training_pdf = gaussian_kde(training, bandwidth)
            baseline_pdf = gaussian_kde(baseline, bandwidth)

            panel.hist(training, alpha = 0.2, normed = True, color = 'blue')
            panel.hist(baseline, alpha = 0.2, normed = True, color = 'yellow')

            if name not in rest:
                # Subjects without rest data are only reported on stdout.
                print(name)
            else:
                panel.axvline(rest[name]['Before'].loc[band].mean(), color = 'b', linestyle = 'dashed', linewidth = 2, label = 'rest przed')
                panel.axvline(rest[name]['After'].loc[band].mean(), color = 'r', linestyle = 'dashed', linewidth = 2, label = 'rest po')

            # Evaluate the KDEs slightly beyond the current x limits.
            lower, upper = panel.get_xlim()
            grid = np.linspace(lower - 1, upper + 1, 100)

            panel.plot(grid, training_pdf(grid), color = 'blue', label ='dystrybucja trening')
            panel.plot(grid, baseline_pdf(grid), color = 'yellow', label ='dystrybucja baseline')

            panel.set_title(band)
            if idx == 3:
                # A single legend, placed on the last panel.
                panel.legend(loc = 'best')

        fig.savefig(out_dir + name + '.png', dpi = 400)

    # NOTE(review): runs once, after every figure has already been saved.
    plt.tight_layout()
Пример #5
0
 def analyse(self):
    """Analyse: estimate the pdf of the frequencies and find its peaks.

    Depending on ``self.transpose`` ("Yes" or "No") the KDE is fitted on the
    transposed frequencies (``self.transmode()``) or on ``self.freq``, with
    NaN values removed.  ``self.pdf`` ends up holding the density evaluated
    on ``self.x``; ``self.peaks()`` is then called.

    Raises:
        ValueError: if ``self.transpose`` is neither "Yes" nor "No".
    """
    if self.transpose=="Yes":
        self.freqtransmode = self.transmode()
        # %-formatting prints the same text on Python 2 and Python 3.
        print("%s %s" % (self.file_name, "(transposed)"))
        samples = self.freqtransmode
    elif self.transpose=="No":
        print("%s %s" % (self.file_name, "(not transposed)"))
        samples = self.freq
    else:
        # Previously fell through and called a stale or missing self.pdf.
        raise ValueError('transpose must be "Yes" or "No", got %r' % (self.transpose,))

    estimator = gaussian_kde(samples[~numpy.isnan(samples)],self.bw_method)
    self.pdf = estimator(self.x)

    self.peaks()
Пример #6
0
def mutualInformation(X,Y):
    """Approximate the mutual information between X and Y with Gaussian KDEs.

    A KDE is fitted to X, to Y and to the stacked pair [X, Y].  Each density
    is evaluated at its own sample points and passed to ``entropy``; the
    result is entropy(X) + entropy(Y) - entropy(X, Y).
    """
    joint = [X, Y]
    # Marginal terms: each variable's KDE evaluated on its own samples.
    marginal_terms = [entropy(gaussian_kde(sample).evaluate(sample)) for sample in (X, Y)]
    # Joint term: 2-D KDE evaluated on the paired samples.
    joint_term = entropy(gaussian_kde(joint).evaluate(joint))
    return sum(marginal_terms) - joint_term
	def trainMLE(self,trainingDict):
		"""Fit one KDE per variable plus a joint KDE from training samples.

		``trainingDict`` maps each variable name in ``self.varorder`` to its
		sequence of samples.  The per-variable estimators are stored in
		``self.singlekdes`` and the joint one in ``self.jointkde``.
		"""
		# One row per variable, in self.varorder order.
		samples = np.array([trainingDict[v] for v in self.varorder])
		for variable in self.variables:
			# row of the current variable
			row = self.varorder.index(variable)
			# rows of the parent variables (resolved and sorted, not used further here)
			parent_rows = sorted(self.varorder.index(j) for j in self.variables if j in self.parents[variable])
			self.singlekdes[variable] = gaussian_kde(samples[row])
		# now we need to know how to use the joint probability distribution!
		self.jointkde = gaussian_kde(samples)
Пример #8
0
 def calc_dens(self):
    """Write KDE curves of the peak and norm samples to a tab-separated file.

    A KDE is fitted to ``self.np_peak`` and to ``self.np_norm`` (the latter
    with a bandwidth factor of 0.05 divided by its sample standard
    deviation).  After each fit the array is capped in place at
    ``self.x_lim2``.  Both densities are evaluated on 100 points between 0
    and ``self.x_lim2`` and saved to ``self.outXls``.
    """
    limit = self.x_lim2

    # Fit first, cap afterwards: the statement order is kept as it was.
    peak_density = gaussian_kde(self.np_peak)
    too_large = self.np_peak > limit
    self.np_peak[too_large] = limit

    norm_bandwidth = 0.05 / self.np_norm.std(ddof=1)
    norm_density = gaussian_kde(self.np_norm, bw_method=norm_bandwidth)
    too_large = self.np_norm > limit
    self.np_norm[too_large] = limit

    grid = np.linspace(0, limit, 100)
    table = pd.DataFrame({ 'x':grid , 'y_peak':peak_density(grid),  'y_norm':norm_density(grid) })
    table.to_csv(self.outXls, sep="\t")
Пример #9
0
def plot_histos_and_pdfs_kde(axHistx,axHisty,bins_x,bins_y,x_test,y_test,x_train=None,y_train=None):
    """
    Histograms and KDE PDFs

    Draw the marginal distributions of the first two features of ``x_test``
    on two axes: per-class normalised histograms plus Gaussian KDE curves.
    When a training set is given, its KDE curves are added as dashed lines.

    Parameters
    ----------
    axHistx, axHisty : axes-like objects providing ``hist`` and ``plot``
        Receive the first feature (vertical) and the second feature
        (horizontal orientation) respectively.
    bins_x, bins_y : array-like
        Histogram bin edges, also used as the KDE evaluation grid.
    x_test, y_test : pandas.DataFrame
        Features and labels; ``y_test`` needs ``Type`` and ``NumType``
        columns.  Class ids are assumed to be 0 .. NB_class-1.
    x_train, y_train : pandas.DataFrame, optional
        Training set with the same layout; used only when ``y_train`` is given.
    """
    # Explicit None test: the truth value of a DataFrame is ambiguous and raises.
    has_train = y_train is not None

    x_hist, y_hist = [],[]
    g_x, g_y = {}, {}
    g_x_train, g_y_train = {}, {}

    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    NB_class = len(np.unique(y_test.Type.values))

    # Palettes chosen up front so a single-class input no longer hits an
    # unbound `colors`; two- and three-class colours are unchanged.
    if NB_class > 2:
        hist_colors = ('k','gray','w')
        line_colors = ('y','orange','r')
    else:
        hist_colors = ('k','w')
        line_colors = ('y','r')

    for i in range(NB_class):
        index = y_test[y_test.NumType.values==i].index
        x1 = x_test.reindex(columns=[feat_1],index=index).values
        x2 = x_test.reindex(columns=[feat_2],index=index).values
        x_hist.append(x1)
        y_hist.append(x2)
        g_x[i] = gaussian_kde(x1.ravel())(bins_x)
        g_y[i] = gaussian_kde(x2.ravel())(bins_y)
        axHisty.hist(x2,bins=bins_y,color=hist_colors[i],normed=1,orientation='horizontal',histtype='stepfilled',alpha=.5)
        if has_train:
            index = y_train[y_train.NumType.values==i].index
            x1 = x_train.reindex(columns=[feat_1],index=index).values
            x2 = x_train.reindex(columns=[feat_2],index=index).values
            g_x_train[i] = gaussian_kde(x1.ravel())(bins_x)
            g_y_train[i] = gaussian_kde(x2.ravel())(bins_y)
    # One colour per data set passed to hist.
    axHistx.hist(x_hist,bins=bins_x,color=hist_colors[:NB_class],normed=1,histtype='stepfilled',alpha=.5)

    for key in sorted(g_x):
        axHistx.plot(bins_x,g_x[key],color=line_colors[key],lw=2.)
        axHisty.plot(g_y[key],bins_y,color=line_colors[key],lw=2.)
        if has_train:
            axHistx.plot(bins_x,g_x_train[key],color=line_colors[key],lw=1.,ls='--')
            axHisty.plot(g_y_train[key],bins_y,color=line_colors[key],lw=1.,ls='--')
Пример #10
0
  def compute_pdfs(self):

    """
    Compute the Probability Density Functions (PDFs) for all features and all event types.

    Results are stored in ``self.gaussians[feat]``:
      * ``'vec'``: 200 evaluation points spanning the feature's min..max
        over all events;
      * one entry per event type with more than one sample: the Gaussian KDE
        evaluated on ``vec`` and normalised to sum to 1.  For the feature
        named 'NbPeaks' the raw values are stored instead.
    ``self.types`` is set to the sorted unique values of ``self.y.Type``.
    """

    # Public import location; scipy.stats.kde is a deprecated private module.
    from scipy.stats import gaussian_kde

    self.types = np.unique(self.y.Type.values)

    # Feature rows split by event type.
    per_type = {}
    for t in self.types:
      per_type[t] = self.x[self.y.Type==t]

    self.gaussians = {}
    for feat in self.opdict['feat_list']:
      column = self.x[feat]
      vec = np.linspace(column.min(),column.max(),200)

      feat_pdfs = {'vec': vec}
      for t in self.types:
        values = per_type[t][feat].values
        if len(values) <= 1:
          # A KDE needs at least two samples; such types get no entry.
          continue
        if feat == 'NbPeaks':
          feat_pdfs[t] = values
        else:
          density = gaussian_kde(values)(vec)
          feat_pdfs[t] = density/density.sum()
      self.gaussians[feat] = feat_pdfs
Пример #11
0
 def make_plot(self):
     """Plot the KDE of log10(FPKM) for every sample of the experiment.

     For each sample in ``self.exp.sample_set`` the first data field is taken
     from its dataframe, non-positive values are dropped, and a Gaussian KDE
     of the log10 values is drawn as a line with a translucent fill.

     Returns:
         The matplotlib figure.
     """
     c_map = ['#268bd2', '#cb4b16',]
     fig = plt.figure()
     fig.patch.set_alpha(0)
     ax = fig.add_subplot(111)

     ax.set_xlabel('log$_{10}$(FPKM)')
     ax.set_ylabel('Density')
     ax.title.set_fontsize(18)

     for i,sample in enumerate(self.exp.sample_set.all()):
         # Cycle through the palette so more than two samples do not raise
         # IndexError.
         color = c_map[i % len(c_map)]
         values = self.get_dataframe(sample)[self.data_fields[0]]
         # log10 is undefined for values <= 0
         values = values[values > 0].map(math.log10)
         base = np.linspace(min(values), max(values), 200)
         kde_pdf = gaussian_kde(values).evaluate(base)
         ax.plot(base, kde_pdf,
             color=color,
             label=sample.sample_name,
             alpha=0.8)
         ax.fill_between(base, kde_pdf, color=color, alpha=0.4)
     ax.legend()
     rstyle(ax)
     return fig
Пример #12
0
def kde(freqs):
    """Evaluate a Gaussian kernel density estimate of *freqs*.

    The estimator uses the module-level bandwidth setting ``bw_method`` and
    is evaluated on the module-level grid ``x``.

    Args:
        freqs (numpy.ndarray) : A list of frequencies in Hz.

    Returns:
        pdf (numpy.ndarray) : the estimated density at each point of ``x``.

    Example:
        >>> from music22 import diastema,scale
        >>> import matplotlib.pyplot as plt

        >>> file_path = "/Users/anas/AUDIO/Barraq/txt/P0.txt"
        >>> freqs = numpy.loadtxt(file_path)
        >>> freqs = music22.core.clean_list(freqs)
        >>> pdf = music22.scale.kde(freqs)
        >>> plt.plot(pdf)
        >>> plt.show()
    """
    # x and bw_method are only read, so no `global` declaration is needed.
    estimator = gaussian_kde(freqs, bw_method)
    return estimator.evaluate(x)
Пример #13
0
def FWHM_stats(data,all=True,clip=False):
    """
    Description:
    ------------
    Return basic FWHM stats as [mean, median, mode, std, sigma-clipped mean].

    The FWHM values come from FWHM_all(data) when ``all`` is true, otherwise
    from FWHM_ave(data), which receives ``clip`` when that is truthy.  The
    mode is the maximum of a Gaussian KDE evaluated on 1000 points in [0, 30].
    """
    if all:
        fwhm = FWHM_all(data)
    else:
        fwhm = FWHM_ave(data,clip=clip) if clip else FWHM_ave(data)

    # Plain statistics
    median_value = np.median(fwhm)
    mean_value = np.mean(fwhm)

    # Mean after 3-sigma clipping on both sides
    clipped, _low, _high = sigmaclip(fwhm,low=3,high=3)
    clipped_mean = np.mean(clipped)

    # Mode from the kernel density estimate
    grid = np.linspace(0,30,1000)
    density = gaussian_kde(fwhm)(grid)
    mode_value = grid[np.argmax(density)]

    spread = np.std(fwhm)

    return [mean_value,median_value,mode_value,spread,clipped_mean]
Пример #14
0
def density_at_points(data):
    """Estimate, with a Gaussian KDE, the density at every point of a dataset.

    Handy for colouring the points of a scatterplot by local density so that
    crowded regions stand out.

    Parameter:
        data: array of shape (n_data_points, n_dimensions)

    Returns:
        densities: array of shape (n_data_points)

    Example:
        import numpy
        import matplotlib.pyplot as plt
        # prepare some data
        mode1 = numpy.random.multivariate_normal(mean=[0, 0], cov=[[4, 1], [1, 7]], size=300)
        mode2 = numpy.random.multivariate_normal(mean=[8, 8], cov=[[2, 1], [1, 1]], size=300)
        data = numpy.concatenate([mode1, mode2], axis=0)

        # calculate the densities
        density = density_at_points(data)

        # plot the data
        plt.scatter(data[:,0], data[:,1], s=12, c=density, cmap='inferno')
    """
    # gaussian_kde expects one row per dimension, hence the transpose.
    points = numpy.asarray(data).T
    estimator = kde.gaussian_kde(points)
    return estimator(points)
Пример #15
0
    def get_chart_image(self):
        """Render the configured charts and return the encoded image data.

        For every attribute in ``self.chartdata`` up to three layers are
        drawn on one figure, depending on the flags ``self.histogram``,
        ``self.points`` and ``self.kde``.  A layer that fails to render is
        reported on stdout and skipped, so one bad attribute does not abort
        the whole chart.

        Returns:
            The contents of the buffer the figure was printed into.
        """
        fig = pylab.figure()

        for attribute in self.chartdata:
            values = self.chartdata[attribute]
            if self.histogram:
                try:
                    pylab.hist(values,
                               bins=100,
                               normed=self.normalized)
                except Exception:
                    print("Warning: problem rendering attribute histogram graph.")
                    print(values)
            if self.points:
                try:
                    pylab.scatter(values, zeros(len(values)))
                except Exception:
                    print("Warning: problem rendering attribute distribution scattar graph.")
            if self.kde:
                # Bound before the try block so the handler can always report it.
                x_axis = None
                try:
                    x_axis = linspace(self.minval, self.maxval, 1000)
                    approx_dist = gaussian_kde(values)
                    pylab.plot(x_axis, approx_dist(x_axis))
                except Exception:
                    print("Warning: problem rendering attribute distribution kde graph.")
                    print("min: " + str(self.minval) + ", max: " + str(self.maxval))
                    # str(): concatenating an array to a string raised inside the handler.
                    print("linspace:" + str(x_axis))

        chart_image = StringIO.StringIO()
        fig.canvas.print_figure(chart_image, dpi=80)
        # Release the figure; pylab otherwise keeps every figure alive.
        pylab.close(fig)
        return chart_image.getvalue()
	def __init__(self,sim_phone,classifier_model,power_model,callback_list) :
		"""Set up sensor distributions, sampling intervals and per-feature KDEs.

		sim_phone        -- object whose change_*_interval methods are called here
		classifier_model -- path of a pickle file holding the feature list
		power_model      -- path of a Python file executed with execfile
		callback_list    -- iterable of callback ids

		NOTE(review): wifi_distribution, gps_distribution, kernel_function and
		use_wifi are indexed or read here but never created in this method, so
		they presumably exist as class attributes -- confirm.
		"""
		self.sim_phone=sim_phone
		self.classifier_output=[]
		self.callback_list = callback_list
		# five equal weights of 0.2
		self.ewma_window = [0.2]*5

		# One Positive_Normal per index 0..4; the two arguments look like
		# (mean, standard deviation) -- TODO confirm against Positive_Normal.
		self.wifi_distribution[0]= Positive_Normal(138.4186,17.2395)
		self.wifi_distribution[1]= Positive_Normal(85.9490,14.2045)
		self.wifi_distribution[2]= Positive_Normal(76.9451,13.8514)
		self.wifi_distribution[3]= Positive_Normal(83.0392, 8.4357)
		self.wifi_distribution[4]= Positive_Normal(32.9173, 32.1233)
		
		self.gps_distribution[0] = Positive_Normal(0.05,0.2)
		self.gps_distribution[1] = Positive_Normal(1.4450, 0.5919)
		self.gps_distribution[2] = Positive_Normal(3.2262, 0.4802)
		self.gps_distribution[3] = Positive_Normal(3.3806, 1.1705)
		self.gps_distribution[4] = Positive_Normal(12.5267, 7.55964)

		for i in range(5):
			print self.wifi_distribution[i]

		# Count the callbacks with id 0, 3 or 4.  The count is not used by any
		# active code below (the checks that used it are commented out).
		hard_act_counter = 0
		for callback in self.callback_list:
			if callback == 0 or callback == 3 or callback == 4:
				hard_act_counter += 1

	#	if hard_act_counter == 1:
	#		self.use_wifi = 1
	#	if hard_act_counter >= 1:
		# GPS is enabled unconditionally.
		self.use_gps = 1

		''' set initial sampling intervals in milliseconds '''
		# Runs the power-model file.  The power_accel, power_gsm and
		# power_nwk_loc attributes read below are presumably defined by that
		# file -- verify.
		execfile(power_model)
		
		# Start with the largest key of power_accel as the accelerometer interval.
		self.current_sampling_interval=max(self.power_accel.keys())
		sim_phone.change_accel_interval(max(self.power_accel.keys()))
		# Enabled sensors get 60000; disabled ones get a very large interval.
		if self.use_wifi == 1:
			sim_phone.change_wifi_interval(60000)
		else:
			sim_phone.change_wifi_interval(10000000000)
		if self.use_gps == 1:
			sim_phone.change_gps_interval(60000)
		else:
			sim_phone.change_gps_interval(10000000000)

		sim_phone.change_gsm_interval(max(self.power_gsm.keys()))
		sim_phone.change_nwk_loc_interval(max(self.power_nwk_loc.keys()))
		
		# NOTE(review): the handle is never closed, and pickle.load must only be
		# used on trusted files.
		classifier_model_handle=open(classifier_model,"r");
		self.feature_list = pickle.load(classifier_model_handle);


		# Fit one Gaussian KDE per entry of feature_list[i], for i in 0..4.
		for i in range(5):
			self.kernel_function[i] = []
			for j in range(len(self.feature_list[i])):
				kernel_pdf = gaussian_kde(self.feature_list[i][j])
				#kernel_pdf.covariance_factor = lambda : 0.
				#kernel_pdf._compute_covariance()
				self.kernel_function[i] += [kernel_pdf]
		# The loaded features are dropped once the KDEs are built.
		self.feature_list = []
def lericsonPlot(X, x, T):
    """Draw a 3-D plot of KDE slices of X over time.

    For every eighth time step from t = 24 on, a Gaussian KDE (covariance
    factor fixed at 0.25) of ``X[t]`` is drawn as a curve at that time, and
    the density at the grid point closest to ``x[t]`` is printed and marked
    with a scatter point.

    Parameters:
        X: indexable by time step; ``X[t]`` holds the samples at time t.
        x: sequence of reference values, one per time step.
        T: upper limit of the time axis.
    """
    __import__('mpl_toolkits.mplot3d')
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.view_init(55, 45)
    #bins = np.linspace(np.min(X), np.max(X), 100)
    bins = np.linspace(-12, 12, 100)
    ax.set_xlim3d(0, T)
    ax.set_ylim3d(bins[0], bins[-1])
    ax.set_zlim3d(0, 1)
    ax.set_xlabel('$t$')
    ax.set_ylabel('$x_T$')
    ax.set_zlabel('$p(x_T | y_{1:T})$')

    # Multiples of 8 that are >= 20.  The old test used `is not 0`, an
    # identity comparison that only worked through small-int caching.
    for t in range(24, len(x), 8):
        density = gaussian_kde(X[t])
        xs = np.linspace(min(X[t]),max(X[t]),200)
        # Override the automatic bandwidth, then rebuild the covariance.
        density.covariance_factor = lambda : .25
        density._compute_covariance()
        pdf = density(xs)  # evaluated once, reused below
        ax.plot([t]*len(xs), xs, pdf)
        # index of the grid point closest to the reference value x[t]
        index = min(range(len(xs)), key=lambda i: abs(xs[i]-x[t]))
        print(pdf[index])
        ax.scatter(t, x[t], pdf[index])
    plt.tight_layout()
    plt.show()
Пример #18
0
def scatter_kde(x, y, ax=None):
    """Scatter-plot (x, y) on top of an image of their 2-D Gaussian KDE.

    Parameters:
        x, y: equal-length 1-D sequences of coordinates.
        ax: axes to draw on; defaults to the current axes.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        ax = pl.gca()

    x = np.asarray(x)
    y = np.asarray(y)

    # build kernel density estimator (KDE) from a (2, n_points) array;
    # np.array(zip(...)) does not produce that on Python 3.
    kde = gaussian_kde(np.vstack([x, y]))

    ax.scatter(x, y, alpha=0.5, color='white')

    # top right bottom left
    t = y.max()
    r = x.max()
    b = y.min()
    l = x.min()

    # Regular grid to evaluate kde upon
    n = 128
    x_flat = np.linspace(l, r, n)
    y_flat = np.linspace(b, t, n)

    g = np.array(np.meshgrid(x_flat, y_flat)).reshape(2,n*n)

    # evaluate the KDE at grid points
    z = kde(g).reshape(n,n)

    # np.ptp instead of the ndarray.ptp method, which newer NumPy removed.
    ax.imshow(z,
              aspect=np.ptp(x_flat)/np.ptp(y_flat),
              origin='lower',
              extent=(l,r,b,t))

    return ax
Пример #19
0
def kde_plot(x):
    """Plot a Gaussian KDE of *x* on a new figure.

    The density is evaluated on np.linspace's default number of points
    between x.min() and x.max(), so *x* must provide those methods.
    """
    # Public import location; scipy.stats.kde is a deprecated private module.
    from scipy.stats import gaussian_kde
    kde = gaussian_kde(x)
    positions = np.linspace(x.min(), x.max())
    smoothed = kde(positions)
    plt.figure()
    plt.plot(positions, smoothed)
Пример #20
0
Файл: kde.py Проект: aronwc/quac
 def build(self):
    """Fit the global Gaussian KDE and log how long the fit took.

    ``self.allpoints`` is the second element returned by
    build_token_location_map_transpose for ``self.token_iterator``; the
    estimator is stored in ``self.global_model``.
    """
    token_map = build_token_location_map_transpose(self.token_iterator)
    self.allpoints = token_map[1]
    n_tokens = len(self.token_iterator)
    l.debug('fitting kernel density estimate to %d points...' % n_tokens)
    started = time.time()
    self.global_model = kde.gaussian_kde(self.allpoints)
    elapsed = time.time() - started
    l.debug('...finished in %0.3f s.' % elapsed)
Пример #21
0
def determine_kde(data, 
                  size_kde=1000,
                  ymin=None,
                  ymax=None):
    '''
    
    Helper function responsible for performing a KDE
    
    :param data: 1-D sample the density is estimated from
    :param size_kde: number of points of the evaluation grid
    :param ymin: lower end of the grid; defaults to the data minimum
    :param ymax: upper end of the grid; defaults to the data maximum
    :returns: (kde_x, kde_y), the density values and the grid they were
              evaluated on; kde_x is all zeros if the estimation failed
    
    '''
    # `is None` rather than `not ...`, so an explicit bound of 0 is respected.
    if ymin is None:
        ymin = np.min(data)
    if ymax is None:
        ymax = np.max(data)
    
    kde_y = np.linspace(ymin, ymax, size_kde)
    
    try:
        kde_x = kde.gaussian_kde(data).evaluate(kde_y)
    except Exception as e:
        # Best effort: report the problem and fall back to a flat zero density.
        warning(e)
        kde_x = np.zeros(kde_y.shape)
    
    return kde_x, kde_y
Пример #22
0
def determine_kde(data, 
                  size_kde=1000,
                  ymin=None,
                  ymax=None):
    '''
    
    Helper function responsible for performing a KDE
    
    :param data: 1-D sample the density is estimated from
    :param size_kde: number of points of the evaluation grid
    :param ymin: lower end of the grid; defaults to the data minimum
    :param ymax: upper end of the grid; defaults to the data maximum
    :returns: (kde_x, kde_y), the density values and the grid they were
              evaluated on, ordered from ymax down to ymin; kde_x is all
              zeros if the estimation hit a linear-algebra error
    
    '''
    # `is None` rather than `not ...`, so an explicit bound of 0 is respected.
    if ymin is None:
        ymin = np.min(data)
    if ymax is None:
        ymax = np.max(data)
    
    # Reversed: the grid runs from ymax down to ymin.
    kde_y = np.linspace(ymin, ymax, size_kde)[::-1]
    
    try:
        kde_x = kde.gaussian_kde(data).evaluate(kde_y)
    except np.linalg.LinAlgError as e:
        # Degenerate (singular) data: report it and return a zero density.
        warning(e)
        kde_x = np.zeros(kde_y.shape)
    
    return kde_x, kde_y
Пример #23
0
def score(aPDB,aFASTA,exe=None,logf=None):
    
    ''' Gets alignment scores regardless of alignment method.

    :param aPDB: path of the alignment PDB file
    :param aFASTA: path of the alignment FASTA file
    :param exe: optional execution context; its ``scoresToDo`` selects the
                scores and its ``scpdbs`` enables the empirical RRMSD null
                distribution for short alignments
    :param logf: optional log object used while building that distribution
    :returns: list of alignmentScore objects, in the order of the requested
              score types
    '''
    
    # Get PDB structure.
    p = PDBnet.PDBstructure(aPDB)
    
    # Get length of alignment.
    alignlen = len(p)
    
    # See what scores need to be done (all of them when none were requested).
    scoresToDo = (exe.scoresToDo if exe else None) or SCORE_TYPES
    rrmsd,rpval,rmsd,tmsc,tpval,gdt = None,None,None,None,None,None
    
    # Get RRMSD and RMSD.
    if 'RRMSD' in scoresToDo or 'RMSD' in scoresToDo:
        rrmsd, rmsd = homology.rrmsd(aPDB,aFASTA,True)
        if not exe or not exe.scpdbs or alignlen >= 100:
            # Parametric p-value from a truncated normal distribution.
            rpval = 1 - truncnorm.sf(rrmsd, 0, 1, loc=0.177, scale=0.083)
        elif exe and logf and 'RRMSD' in scoresToDo:
            # Perform alignments in order to generate null distribution.
            # NOTE(review): `pdbli` is not defined in this function; it is
            # presumably a module-level name -- confirm.
            logf.setTotalNum(logf.totalnum+2*(len(pdbli)+1))
            logf.writeTemporary(
                'Generating null distribution from SCOP for %s...' % (aPDB))
            run(exe.scpdbs,logf,ref=aPDB,exe=exe,quick=None)
            alignfldr = IO.getFileName(aPDB)
            # Context manager closes the file even if unpickling fails.
            # NOTE(review): unpickling is only safe for trusted files.
            with open('%s/ref.pickl' % (alignfldr)) as o:
                dic, _, _ = cPickle.load(o)
            pdf = gaussian_kde(dic.values())
            rpval = pdf(rrmsd)
    
    # Get GDT and TMscore.
    if 'TMscore' in scoresToDo:
        tmsc = p.tmscore(aFASTA)
        tpval = 1 - math.exp(-math.exp((0.1512-tmsc)/0.0242))
         
    if 'GDT' in scoresToDo:
        gdt = p.gdt(aFASTA)
        
    # Add them to list in order as given; unknown score names are ignored.
    arguments = {
        'RRMSD':   ('RRMSD',rrmsd,rpval),
        'RMSD':    ('RMSD',rmsd),
        'TMscore': ('TMscore',tmsc,tpval),
        'GDT':     ('GDT',gdt),
    }
    scores = [alignmentScore(*arguments[it]) for it in scoresToDo if it in arguments]
    
    # Return the scoring values.
    return scores
def pdf_estimation(x):
    """Fit a Gaussian KDE to every sample in *x*.

    Returns three parallel lists: the fitted estimators, an evaluation grid
    per sample (as many evenly spaced points between its min and max as the
    sample has values), and the samples themselves.
    """
    samples = [x[i] for i in range(len(x))]
    density = [kde.gaussian_kde(sample) for sample in samples]
    xgrid = [np.linspace(min(sample), max(sample), len(sample)) for sample in samples]
    return density, xgrid, samples
Пример #25
0
    def histogram(self, index, **options):
        """Plot a normalised histogram of one data column and save it.

        index   -- column of ``self.datafile.data()`` to plot; the matching
                   entry of ``self.datafile.parameters`` supplies the axis
                   label and the default x range
        options -- xmin / xmax: x-axis limits (default: the parameter's own
                   min / max); kde: when true, overlay a Gaussian KDE;
                   kde_bandwidth: multiplier applied to Silverman's
                   bandwidth factor (default 1.0)

        The figure is written to ``self.pdffile``.
        """
        data = self.datafile.data()[:, index]
        # (name, min, max, nuisance, prior)
        parameter = self.datafile.parameters[index]

        plot.clf()
        plot.figure(figsize=(10, 10), dpi=80)

        # x axis
        plot.xlabel(parameter[0])
        # .get + `is None`: missing options fall back to the defaults, and an
        # explicit limit of 0 is still honoured.
        xmin = options.get('xmin')
        if xmin is None:
            xmin = parameter[1]
        xmax = options.get('xmax')
        if xmax is None:
            xmax = parameter[2]
        plot.xlim(xmin, xmax)

        # y axis
        plot.ylabel('frequency')

        # plot
        plot.hist(data, bins=100, normed=1, alpha=.3)
        if options.get('kde'):
            kde = gaussian_kde(data)
            # Start from Silverman's factor, then scale it by the requested
            # multiplier.
            kde.set_bandwidth(bw_method='silverman')
            kde.set_bandwidth(bw_method=kde.factor * options.get('kde_bandwidth', 1.0))
            x = numpy.linspace(xmin, xmax, 1000)
            plot.plot(x, kde(x), 'r')

        plot.tight_layout()

        # save figure
        plot.savefig(self.pdffile)
Пример #26
0
def plotSnpDensity(snpDensityWindowSize, ax, chrSNPs, labels):
    """Add a moving average of the SNP density, together with the matching
    kernel density estimate, to the plot.

    A twin y axis is created on ``ax``.  If the last SNP position is at
    least one window size, the moving average (from movingAverageOverWindow),
    a scaled Silverman-bandwidth KDE of the positions and an 8-bin histogram
    are drawn on it, and the two line handles are stored in ``labels`` under
    'snp density' and 'snp kde'.

    Returns:
        The twin axes; returned without any curves when there is not enough
        data.
    """
    ax2 = ax.twinx()
    ax2.set_ylabel(r"Snps per Window")

    x = chrSNPs['Position']
    # .iloc[-1]: Series.irow was removed from pandas.
    if x.iloc[-1] < snpDensityWindowSize:
        # Return early: this path used to fall through to names that were
        # never assigned and died with a NameError.
        print("not enough values to calculate moving average for SNPs")
        print("no snp density plotted")
        return ax2

    print("calculating SNP density")
    (movingAveragePoolValues , positions) = movingAverageOverWindow(chrSNPs, snpDensityWindowSize, snpDensityWindowSize/2)

    density = kde.gaussian_kde(x,bw_method = 'silverman' )
    xgrid = numpy.linspace(x.min(), x.max(),  len(x))

    if len(movingAveragePoolValues) == len(positions):
        labels['snp density'], = ax2.plot(positions,movingAveragePoolValues)
        # The factor only rescales the KDE so it is visible on this axis.
        labels['snp kde'], = ax2.plot(xgrid, density(xgrid)*1000000000, 'r-')
        ax2.hist(x, bins=8, normed=True)
    else:
        print("no snp density plotted")

    return ax2
Пример #27
0
def kerndens(vec,nbins=100):
    """Show a normalised histogram of *vec* with a Gaussian KDE curve.

    The KDE is evaluated in steps of 0.1 from 0 up to 1.01 times the data
    range (max - min).
    """
    hist(vec, color='g', bins=nbins, normed=True, align='mid')
    estimator = gaussian_kde(vec)
    # NOTE(review): the grid starts at 0, not at min(vec).
    grid = arange(0, (1.01*(max(vec)-min(vec))), .1)
    plot(grid, estimator.evaluate(grid))
    show()
Пример #28
0
def test(x,y):
    """Compare four 2-D density views of (x, y) and save them as 'fig2.png'.

    The 2x2 figure holds a scatterplot, a hexbin plot, a 2-D histogram and a
    Gaussian KDE evaluated on a regular nbins x nbins grid over the data
    extents.  ``x`` and ``y`` must be equal-length arrays providing
    ``min()`` and ``max()``.
    """
    # Single-argument form prints the same text on Python 2 and Python 3.
    print(type(x))
    print(type(y))
    nbins = 20

    fig, axes = plt.subplots(ncols=2, nrows=2, sharex=True, sharey=True)

    axes[0, 0].set_title('Scatterplot')
    axes[0, 0].plot(x, y, 'ko')

    axes[0, 1].set_title('Hexbin plot')
    axes[0, 1].hexbin(x, y, gridsize=nbins)

    axes[1, 0].set_title('2D Histogram')
    axes[1, 0].hist2d(x, y, bins=nbins)

    # Evaluate a gaussian kde on a regular grid of nbins x nbins over data extents.
    # Built from the arguments; the code used to read a name `data` that is
    # not defined in this function.
    k  = kde.gaussian_kde(np.vstack([x, y]))
    xi, yi = np.mgrid[x.min():x.max():nbins*1j, y.min():y.max():nbins*1j]
    zi = k(np.vstack([xi.flatten(), yi.flatten()]))

    axes[1, 1].set_title('Gaussian KDE')
    axes[1, 1].pcolormesh(xi, yi, zi.reshape(xi.shape))

    fig.tight_layout()
    plt.savefig('fig2.png')
Пример #29
0
def PlotKernel(band = 'alpha'):
    """Scatter before/after values per row and overlay a scaled KDE curve.

    Reads the columns ``<band>_before`` and ``<band>_after`` of the
    module-level ``joined`` frame.  Each remaining row is drawn at its row
    number, with the "before" value in blue and the "after" value in red.
    A Gaussian KDE of all values below 26 is plotted on top, multiplied by
    400.
    """
    # .values: DataFrame/Series.as_matrix() was removed from pandas.
    befores = joined[band+'_before'].values
    afters = joined[band+'_after'].values

    samp_split = np.vstack((befores, afters)).T
    # Rows 1, 26 and 36 are dropped (hard-coded exclusions).
    samp_split = np.delete(samp_split, [1,26,36], 0)
    samp_joined = np.hstack((befores, afters))
    samp_joined =  samp_joined[ np.where(samp_joined <26)]
    x = np.linspace(0,28,1000)

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # obtaining the pdf (my_pdf is a function!)
    my_pdf = gaussian_kde(samp_joined)
    for i in range (0,len(samp_split)):
        ax.scatter(samp_split[i,0], i, color = 'blue')
        ax.scatter(samp_split[i,1], i, color = 'red')

    # plotting the result: density rescaled to be visible against row numbers
    ax.plot(x, my_pdf(x) *400,'r') # distribution function
Пример #30
0
def distparams(dist):
    """
    Description:
    ------------
    Return robust statistics of a distribution of data values

    A Gaussian KDE of ``dist`` is evaluated on 1000 points between
    0.5*min(dist) and 1.5*max(dist), integrated numerically into a
    cumulative distribution, and inverted by linear interpolation.

    Returns:
    --------
    med      : value at cumulative fraction 0.5
    mode     : grid value with the highest density
    interval : width of the narrowest window holding 68.27% of the probability
    lo, hi   : values at cumulative fractions erfc(1/sqrt(2)) and
               erf(1/sqrt(2)), i.e. about 0.317 and 0.683

    Example:
    --------
    med,mode,interval,lo,hi = distparams(dist)
    """

    # Public import location; scipy.stats.kde is a deprecated private module.
    from scipy.stats import gaussian_kde
    from scipy.interpolate import interp1d
    vals = np.linspace(np.min(dist)*0.5,np.max(dist)*1.5,1000)
    pdf = gaussian_kde(dist)(vals)
    dist_c = np.cumsum(pdf)/np.sum(pdf)
    # Inverse CDF: cumulative fraction -> data value.
    func = interp1d(dist_c,vals,kind='linear')
    # Builtin float: the np.float alias was removed from NumPy.
    lo = float(func(math.erfc(1./np.sqrt(2))))
    hi = float(func(math.erf(1./np.sqrt(2))))
    med = float(func(0.5))
    mode = vals[np.argmax(pdf)]

    # Slide a window covering 68.27% of the probability and keep the narrowest.
    disthi = np.linspace(.684,.999,100)
    distlo = disthi-0.6827
    interval = np.min(func(disthi)-func(distlo))

    return med,mode,interval,lo,hi
# Keep only the columns of df whose labels match the regular expression mytime.
this_df = df.filter(regex=mytime, axis=1)

# compute beam radius

# Restrict to the rows selected by idx_arrived.
this_df = this_df.iloc[idx_arrived, :]
# The first three columns are read as qx, qy, qz
# (presumably position components -- TODO confirm).
qx = this_df.iloc[:, 0]
median_qx = np.median(qx)
qy = this_df.iloc[:, 1]
qz = this_df.iloc[:, 2]
# radial distance in the qy-qz plane
qr = np.sqrt(qy**2 + qz**2)

nbins = 500
x = qy
y = qz
# 2-D Gaussian KDE of (qy, qz); gaussian_kde takes one row per dimension.
data = np.vstack([qy, qz])
k = kde.gaussian_kde(data)
# xi, yi = np.mgrid[x.min():x.max():nbins*1j, y.min():y.max():nbins*1j]
# Fixed nbins x nbins evaluation grid spanning [-3, 3] on both axes.
xi, yi = np.mgrid[-3:3:nbins * 1j, -3:3:nbins * 1j]
zi = k(np.vstack([xi.flatten(), yi.flatten()]))
# scale between 0 and 1
zi = (zi - np.min(zi)) / (np.max(zi) - np.min(zi))
# print(zi)

f = plt.figure(1, figsize=(7, 7))
# plt.title('KDE Gaussian on target for run \n {}'.format(type_file))
nullfmt = NullFormatter()  # no labels

# definitions for the axes
# (fractions of the figure; the *_h offsets sit 0.02 beyond the main panel)
left, width = 0.05, 0.65
bottom, height = 0.05, 0.65
bottom_h = left_h = left + width + 0.02
Пример #32
0
           borderaxespad=0.)

# Figure 39: the two theta_3 series (c31_mat as "KF1", c32_mat as "KF2")
# plotted against the run number.
plt.figure(39)
plt.plot(c31_mat, label="KF1", color='b')
plt.plot(c32_mat, label="KF2", color='g')
plt.ylabel(r'$\theta_3$', fontsize=22)
plt.xlabel('runs', fontsize=14)
plt.ylim((0.1, 0.4))
# Legend placed above the axes, stretched over the full width.
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102),
           loc=3,
           ncol=2,
           mode="expand",
           borderaxespad=0.)

# New figure: histograms and Gaussian KDEs (bandwidth argument 0.4) of
# gr_lik1 ("KF1") and thinned21 ("KF2"), with a vertical line at c_real[0].
plt.figure()
density1 = kde.gaussian_kde(gr_lik1, 0.4)
# 1000 evaluation points spanning the range of the sample
dist_space = np.linspace(min(gr_lik1), max(gr_lik1), 1000)
plt.hist(gr_lik1, bins=10, normed=True, histtype='stepfilled', alpha=0.2)
# plt.plot returns a list of line objects; it is kept under the name `prior`.
prior = plt.plot(dist_space, density1(dist_space), label="KF1", color='r')
density2 = kde.gaussian_kde(thinned21, 0.4)
dist_space2 = np.linspace(min(thinned21), max(thinned21), 1000)
post = plt.plot(dist_space2, density2(dist_space2), label="KF2", color='g')
plt.hist(thinned21, bins=10, normed=True, histtype='stepfilled', alpha=0.2)
# Vertical marker from y = 0 to y = 70 at c_real[0].
plt.vlines(c_real[0], 0, 70, colors=u'b')
plt.xlabel(r'$\theta_1$', fontsize=22)
plt.ylabel(r'$p(\theta_1|data)$', fontsize=18)
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102),
           loc=3,
           ncol=2,
           mode="expand",
           borderaxespad=0.)
Пример #33
0
    def perform_kde(self, x_, y_):
        """Evaluate a 2-D Gaussian KDE of (x_, y_) on the instance's grid.

        The estimator is fitted to the paired samples and evaluated at the
        flattened ``self.xi`` / ``self.yi`` grid coordinates; the densities
        are returned as a flat array.
        """
        estimator = kde.gaussian_kde([x_, y_])
        grid_points = np.vstack([self.xi.flatten(), self.yi.flatten()])
        return np.array(estimator(grid_points))
Пример #34
0
            yhat_ts = predicted_yhat_ts[clf_param_key]

        # plt.figure()
        # plt.hist(yhat_tr[:, 1], 20, histtype='step', color='r', linewidth=2, label='train')
        # plt.hist(yhat_ts[:, 1], 20, histtype='step', color='b', linewidth=2, label='test')
        # plt.title('Patient {0}'.format(key))
        # plt.xlim(0, 1)
        # plt.grid()
        # plt.legend()
        # # plt.show()

        # NOTE(review): scipy.stats.kde is a deprecated import location;
        # gaussian_kde is also available directly from scipy.stats.
        from scipy.stats.kde import gaussian_kde
        from scipy.stats import entropy

        # Estimate the pdfs of column 1 of the train and test predictions
        pdf_tr = gaussian_kde(yhat_tr[:, 1])
        pdf_ts = gaussian_kde(yhat_ts[:, 1])
        # common evaluation grid of 100 points on [0, 1]
        x = np.linspace(0, 1, 100)

        # With two arguments scipy's entropy returns the Kullback-Leibler
        # divergence of the first distribution from the second; both inputs
        # are normalised internally.
        en = entropy(pdf_tr(x), pdf_ts(x))
        # print key-1
        # print dkl.shape
        # stored at position key - 1
        dkl[key - 1] = en

        # plt.figure()
        # plt.plot(x, pdf_tr(x), color='r', linewidth=2, label='train')
        # plt.plot(x, pdf_ts(x), color='b', linewidth=2, label='test')
        # # plt.hist(d1_np,normed=1,color="cyan",alpha=.8)
        # # plt.plot(x,norm.pdf(x,mu,stdv),label="parametric distribution",color="red")
        # plt.legend()
        # plt.grid()
Пример #35
0
import matplotlib.pyplot as plt
import pandas
import numpy as np
from scipy.stats import kde

# Load the data set; columns are addressed by position below.
adult_csv = pandas.read_csv('../adult.csv')

# Mean-normalise a column: (value - mean) / (max - min).
_age_col = adult_csv.iloc[:, 0]
adult_age = (_age_col - _age_col.mean()) / (_age_col.max() - _age_col.min())

# NOTE(review): column 2 is treated as "salary" here - confirm against the CSV.
_salary_col = adult_csv.iloc[:, 2]
adult_salary = (_salary_col - _salary_col.mean()) / (_salary_col.max() -
                                                     _salary_col.min())

# Resolution of the evaluation grid along each axis.
nbins = 75

# Fit a 2-D Gaussian KDE on (salary, age) pairs.
k = kde.gaussian_kde([adult_salary, adult_age])

# Regular nbins x nbins grid spanning the extent of the normalised data.
_grid_steps = complex(0, nbins)
xi, yi = np.mgrid[adult_salary.min():adult_salary.max():_grid_steps,
                  adult_age.min():adult_age.max():_grid_steps]
_grid_points = np.vstack([xi.flatten(), yi.flatten()])
zi = k(_grid_points)

# Draw the density as a colour mesh, save it to disk, then display it.
plt.pcolormesh(xi, yi, zi.reshape(xi.shape))

plt.savefig('density_plot.png')
plt.show()
# Y, years and name are defined earlier in the script (outside this excerpt).
# Scale every column of Y by its column mean.
RY = Y / Y.mean(axis=0)
# Plot the first row of the scaled data against years.
plt.plot(years, RY[0])

name = np.array(name)

# One labelled line for each of the first five entries of name.
for msoa in list(name[:5]):
    # np.nonzero(...)[0][0] is the first row index whose name equals msoa.
    plt.plot(years, RY[np.nonzero(name == msoa)[0][0]], label=msoa)
plt.legend()

# Spaghetti plot: every row of RY drawn as its own line.
for row in RY:
    plt.plot(years, row)

# Kernel Density (univariate, aspatial)
# KDE of the first column of Y, evaluated on 100 points between 90% of the
# column minimum and 110% of the column maximum.
density = gaussian_kde(Y[:, 0])
minY0 = Y[:, 0].min() * .90
maxY0 = Y[:, 0].max() * 1.10
x = np.linspace(minY0, maxY0, 100)
plt.plot(x, density(x))

# Same construction for the last column of Y (minY0/maxY0/x are reused).
# NOTE(review): the name d2017 suggests the last column is year 2017 - confirm.
d2017 = gaussian_kde(Y[:, -1])
minY0 = Y[:, -1].min() * .90
maxY0 = Y[:, -1].max() * 1.10
x = np.linspace(minY0, maxY0, 100)
plt.plot(x, d2017(x))

# Evaluation grid over the full range of the scaled data, and a KDE of its
# first column; neither is plotted within this excerpt.
minR0 = RY.min()
maxR0 = RY.max()
x = np.linspace(minR0, maxR0, 100)
d2007 = gaussian_kde(RY[:, 0])
Пример #37
0
                 index_col=False)

# Quick inspection of the AH frame (loaded above this excerpt): dimensions,
# row count, column dtypes and summary statistics for every column.
print(AH.shape)
print(len(AH))
print(AH.dtypes)
print(AH.describe(include='all'))
# AH['SalePrice'].hist()
# List what `style.available` holds (`style` is imported outside this excerpt).
print(style.available)
# fig, ax = plt.subplots()
# plt.hist(AH['SalePrice'], bins=60, log=True)#.log.hist(bins=60, density=1)

# NOTE(review): scipy.stats.kde is a deprecated import path; gaussian_kde is
# also available directly from scipy.stats.
from scipy.stats.kde import gaussian_kde
from numpy import linspace, hstack
from pylab import plot, show, hist

# Gaussian KDE of the SalePrice column and a 1000-point grid spanning its
# observed range; the plotting calls that would use them are commented out.
my_density = gaussian_kde(AH['SalePrice'])
x = linspace(min(AH['SalePrice']), max(AH['SalePrice']), 1000)
# plot(x, my_density(x), 'g')
# hist(AH['SalePrice'], normed=1, alpha=0.3)
# AH.groupby('MS Zoning')['SalePrice'].plot.hist(density=1, alpha=0.6)
# ax = AH.boxplot(column='SalePrice', by='MS Zoning')
# ax.get_figure().suptitle('')
# Number of rows per distinct value of the "MS Zoning" column.
print(AH["MS Zoning"].value_counts())
# plt.show()
# print(AH.head())
# Load the cp1251-encoded town data set and print it with summary statistics.
town = pd.read_csv('./Shad_Python_01_2/town_1959_2/town_1959_2.csv',
                   encoding='cp1251',
                   index_col=False)
print(town)
print(town.describe())  # summary statistics (the "mean" row is the average)
# Call mean(): the bare attribute only printed the bound-method repr, not values.
print(town.describe().mean())
Пример #38
0
    # simulation
    for i in range(0, seq_length):

        plt.clf()

        # get reading
        measurement = sensor_seq[i, :]

        # update
        mcl.update(measurement, speed)

        # get the location using kde, rather than simple mean

        # this create the kernel, given an array it will estimate the probability over that values
        kde = gaussian_kde(np.transpose(mcl.x_t[:, 0]))
        # these are the values over wich your kernel will be evaluated
        pdf = kde(range(mcl.map_length))
        inds = np.argmax(pdf)
        est = np.mean(inds)

        print('Estimate')
        print(est)

        true_location = start_pos + i * speed

        # plot the results
        plt.scatter(mcl.x_t[:, 0],
                    range(mcl.npart),
                    s=1,
                    c='k',
Пример #39
0
# Continue building the training set (its first rows are assembled above this
# excerpt): rows 0..39 of sample groups 3, 4 and 5 are appended.
train_set = np.append(train_set, all_samples[3][0:40], axis=0)
train_set = np.append(train_set, all_samples[4][0:40], axis=0)
train_set = np.append(train_set, all_samples[5][0:40], axis=0)

# The test set takes rows 40..79 of sample groups 1 through 5.
test_set = np.append(all_samples[1][40:80], all_samples[2][40:80], axis=0)
test_set = np.append(test_set, all_samples[3][40:80], axis=0)
test_set = np.append(test_set, all_samples[4][40:80], axis=0)
test_set = np.append(test_set, all_samples[5][40:80], axis=0)

# 40 for training and 40 for testing for each class, thus training data and testing data should have
# size of (40 * 5 = 200, 3) shape, last column is the class column
assert (train_set.shape == (200, 3))
assert (test_set.shape == (200, 3))

# For every candidate bandwidth, fit one 2-D KDE per class (classes 1 and 2)
# and evaluate the classifier on the test set.
for bw in window_bw:
    # Rows whose class column equals 1 / 2; .T[0:2] keeps the two feature
    # columns in the (n_dims, n_samples) layout gaussian_kde expects.
    class1_kde = kde.gaussian_kde(train_set[train_set[:, 2] == 1].T[0:2],
                                  bw_method=bw)
    class2_kde = kde.gaussian_kde(train_set[train_set[:, 2] == 2].T[0:2],
                                  bw_method=bw)

    # empirical_error and bayes_classifier are defined outside this excerpt.
    # NOTE(review): classification_dict is indexed below as [actual][predicted]
    # - confirm against empirical_error.
    classification_dict, error = empirical_error(test_set, [1, 2],
                                                 bayes_classifier,
                                                 [[class1_kde, class2_kde]])

    # Header row of the confusion matrix: "test dataset", then one column per
    # predicted class.
    labels_predicted = ['w{} (predicted)'.format(i) for i in [1, 2]]
    labels_predicted.insert(0, 'test dataset')

    # NOTE(review): named train_conf_mat but filled from the test-set
    # classification above - confirm the intended name.
    train_conf_mat = prettytable.PrettyTable(labels_predicted)
    for i in [1, 2]:
        a, b = [classification_dict[i][j] for j in [1, 2]]
        # workaround to unpack (since Python does not support just '*a')
        train_conf_mat.add_row(['w{} (actual)'.format(i), a, b])
Пример #40
0
def plot_persistence_density(persistence=[],
                             persistence_file="",
                             nbins=300,
                             bw_method=None,
                             max_intervals=1000,
                             dimension=None,
                             cmap=None,
                             legend=False,
                             axes=None,
                             fontsize=16,
                             greyblock=False):
    """This function plots the persistence density from persistence
    values list, np.array of shape (N x 2) representing a diagram
    in a single homology dimension,
    or from a :doc:`persistence file <fileformats>`. Be
    aware that this function does not distinguish the dimension, it is
    up to you to select the required one. This function also does not handle
    degenerate data set (scipy correlation matrix inversion can fail).

    Note that calling this function switches the global matplotlib settings
    ``text.usetex`` to True and ``font.family`` to ``serif``.

    :param persistence: Persistence intervals values list.
                        Can be grouped by dimension or not.
    :type persistence: an array of (dimension, array of (birth, death))
                        or an array of (birth, death).
    :param persistence_file: A :doc:`persistence file <fileformats>`
        style name. If both `persistence` and `persistence_file` are given,
        the file is still read but the intervals taken from `persistence`
        are the ones that get plotted.
    :type persistence_file: string
    :param nbins: Evaluate a gaussian kde on a regular grid of nbins x
        nbins over data extents (default is 300)
    :type nbins: int.
    :param bw_method: The method used to calculate the estimator
        bandwidth. This can be 'scott', 'silverman', a scalar constant
        or a callable. If a scalar, this will be used directly as
        kde.factor. If a callable, it should take a gaussian_kde
        instance as only parameter and return a scalar. If None
        (default), 'scott' is used. See
        `scipy.stats.gaussian_kde documentation
        <http://scipy.github.io/devdocs/generated/scipy.stats.gaussian_kde.html>`_
        for more details.
    :type bw_method: str, scalar or callable, optional.
    :param max_intervals: maximal number of points used in the density
        estimation.
        Selected intervals are those with the longest life time. Set it
        to 0 to see all. Default value is 1000.
    :type max_intervals: int.
    :param dimension: the dimension to be selected in the intervals
        (default is None to mix all dimensions).
    :type dimension: int.
    :param cmap: A matplotlib colormap (default is
        matplotlib.pyplot.cm.hot_r).
    :type cmap: cf. matplotlib colormap.
    :param legend: Display the color bar values (default is False).
    :type legend: boolean.
    :param axes: A matplotlib-like subplot axes. If None, the plot is drawn on
        a new set of axes.
    :type axes: `matplotlib.axes.Axes`
    :param fontsize: Fontsize to use in axis.
    :type fontsize: int
    :param greyblock: if we want to plot a grey patch on the lower half plane
                         for nicer rendering. Default False.
    :type greyblock: boolean
    :returns: (`matplotlib.axes.Axes`): The axes on which the plot was drawn,
        or None when the persistence file is not found, when no intervals
        were supplied, or when matplotlib/scipy cannot be imported.
    """
    try:
        import matplotlib.pyplot as plt
        import matplotlib.patches as mpatches
        from scipy.stats import kde
        plt.rc('text', usetex=True)
        plt.rc('font', family='serif')

        persistence = _array_handler(persistence)

        # Stays None until one of the two input sources provides intervals.
        persistence_dim = None

        if persistence_file != "":
            if dimension is None:
                # All dimension case
                dimension = -1
            if path.isfile(persistence_file):
                persistence_dim = read_persistence_intervals_in_dimension(
                    persistence_file=persistence_file, only_this_dim=dimension)
            else:
                print("file " + persistence_file + " not found.")
                return None

        if len(persistence) > 0:
            persistence_dim = np.array([
                (dim_interval[1][0], dim_interval[1][1])
                for dim_interval in persistence
                if (dim_interval[0] == dimension) or (dimension is None)
            ])

        if persistence_dim is None:
            # Neither a file nor a list of intervals was supplied; previously
            # this fell through to an UnboundLocalError on the next statement.
            print("no persistence intervals to plot.")
            return None

        # Drop intervals whose death value is not finite (inf or NaN).
        persistence_dim = persistence_dim[np.isfinite(persistence_dim[:, 1])]
        if max_intervals > 0 and max_intervals < len(persistence_dim):
            # Sort by life time, then takes only the max_intervals elements
            persistence_dim = np.array(
                sorted(
                    persistence_dim,
                    key=lambda life_time: life_time[1] - life_time[0],
                    reverse=True,
                )[:max_intervals])

        # Set as numpy array birth and death (remove undefined values - inf and NaN)
        birth = persistence_dim[:, 0]
        death = persistence_dim[:, 1]

        # default cmap value cannot be done at argument definition level as matplotlib is not yet defined.
        if cmap is None:
            cmap = plt.cm.hot_r
        if axes is None:
            fig, axes = plt.subplots(1, 1)

        # line display of equation : birth = death
        x = np.linspace(death.min(), birth.max(), 1000)
        axes.plot(x, x, color="k", linewidth=1.0)

        # Evaluate a gaussian kde on a regular grid of nbins x nbins over data extents
        k = kde.gaussian_kde([birth, death], bw_method=bw_method)
        xi, yi = np.mgrid[birth.min():birth.max():nbins * 1j,
                          death.min():death.max():nbins * 1j, ]
        zi = k(np.vstack([xi.flatten(), yi.flatten()]))

        # Make the plot
        img = axes.pcolormesh(xi, yi, zi.reshape(xi.shape), cmap=cmap)

        if greyblock:
            # Grey triangle covering the region below the diagonal.
            axes.add_patch(
                mpatches.Polygon([[birth.min(), birth.min()],
                                  [death.max(), birth.min()],
                                  [death.max(), death.max()]],
                                 fill=True,
                                 color='lightgrey'))

        if legend:
            plt.colorbar(img, ax=axes)

        axes.set_xlabel("Birth", fontsize=fontsize)
        axes.set_ylabel("Death", fontsize=fontsize)
        axes.set_title("Persistence density", fontsize=fontsize)

        return axes

    except ImportError:
        print(
            "This function is not available, you may be missing matplotlib and/or scipy."
        )
Пример #41
0
def getPDF(values):
    """Return a Gaussian kernel density estimator fitted to ``values``.

    Nothing is drawn here: the returned object is callable and yields the
    estimated probability density at the points it is given.
    """
    return gaussian_kde(values)
Пример #42
0
def MVKDE(S, J, proportion_matrix, filename=None, plot=False, bandwidth=0.25):
    """
    Generates a Multivariate Kernel Density Estimator and returns a
    matrix representing a probability distribution according to given
    age categories, and ability type categories.

    The marginal proportions of ``proportion_matrix`` are turned into two
    multinomial samples of 70000 draws each (using the global NumPy random
    state). Age categories are coded 18, 19, ... and ability categories
    1, 2, ...; a 2-D Gaussian KDE is fitted to the paired samples and
    evaluated on an S x J grid spanning the sampled values.

    Args:
        S (scalar): the number of age groups in the model
        J (scalar): the number of ability type groups in the model.
        proportion_matrix (Numpy array): SxJ shaped array that
            represents the proportions of the total going to each
            (s,j) combination
        filename (str): the file name to save image to; it is passed to
            ``plt.savefig`` and only used when ``plot`` is True
        plot (bool): whether or not to save a 3-D surface plot of the
            smoothed distribution
        bandwidth (scalar): passed to ``gaussian_kde`` as ``bw_method``.
            Higher bandwidth creates a smoother kernel.

    Returns:
        estimator_scaled (Numpy array): SxJ shaped array that
            that represents the smoothed distribution of proportions
            going to each (s,j), normalised to sum to one

    """
    proportion_matrix_income = np.sum(proportion_matrix, axis=0)
    proportion_matrix_age = np.sum(proportion_matrix, axis=1)
    # Draw order is kept (age first, then income) so seeded runs are unchanged.
    age_probs = np.random.multinomial(70000, proportion_matrix_age)
    income_probs = np.random.multinomial(70000, proportion_matrix_income)
    # An imaginary step tells np.mgrid "this many points, end inclusive".
    age_mesh = complex(0, S)
    income_mesh = complex(0, J)

    # Expand the counts into one value per draw: ages start at 18, ability
    # types at 1, each repeated as often as its category was drawn.
    age_values = 18 + np.arange(len(age_probs), dtype=float)
    age_frequency = np.repeat(age_values, age_probs)
    income_values = 1 + np.arange(len(income_probs), dtype=float)
    income_frequency = np.repeat(income_values, income_probs)

    freq_mat = np.vstack((age_frequency, income_frequency)).T
    density = kde.gaussian_kde(freq_mat.T, bw_method=bandwidth)
    age_min, income_min = freq_mat.min(axis=0)
    age_max, income_max = freq_mat.max(axis=0)
    agei, incomei = np.mgrid[
        age_min:age_max:age_mesh, income_min:income_max:income_mesh
    ]
    coords = np.vstack([item.ravel() for item in [agei, incomei]])
    estimator = density(coords).reshape(agei.shape)
    # Normalise so the grid values form a discrete probability distribution.
    estimator_scaled = estimator / float(np.sum(estimator))
    if plot:
        fig = plt.figure()
        # Figure.gca() no longer accepts a projection keyword in current
        # Matplotlib; add_subplot() is the supported way to get 3-D axes.
        ax = fig.add_subplot(projection="3d")
        ax.plot_surface(agei, incomei, estimator_scaled, rstride=5)
        ax.set_xlabel("Age")
        ax.set_ylabel("Ability Types")
        ax.set_zlabel("Received proportion of total bequests")
        plt.savefig(filename)
    return estimator_scaled
Пример #43
0
def plotContour(filename, source=False, particle='all'):
    """Show a 2-D histogram, a Gaussian-KDE map and KDE contours of x/y hits.

    The ``procdf`` dataset is read from the HDF5 file ``filename`` and
    optionally filtered on its ``energy`` column. Three panels are drawn side
    by side (2-D histogram, KDE colour mesh, KDE contour lines) and shown
    with ``plt.show()``.

    :param filename: path of the HDF5 file to read.
    :param source: when true, the ``sourcePV_df`` dataset is read as well and
        the number of its ``x`` entries is printed.
    :param particle: ``'all'`` (no filter), ``'alpha'`` (energy > 5) or
        ``'gamma'`` (0.04 < energy < 0.08). Any other value prints a message
        and terminates the interpreter via ``exit()``.
    """
    # read_hdf selects the dataset with `key`; the former `keys=` keyword is
    # not the dataset selector and left the dataset choice to pandas.
    df = pd.read_hdf(filename, key='procdf')

    if particle == 'all':
        selected = df
        plot_title = 'Spot Size, $^{241}$Am 10$^7$ Primaries, all energies'
    elif particle == 'alpha':
        selected = df.loc[df.energy > 5]
        plot_title = 'Spot Size, $^{241}$Am 10$^7$ Primaries, Energy $>$ 5 MeV'
    elif particle == 'gamma':
        # NOTE(review): the filter starts at 0.04 while the title says 60 keV
        # - confirm which bound is intended.
        selected = df.loc[(df.energy > .04) & (df.energy < 0.08)]
        plot_title = 'Spot Size, $^{241}$Am 10$^7$ Primaries, 60 kev $<$ Energy $<$ 80 keV'
    else:
        print('specify particle type!')
        exit()

    x = np.array(selected['x'])
    y = np.array(selected['y'])
    # z, energy and plot_title are extracted but not used by the panels below.
    z = np.array(selected['z'])
    energy = np.array(selected['energy'] * 1000)

    fig, ax = plt.subplots(ncols=3)
    nbins = 50

    # Panel 0: normalised 2-D histogram. `density=` replaces the removed
    # `normed=` keyword in both NumPy and Matplotlib.
    counts, xbins, ybins = np.histogram2d(x, y, bins=nbins, density=True)
    ax[0].hist2d(x, y, bins=nbins, cmap='plasma', density=True)
    ax[0].set_xlim(-10, 10)
    ax[0].set_ylim(9, 19)

    # Gaussian KDE of the (x, y) points evaluated on an nbins x nbins grid
    # spanning the data extent.
    xi, yi = np.mgrid[x.min():x.max():nbins * 1j, y.min():y.max():nbins * 1j]
    positions = np.vstack([xi.flatten(), yi.flatten()])
    values = np.vstack([x, y])
    kernel = kde.gaussian_kde(values)
    zi = np.reshape(kernel(positions).T, xi.shape)
    print(np.sum(zi))
    # Rescale so the grid values sum to the number of points.
    scale = len(x) / np.sum(zi)
    zi *= scale

    # Panel 1: KDE as a colour mesh.
    ax[1].pcolormesh(xi, yi, zi, cmap='plasma')
    ax[1].set_xlim(-10, 10)
    ax[1].set_ylim(9, 19)

    levels = [0.1]

    # Panel 2: labelled KDE contour lines with a colour bar.
    CS = ax[2].contour(xi, yi, zi, cmap='plasma')
    ax[2].clabel(CS, fmt='%.2f', fontsize=20)
    CB = plt.colorbar(CS, shrink=0.8, extend='both')

    ax[2].set_xlim(-10, 10)
    ax[2].set_ylim(9, 19)

    plt.show()

    if source:
        source_df = pd.read_hdf(filename, key='sourcePV_df')
        sourceEnergy = np.array(source_df['energy'] * 1000)
        x_source = np.array(source_df['x'])
        print(len(x_source))
Пример #44
0
    def plot_posterior_op(trace_values, ax):
        """Draw the posterior summary of ``trace_values`` on ``ax``.

        Depending on ``kde_plot`` either a Gaussian KDE curve or a histogram
        is drawn, then the axes are restyled and annotated with the HPD
        interval, the point estimate and, when set, the reference value and
        the ROPE.

        Besides its two parameters this helper reads ``kde_plot``, ``kwargs``,
        ``point_estimate``, ``round_to``, ``alpha_level``, ``ref_val`` and
        ``rope`` from the enclosing scope, which is not part of this block.

        :param trace_values: array of posterior samples.
        :param ax: matplotlib axes to draw on.
        """
        def format_as_percent(x, round_to=0):
            # Fraction -> percentage string; an int when no decimals are kept.
            value = np.round(100 * x, round_to)
            if round_to == 0:
                value = int(value)
            return '{}%'.format(value)

        def display_ref_val(ref_val):
            # Vertical line at ref_val plus the share of samples on each side.
            less_than_ref_probability = (trace_values < ref_val).mean()
            greater_than_ref_probability = (trace_values >= ref_val).mean()
            ref_in_posterior = format_as_percent(
                less_than_ref_probability,
                1) + ' <{:g}< '.format(ref_val) + format_as_percent(
                    greater_than_ref_probability, 1)
            ax.axvline(ref_val,
                       ymin=0.02,
                       ymax=.75,
                       color='g',
                       linewidth=4,
                       alpha=0.65)
            ax.text(trace_values.mean(),
                    plot_height * 0.6,
                    ref_in_posterior,
                    size=14,
                    horizontalalignment='center')

        def display_rope(rope):
            # Thick red bar spanning the ROPE, labelled with both bounds.
            # pc_in_rope is computed but not displayed.
            pc_in_rope = format_as_percent(
                np.sum((trace_values > rope[0]) & (trace_values < rope[1])) /
                len(trace_values), round_to)
            ax.plot(rope, (plot_height * 0.02, plot_height * 0.02),
                    linewidth=20,
                    color='r',
                    alpha=0.75)
            text_props = dict(size=16, horizontalalignment='center', color='r')
            ax.text(rope[0], plot_height * 0.14, rope[0], **text_props)
            ax.text(rope[1], plot_height * 0.14, rope[1], **text_props)

        def display_point_estimate():
            # Write "<estimate>=<value>" above the chosen point estimate.
            if not point_estimate:
                return
            if point_estimate not in ('mode', 'mean', 'median'):
                raise ValueError(
                    "Point Estimate should be in ('mode','mean','median', None)"
                )
            if point_estimate == 'mean':
                point_value = trace_values.mean()
            elif point_estimate == 'mode':
                # stats.mode returns a length-1 array on older SciPy and a
                # scalar on newer releases; atleast_1d accepts both, where
                # the former `[0][0]` indexing fails on the scalar form.
                mode_result = stats.mode(trace_values.round(round_to))
                point_value = np.atleast_1d(mode_result[0])[0]
            elif point_estimate == 'median':
                point_value = np.median(trace_values)
            point_text = '{}={}'.format(point_estimate,
                                        point_value.round(round_to))

            ax.text(point_value,
                    plot_height * 0.8,
                    point_text,
                    size=16,
                    horizontalalignment='center')

        def display_hpd():
            # Black bar over the HPD interval with its bounds and coverage.
            hpd_intervals = hpd(trace_values, alpha=alpha_level)
            ax.plot(hpd_intervals, (plot_height * 0.02, plot_height * 0.02),
                    linewidth=4,
                    color='k')
            text_props = dict(size=16, horizontalalignment='center')
            ax.text(hpd_intervals[0], plot_height * 0.07,
                    hpd_intervals[0].round(round_to), **text_props)
            ax.text(hpd_intervals[1], plot_height * 0.07,
                    hpd_intervals[1].round(round_to), **text_props)
            ax.text((hpd_intervals[0] + hpd_intervals[1]) / 2,
                    plot_height * 0.2,
                    format_as_percent(1 - alpha_level) + ' HPD', **text_props)

        def format_axes():
            # Keep only the bottom spine and the x ticks.
            ax.yaxis.set_ticklabels([])
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.spines['left'].set_visible(False)
            ax.spines['bottom'].set_visible(True)
            ax.yaxis.set_ticks_position('none')
            ax.xaxis.set_ticks_position('bottom')
            ax.tick_params(axis='x',
                           direction='out',
                           width=1,
                           length=3,
                           color='0.5')
            ax.spines['bottom'].set_color('0.5')

        if kde_plot:
            density = kde.gaussian_kde(trace_values)
            l = np.min(trace_values)
            u = np.max(trace_values)
            # 100 evenly spaced points covering the range of the samples.
            x = np.linspace(0, 1, 100) * (u - l) + l
            ax.plot(x, density(x), **kwargs)
        else:
            # Histogram defaults apply only where the caller set no value.
            kwargs.setdefault('bins', 30)
            kwargs.setdefault('edgecolor', 'w')
            kwargs.setdefault('align', 'right')
            ax.hist(trace_values, **kwargs)

        # All annotations are positioned relative to the top of the y axis.
        plot_height = ax.get_ylim()[1]

        format_axes()
        display_hpd()
        display_point_estimate()
        if ref_val is not None:
            display_ref_val(ref_val)
        if rope is not None:
            display_rope(rope)
Пример #45
0
    #datplot = data_b4_filt2

    #Take log of data so it's properly weighted on a linear scale
    datx = np.log10(np.asarray(datplot[xkey]))
    daty = np.log10(np.asarray(datplot[ykey]))

    #Keep only non-NaN values
    tstx = np.isfinite(datx)
    tsty = np.isfinite(daty)
    good = np.logical_and(tstx, tsty)

    #Numpy version
    datx2np = datx[good]
    daty2np = daty[good]
    datnp = np.vstack([datx2np, daty2np])
    k = kde.gaussian_kde(datnp)

    nbins = 30.
    xi, yi = np.mgrid[datx2np.min():datx2np.max():nbins * 1j,
                      daty2np.min():daty2np.max():nbins * 1j]
    #zi = np.log10(k(np.vstack([xi.flatten(), yi.flatten()])))
    zi = k(np.vstack([xi.flatten(), yi.flatten()]))

    #Turn grid points back into linear scale
    xi2 = [10**x for x in xi]
    yi2 = [10**x for x in yi]

    #Set axes ranges
    xmin = np.min(datplot[xkey])
    xmax = np.max(datplot[xkey])
    ymin = np.min(datplot[ykey])
Пример #46
0
def histograms():
    """Save one KDE plot of log1p expression per gene, one curve per group.

    A fixed panel of genes is taken from the two module-level expression
    frames (``df_expr_ab`` and ``df_expr_bh``), log1p-transformed and
    restricted to the rows flagged "VLMC" / "FB*" in the matching metadata
    frames. For every gene kept, the values are grouped by label, a Gaussian
    KDE is drawn for each group that has any non-zero value, and the figure
    is written to ``out_dir`` as ``hist_<gene>.png``.
    """
    candidate_genes = [
        "Trpm3",
        "mt-Co1",
        "mt-Co3",
        "Nnat",
        "Ptgds",
        "Adam12",
        "Alcam",  # Up early
        "Itih5",
        "Malat1",
        "Zbtb20",
        "Spp1",
        "Col15a1",
        "Ece1",
        "Cemip",  # Up late
    ]

    # Keep the requested genes; reindex introduces all-NaN columns for genes
    # that are absent and dropna(axis=1) removes every column containing NaN.
    ab = df_expr_ab.reindex(candidate_genes, axis=1).dropna(axis=1)
    bh = df_expr_bh.reindex(candidate_genes, axis=1).dropna(axis=1)
    assert list(ab.columns) == list(bh.columns)

    # log1p transform of the raw values.
    ab = ab.transform(lambda counts: np.log(counts + 1))
    bh = bh.transform(lambda counts: np.log(counts + 1))

    # Restrict each data set to the cell populations being compared.
    ab = ab[df_meta_ab.subclass_label == "VLMC"]
    bh = bh[df_meta_bh.celltype.str.startswith("FB")]
    assert list(ab.columns) == list(bh.columns)

    # One fixed colour per group label.
    colors = {
        'FB1': 'purple',
        'FB2': 'violet',
        '374_VLMC': "C0",
        '375_VLMC': "C1",
        '376_VLMC': "C2",
    }

    for gene in sorted(ab.columns):
        with Plox() as px:
            groupings = (
                ab[gene].groupby(
                    df_meta_ab.reindex(ab.index).cell_type_alias_label),
                bh[gene].groupby(df_meta_bh.reindex(bh.index).celltype),
            )

            for grouping in groupings:
                for (label, values) in grouping:
                    # Skip groups without a single truthy (non-zero) value.
                    if not any(values):
                        continue
                    estimate = gaussian_kde(values)
                    grid = np.linspace(0, max(values) * 1.5, 100)
                    px.a.plot(grid,
                              estimate(grid),
                              label=f"{label} ({len(values)})",
                              color=colors[label])

            px.a.set_xlabel("log1p(count)")
            px.a.legend()
            px.a.set_yticks([])
            px.f.savefig(out_dir / f"hist_{gene}.png")
Пример #47
0
def plot_dist_byfrag(fragn, frags, category, fig_name):
    """
    Plot neighboring fragment distance distribution categorized by fragment number.

    Two PDF figures are written: ``<fig_name>f2f_by_fnum.pdf`` with one KDE
    curve of log10(distance) per fragment-count band, and
    ``<fig_name>f2f_all.pdf`` with a single curve over all distances.

    fragn    -- fragment counts; their min and max choose the bands
                (2-4, 5-9, 10-max, truncated to the range present).
    frags    -- per-complex lists of strings; the part after the first ':'
                is read as '<start>-<end>'.
    category -- text appended to the plot titles.
    fig_name -- prefix of the output file names.

    Returns the flat list of all distances above 3000.
    """
    maxfn = max(fragn)
    minfn = min(fragn)

    # Choose the fragment-count bands and their legend labels.
    if maxfn < 5:
        breakPoints = [range(2, maxfn + 1)]
        legendLab = ['2-max']
    else:
        breakPoints = []
        legendLab = []
        if minfn < 5:
            breakPoints.append(range(2, 5))
            legendLab.append('2-4')
        if maxfn < 10:
            breakPoints.append(range(5, maxfn + 1))
            legendLab.append('5-max')
        else:
            if minfn < 10:
                breakPoints.append(range(5, 10))
                legendLab.append('5-9')
            breakPoints.append(range(10, maxfn + 1))
            legendLab.append('10-max')

    # by_count[k] collects the distances of complexes with k - 1 kept gaps.
    by_count = [[] for _ in range(maxfn + 1)]
    fdist = []
    for frag_list in frags:
        coords = [entry.split(':')[1].split('-') for entry in frag_list]
        # Gap between the start of each fragment and the end of the previous.
        gaps = [
            int(nxt[0]) - int(cur[1])
            for cur, nxt in zip(coords[:-1], coords[1:])
        ]
        kept = [gap for gap in gaps if gap > 3000]
        if kept:
            by_count[len(kept) + 1].extend(kept)
            fdist.extend(kept)

    # Pool the distances of every fragment count that falls in each band.
    pooled = [[d for count in band for d in by_count[count]]
              for band in breakPoints]

    dist_space = linspace(3, 8, 100)
    for band_dists in pooled:
        plt.plot(dist_space,
                 gaussian_kde(np.log10(band_dists))(dist_space),
                 linewidth=4)
    plt.legend(legendLab, title="Fragment #")
    title = ("F2F distance in " + str(len(fragn)) + " complexes (" +
             category + ")")
    plt.title(title)
    plt.xlabel("Log10(Fragment-to-fragment distance)")
    plt.ylabel("Relative Density")
    plt.savefig(fig_name + 'f2f_by_fnum.pdf', dpi=300)
    plt.close()

    # Second figure: one curve over all distances together.
    dist_space = linspace(3, 8, 100)
    plt.plot(dist_space, gaussian_kde(np.log10(fdist))(dist_space))
    plt.title(title)
    plt.xlabel("Log10(Fragment-to-fragment distance)")
    plt.ylabel("Relative Density")
    plt.savefig(fig_name + 'f2f_all.pdf', dpi=300)
    plt.close()

    return fdist
Пример #48
0
def plot_network_randomization(avg_metric_sub,
                               avg_metric_dom,
                               metric_sub,
                               metric_dom,
                               ylabel,
                               xlim_hist,
                               alpha_level=0.05,
                               dom_color=None,
                               sub_color=None):
    '''Visualize network randomization test and calculate two-tailed p-value. Refer to the example notebook for network randomization.

    Parameters
    ----------
    avg_metric_sub : np.ndarray
        Contains the metric of sub obtained from network randomizations
    avg_metric_dom : np.ndarray
        Contains the metric of dom obtained from network randomizations
    metric_sub : np.ndarray
        Contains the observed metric of sub
    metric_dom : np.ndarray
        Contains the observed metric of dom
    ylabel : string
        Y axis label for the randomization plot
    xlim_hist : (float, float)
        X axis limits for the histogram.
    alpha_level : float, optional
        Significance level for test visualization in the range of [0, 1]. Defaults to 0.05
    dom_color : (float, float, float), optional
        RGB color for dom in the range of [0, 1]
    sub_color : (float, float, float), optional
        RGB color for sub in the range of [0, 1]

    Returns
    -------
    string
        A formatted p-value

    Notes
    -----
    Both randomized inputs are reshaped to ``(6, 1000, -1)``, so their total
    size must be a multiple of 6000 (presumably 6 groups x 1000
    randomizations -- TODO confirm against the example notebook).

    The figure is shown with ``plt.show()``; it is neither returned nor
    saved. The horizontal jitter of the scatter points is drawn from
    ``np.random.uniform``, so the left panel differs between calls unless the
    NumPy random state is fixed by the caller.
    '''

    # Default colours are given as 0-255 RGB triples and scaled to [0, 1].
    if dom_color is None:
        dom_color = tuple(v / 255 for v in (255, 109, 69))
    if sub_color is None:
        sub_color = tuple(v / 255 for v in (39, 170, 214))
    # Concatenating the 6 slices along axis 1 yields one row per index of the
    # 1000-long axis; each row is reduced to the mean of its finite entries.
    mean_values_sub = []
    for dist in np.concatenate(
        [trial for trial in np.array(avg_metric_sub).reshape(6, 1000, -1)],
            axis=1):
        dist = dist[np.isfinite(dist)]
        mean_values_sub.append(dist.mean())
    mean_values_dom = []
    for dist in np.concatenate(
        [trial for trial in np.array(avg_metric_dom).reshape(6, 1000, -1)],
            axis=1):
        dist = dist[np.isfinite(dist)]
        mean_values_dom.append(dist.mean())
    mean_values_dom = np.array(mean_values_dom)
    mean_values_sub = np.array(mean_values_sub)
    fig, axes = plt.subplots(1,
                             2,
                             figsize=(12, 4),
                             gridspec_kw={'width_ratios': [0.5, 0.4]})
    # Left panel: one faint segment per randomization, running from
    # (1, dom mean) to (4, sub mean).
    lc = LineCollection(np.transpose([
        np.repeat(1, mean_values_dom.size), mean_values_dom,
        np.repeat(4, mean_values_sub.size), mean_values_sub
    ]).reshape(-1, 2, 2),
                        lw=0.5,
                        alpha=0.1,
                        color=(0.2, 0.2, 0.2),
                        zorder=0,
                        capstyle='butt')
    axes[0].add_collection(lc)
    # Randomized means as jittered point clouds left of x=1 and right of x=4.
    axes[0].scatter(np.random.uniform(0.1, 0.9, mean_values_dom.size),
                    mean_values_dom,
                    s=5,
                    facecolor=(0.5, 0.5, 0.5),
                    edgecolor='k',
                    lw=0.4)
    axes[0].scatter(np.random.uniform(4.1, 4.9, mean_values_sub.size),
                    mean_values_sub,
                    s=5,
                    facecolor=(0.5, 0.5, 0.5),
                    edgecolor='k',
                    lw=0.4)
    # Observed means: dashed connecting line plus one coloured marker each.
    axes[0].plot([1, 4],
                 [metric_dom.mean(), metric_sub.mean()],
                 '--',
                 color='k',
                 solid_capstyle='butt')
    axes[0].scatter([0.5, 4.5],
                    [metric_dom.mean(), metric_sub.mean()],
                    s=20,
                    marker='o',
                    facecolor=np.array([dom_color, sub_color]),
                    edgecolor='k')
    axes[0].set_ylabel(ylabel, fontsize=14)
    axes[0].set_xticks([0.5, 4.5])
    axes[0].set_xticklabels([r'$Dom$', r'$Sub$'], fontsize=14)
    # Null distribution of the dom - sub difference, smoothed with a KDE and
    # evaluated on a grid padded by the full data range on both sides.
    differences = mean_values_dom - mean_values_sub
    pdf = gaussian_kde(differences)
    padding = (differences.max() - differences.min())
    x = np.linspace(differences.min() - padding,
                    differences.max() + padding, 1000)
    # Numerical CDF: cumulative sum of the density times the grid spacing.
    cdf = np.cumsum(pdf(x)) * np.diff(x)[0]
    # left: last grid index with cdf <= alpha/2; right: first grid index with
    # cdf >= 1 - alpha/2. Together they bound the central (1 - alpha) region.
    left = np.argwhere(cdf <= alpha_level / 2).ravel().max()
    right = np.argwhere(cdf >= 1 - alpha_level / 2).ravel().min()
    observed = metric_dom.mean() - metric_sub.mean()
    # Grid point closest to the observed difference.
    idx = np.argmin(np.abs(x - observed))
    # calculate p value ( * 2) because two-sided
    # The tail is chosen by which critical point the observation is closer to.
    # NOTE(review): the result is not clipped, so it can exceed 1 when the
    # observation lies near the centre of the null distribution.
    if np.abs(observed - x[left]) <= np.abs(observed - x[right]):
        p_value = 2 * cdf[idx + 1]
    else:
        p_value = 2 * (1 - cdf[idx])
    # Right panel: histogram of the differences, the KDE curve, the two
    # rejection tails (green) and the central region (grey).
    axes[1].hist(mean_values_dom - mean_values_sub,
                 bins=30,
                 density=True,
                 facecolor=(0, 0, 0, 0.1),
                 edgecolor=(0, 0, 0, 0.4))
    axes[1].fill_between(x[:left + 1],
                         pdf(x[:left + 1]),
                         facecolor='#7CB939',
                         alpha=0.75)
    axes[1].fill_between(x[left:right + 1],
                         pdf(x[left:right + 1]),
                         facecolor='k',
                         alpha=0.25)
    axes[1].fill_between(x[right:],
                         pdf(x[right:]),
                         facecolor='#7CB939',
                         alpha=0.75)
    # Vertical rules at the two critical values, drawn up to the density curve.
    axes[1].plot([x[left]] * 2, [0, pdf(x[left])],
                 c='k',
                 alpha=0.75,
                 lw=0.5,
                 solid_capstyle='butt')
    axes[1].plot([x[right]] * 2, [0, pdf(x[right])],
                 c='k',
                 alpha=0.75,
                 lw=0.5,
                 solid_capstyle='butt')
    axes[1].plot(x, pdf(x), alpha=1, lw=0.5, c='k', solid_capstyle='butt')
    # Dashed marker for the observed difference.
    axes[1].axvline(observed,
                    linestyle='--',
                    color='k',
                    solid_capstyle='butt',
                    ymax=0.9)
    axes[1].set_xlim(xlim_hist)
    axes[1].set_xlabel('mean difference', fontsize=14)
    axes[1].set_ylabel('density', fontsize=14)
    # Keep only the left and bottom spines on both panels.
    for ax in axes.ravel():
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
    fig.tight_layout()
    plt.show()
    return 'p-value: {}'.format(p_value)
Пример #49
0
    def _create_histogram_distribution(self,
                                       df,
                                       min_x=None,
                                       max_x=None,
                                       extend_x_proportion_percentage=20,
                                       postfix_label=None,
                                       obs_weights=None,
                                       denormalised=True):
        """Build a 10-bucket histogram of ``df`` with KDE and normal PDF fits.

        Args:
            df: Observations; must provide ``min()``, ``max()``, ``T`` and
                ``values`` (pandas-like).
            min_x: Optional lower bound for the PDF axis. When given, the
                smaller of ``min_x`` and the data minimum is multiplied by
                the extension factor; otherwise the data minimum is used.
            max_x: Optional upper bound for the PDF axis, handled like
                ``min_x`` but with the larger value.
            extend_x_proportion_percentage: Percentage used to build the
                extension factor ``1 + percentage / 100``.
            postfix_label: Optional text appended (after ``": "``) to the
                output column names.
            obs_weights: Optional observation weights (pandas-like) passed
                to the histogram, the weighted KDE and the weighted
                statistics.
            denormalised: If true, the histogram holds counts and both PDFs
                are scaled by ``bin_width * sum(hist)``; if false, the
                histogram is a density and the PDFs are left unscaled.

        Returns:
            tuple: ``(df_hist, df_pdf)`` data frames, or two empty data
            frames when ``df`` holds at most one observation.
        """

        # get min/max values for our histogram
        min_hist_x = df.min()
        max_hist_x = df.max()

        extend_factor = 1.0 + (float(extend_x_proportion_percentage) / 100.0)

        # extend axes for PDF, so just outside histogram
        # NOTE(review): multiplying by the factor only widens the range when
        # the lower bound is negative and the upper bound is positive --
        # confirm this is the intended behaviour.
        if min_x is not None:
            min_x = min(min_x, min_hist_x) * extend_factor
        else:
            min_x = min_hist_x

        if max_x is not None:
            max_x = max(max_x, max_hist_x) * extend_factor
        else:
            max_x = max_hist_x

        # Always define ``density``: previously it was only assigned when
        # ``denormalised`` was true, so ``denormalised=False`` raised
        # UnboundLocalError at the np.histogram call below.
        density = not denormalised

        vals = df.T.values.astype(np.float64)

        # Create a histogram with 10 buckets
        hist, bins = np.histogram(vals,
                                  bins=10,
                                  range=[float(min_hist_x),
                                         float(max_hist_x)],
                                  density=density,
                                  weights=obs_weights)
        bin_cent = (bins[1:] + bins[:-1]) * 0.5

        number_of_elements = len(df.values)

        # A density estimate needs more than one observation.
        if number_of_elements <= 1:
            return pd.DataFrame(), pd.DataFrame()

        dist_space = np.linspace(min_x, max_x, 100)

        if postfix_label is None:
            postfix_label = ''
        else:
            postfix_label = ": " + postfix_label

        # Create a best fit PDF using Gaussian KDE model (forcibly cast to float64)
        if obs_weights is None:
            kde = gaussian_kde(vals)
        else:
            kde = gaussian_weighted_kde(vals,
                                        weights=obs_weights.values.astype(
                                            np.float64))

        # Sometimes need to transpose so the dimensions are consistent
        try:
            pdf_fit = kde(dist_space)
        except Exception:
            pdf_fit = kde(dist_space.T)

        if obs_weights is None:
            # Calculated normal PDF
            weighted_stats = DescrStatsW(df.values, ddof=0)
        else:
            weighted_stats = DescrStatsW(df.values,
                                         weights=obs_weights.T.values,
                                         ddof=0)

        mu = weighted_stats.mean
        std = weighted_stats.std

        normal_pdf_fit = norm.pdf(dist_space, mu, std)

        # Scale pdf_fit (and normal PDF) by total/bin size
        if denormalised:
            bin_width = abs(bins[1] - bins[0])
            N = np.sum(hist)
            pdf_fit = pdf_fit * (bin_width * N)
            normal_pdf_fit = normal_pdf_fit * (bin_width * N)

        df_hist = pd.DataFrame(index=bin_cent,
                               data=hist,
                               columns=['Histogram' + postfix_label])
        df_pdf = pd.DataFrame(index=dist_space,
                              data=pdf_fit,
                              columns=['KDE-PDF' + postfix_label])
        df_pdf['Norm-PDF' + postfix_label] = normal_pdf_fit

        return df_hist, df_pdf
Пример #50
0
# Class labels and data dimensions (N rows, M columns of X).
classNames = ['Non diabetes', 'Diabetes']
N, M = X.shape
C = len(classNames)

# An alternative data set (a mixture of Gaussians with one added outlier, as
# in exercise 11.1.1) was sketched here in commented-out form and is omitted.

# Fit a Gaussian kernel density estimate to the flattened data and score
# every value by its own estimated density.
kde = gaussian_kde(X.ravel())
scores = kde(X.ravel())

# Rank from lowest to highest density; low density marks a likely outlier.
idx = scores.argsort()
scores = scores[idx]

print('The index of the lowest density object: {0}'.format(idx[0]))

# Bar plot of the 20 lowest density scores.
figure()
bar(range(20), scores[:20])
title('Outlier score')
show()

print('Ran Exercise 11.3.1')
Пример #51
0
        obs = obs.loc[filter_classes]

        nbins = 40
        x, y = obs.values, mod.values

        xi, yi = np.mgrid[x.min():x.max():nbins * 1j,
                          y.min():y.max():nbins * 1j]
        zi = np.zeros_like(xi) * np.nan
        for ibin in range(nbins):
            xmin = x.min() + ibin * (x.max() - x.min()) / nbins
            xmax = xmin + (x.max() - x.min()) / nbins
            in_bin = ((x >= xmin) & (x < xmax))
            ybin = y[in_bin]
            xbin = x[in_bin]
            if len(ybin) > 20:
                k = kde.gaussian_kde((ybin))
                zi[ibin] = k(np.vstack([yi[ibin].flatten()]))
        zi = zi / np.sum(zi, axis=1)[:, np.newaxis]
        zi_int = zi.cumsum(axis=1)
        #  label=key+", "+\
        #                    'R = '+str(round(PR[0],3))+', '+\
        #                    'RMSE = '+str(round(RMSE,5))+', '+\
        #                    'BIAS = '+str(round(BIAS,5)),s=1.,color=colors[ikey])
        axes[varkey].contour(xi,
                             yi,
                             zi_int.reshape(xi.shape),
                             levels=[0.16, 0.5, 0.84],
                             colors=['darkred', 'lightgreen', 'darkred'],
                             linewidths=[1, 2, 1])
        axes[varkey].contourf(
            xi,
Пример #52
0
def _degree_axis_limit(cost):
    """Return the hard-wired upper bound of the degree axis for ``cost``."""
    if cost > 15:
        return 180
    if cost == 10:
        return 100
    if cost == 2:
        return 50
    raise ValueError('No degree range is defined for cost={!r}'.format(cost))


def degree_distn(G_list,
                 cost,
                 group_list,
                 title,
                 figure_name,
                 measure,
                 option='hist'):
    '''
    Plot the degree distribution of each graph and save it to a file.

    This can be used to plot either a histogram or a KDE function
    by changing the option from either 'hist' or 'kde'

    Parameters
    ----------
    G_list : list
        Graphs providing ``edges(data=True)`` and ``degree(weight=...)``.
        When ``cost < 100`` every edge weight is overwritten with 1 in place.
    cost : number
        Selects the hard-wired axis ranges: values above 15, 10 and 2 are
        supported for plotting; 100, 20, 10 and 2 also set axis limits.
    group_list : list
        One group name per graph; used as legend label and as key into the
        module-level ``color_dict``.
    title : object
        Accepted for interface compatibility; not used by this function.
    figure_name : str
        Path the figure is saved to.
    measure : str
        Upper-cased and used as the legend title when more than one graph
        is given.
    option : str, optional
        'hist' for a histogram, 'kde' for a kernel density estimate. Any
        other value produces an empty axes.

    Raises
    ------
    ValueError
        If a plot is requested for a cost without a hard-wired range
        (previously this surfaced as a NameError on ``x``).
    '''
    # Create the figure
    fig, ax = plt.subplots(figsize=(6, 4))

    degrees_list = []

    for G in G_list:
        # Degree only has meaning if you don't have a full graph!
        # So while we'll *call* those values "degrees" it actually
        # represents strength...but only for the cost=100 graph
        if cost < 100:
            # Binarize the graph
            for _, _, d in G.edges(data=True):
                d['weight'] = 1

        # Get the degrees of the graph. dict() accepts both the plain dict
        # returned by NetworkX 1.x and the DegreeView of NetworkX 2+; the
        # list is needed because gaussian_kde cannot consume a dict view.
        degrees = list(dict(G.degree(weight='weight')).values())
        degrees_list.append(degrees)

    if option == 'hist':
        # The ranges are different for the different costs
        # They're hardwired here
        step = 5 if cost == 2 else 10
        x = np.arange(0, _degree_axis_limit(cost), step)

        color_list = [color_dict[group] for group in group_list]

        # Plot the histogram (``density`` replaces the removed ``normed``)
        ax.hist(degrees_list,
                bins=x,
                color=color_list,
                density=True,
                label=group_list)

    elif option == 'kde':
        # The evaluation grid depends only on the cost, so build it once.
        x = np.arange(0, _degree_axis_limit(cost), 1)

        for degrees, group in zip(degrees_list, group_list):
            # Calculate and plot the kde function
            pdf = gaussian_kde(degrees)
            ax.plot(x, pdf(x), color=color_dict[group], label=group)

    # Set the appropriate x and y limits
    axis_limits = {
        100: ((0, 180), (0, 0.02)),
        20: ((0, 180), (0, 0.015)),
        10: ((0, 100), (0, 0.025)),
        2: ((0, 50), (0, 0.08)),
    }
    if cost in axis_limits:
        xlim, ylim = axis_limits[cost]
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)

    if len(G_list) > 1:
        ax.legend(loc='upper left', framealpha=0.0, title=measure.upper())

    fig.savefig(figure_name, bbox_inches=0, dpi=300)
    plt.close(fig)
Пример #53
0
def residual(pred, obs, x):
    """
    This function analyzes the residual between predicted values and observed values. Given the predicted and
    observed values, this function does the following:

    #. Smooth both samples with a Gaussian kernel density estimate and integrate the densities over ``x``
       (cumulative trapezoidal rule) to obtain the cumulative distribution functions (CDF)
    #. Compute the residual in the CDF between observed and predicted data

        .. math::
            r(x) = cdf_{observed}(x) - cdf_{predicted}(x)

    #. Invert both CDFs by linear interpolation and compute the residual of the inverted CDFs on an evenly
       spaced probability grid, with the sign flipped:

        .. math::
            r^{-1}(p) = -(cdf^{-1}_{observed}(p) - cdf^{-1}_{predicted}(p))

    The scaled CDF residual is divided by the standard deviation of the observed CDF values; the scaled
    inverted residual is divided by the standard deviation of ``obs``.

    :param numpy.ndarray pred: the predicted (ABMHAP) values used to make the CDF
    :param numpy.ndarray obs: the observed (CHAD) values used to make the CDF
    :param numpy.ndarray x: the x-values at which the densities are evaluated and integrated

    :return: the data for the cumulative distribution data (predicted, observed, residual, and scaled residual),
        the data for the inverted cumulative distribution data (predicted, observed, residual, and scaled residual)
    :rtype: pandas.core.frame.DataFrame, pandas.core.frame.DataFrame
    """

    #
    # CDF
    #

    # smooth probability density functions
    f_obs = kde.gaussian_kde(obs)
    f_pred = kde.gaussian_kde(pred)

    # the density vectors
    d_obs = f_obs(x)
    d_pred = f_pred(x)

    # ``cumtrapz`` was renamed ``cumulative_trapezoid`` and later removed
    # from SciPy; prefer the new name and fall back for old installations.
    cumulative = getattr(integrate, 'cumulative_trapezoid', None)
    if cumulative is None:
        cumulative = integrate.cumtrapz

    # the cumulative distribution functions
    cdf_obs = cumulative(y=d_obs, x=x, initial=0)
    cdf_pred = cumulative(y=d_pred, x=x, initial=0)

    # the residual in the CDFs
    res = cdf_obs - cdf_pred
    res_scaled = res / np.std(cdf_obs)

    #
    # the inverted CDF
    #

    # create functions that represent the inverted cdf
    f_inv_obs = interpolate.interp1d(x=cdf_obs, y=x)
    f_inv_pred = interpolate.interp1d(x=cdf_pred, y=x)

    # the probability grid; capped at the smaller of the two CDF maxima so
    # that both interpolators are evaluated inside their domain
    p_max = min(cdf_obs.max(), cdf_pred.max())
    p = np.linspace(0, p_max, num=len(x))

    # the inverted of the CDF
    cdf_inv_obs = f_inv_obs(p)
    cdf_inv_pred = f_inv_pred(p)

    res_inv = (cdf_inv_obs - cdf_inv_pred) * (-1)
    res_inv_scaled = res_inv / np.std(obs)

    #
    # Output
    #

    # the cumulative distribution data
    cdf = pd.DataFrame({
        'pred': cdf_pred,
        'obs': cdf_obs,
        'res': res,
        'res_scale': res_scaled
    })

    # the inverted cumulative distribution data
    inv_cdf = pd.DataFrame({
        'pred': cdf_inv_pred,
        'obs': cdf_inv_obs,
        'res': res_inv,
        'res_scale': res_inv_scaled
    })

    return cdf, inv_cdf
Пример #54
0
    nbins = 20

    fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(12, 5))

    #axes[0].set_title('Trajectory')
    axes[0].set(title='Trajectory',
                xlim=(0, args.frameWidth),
                xticks=list(range(0, args.frameWidth, 60)),
                ylim=(0, args.frameHeight),
                yticks=list(range(0, args.frameHeight, 60)))

    axes[0].plot(x, y, 'k')

    # Evaluate a gaussian the kernel density estimation on a regular grid of nbins x nbins
    k = kde.gaussian_kde(points.T)
    xi, yi = np.mgrid[x.min():x.max():nbins * 1j, y.min():y.max():nbins * 1j]
    zi = k(np.vstack([xi.flatten(), yi.flatten()]))

    # Plot density with shading
    #axes[1].set_title('Heatmap')
    axes[1].set(title='Heatmap',
                xticks=list(range(0, args.frameWidth, 60)),
                yticks=list(range(0, args.frameHeight, 60)))

    pc = axes[1].pcolormesh(xi,
                            yi,
                            zi.reshape(xi.shape),
                            shading='gouraud',
                            cmap=plt.cm.jet)
    if source.data.where.startswith('gal'):
        # ``'pwn' or 'unid' in classes`` always evaluated to the truthy
        # string 'pwn', so every galactic source was accepted; test each
        # class for membership explicitly.
        source_classes = source.data.classes
        if 'pwn' in source_classes or 'unid' in source_classes:
            try:
                spatial_model = source.spatial_model()
                pwn_lon = spatial_model.lon_0.value
                pwn_lat = spatial_model.lat_0.value
            except Exception:
                # Best effort: skip sources whose position cannot be read.
                pass
            else:
                # Append only once both coordinates are known so the two
                # lists always stay the same length.
                gammacat_pwn_glon.append(pwn_lon)
                gammacat_pwn_glat.append(pwn_lat)
def _wrap_longitude(lon):
    """Shift longitudes above 180 down by 360, keeping the element order."""
    lon = np.asarray(lon)
    return np.where(lon > 180, lon - 360, lon)


# The previous wrapping concatenated the ``> 180`` and ``< 180`` subsets,
# which reordered the longitudes relative to the latitudes (mismatching the
# lon/lat pairs fed to the KDE) and silently dropped values equal to 180.
gammacat_pwn_glat = np.array(gammacat_pwn_glat)
gammacat_pwn_glon = _wrap_longitude(gammacat_pwn_glon)

# 2-D Gaussian KDE of the gamma-cat positions, evaluated on an
# nbins x nbins grid spanning the data and normalised to a peak of 1.
k = kde.gaussian_kde(np.array([gammacat_pwn_glon, gammacat_pwn_glat]))
nbins = 200
xi, yi = np.mgrid[gammacat_pwn_glon.min():gammacat_pwn_glon.max():nbins * 1j,
                  gammacat_pwn_glat.min():gammacat_pwn_glat.max():nbins * 1j]
zi = k(np.vstack([xi.flatten(), yi.flatten()]))
zi /= zi.max()

# Same procedure for the positions in ``final`` (this grid is not
# normalised here).
glat = final.GLAT
glon = _wrap_longitude(final.GLON)

k1 = kde.gaussian_kde(np.array([glon, glat]))
nbins = 200
xi1, yi1 = np.mgrid[glon.min():glon.max():nbins * 1j,
                    glat.min():glat.max():nbins * 1j]
zi1 = k1(np.vstack([xi1.flatten(), yi1.flatten()]))
Пример #56
0
    plt.ylabel(ylabes[ip], fontsize=12)
    plt.xlabel('Years', fontsize=12)

    x0 = np.linspace(0.5, 6, 40)
    y0 = np.exp(uMAP + betaMAP + beta1MAP[ip] * xl + beta2MAP[ip] * elec_Pca_char1[ip * 42:(ip * 42 + 40)] + \
                beta3MAP[ip] * elec_Pca_char2[ip * 42:(ip * 42 + 40)] + + beta4MAP[ip] * xl * xl)

    # Posterior sample from the trace
    #     for ips in np.random.randint(burnin, 3000, ppcsamples):
    #         param = trace[ips]
    #         yl2 = np.exp(param['beta'][ip] + param['beta1'][ip] * (xl) + param['beta2'][ip]*elec_Pca_char1[ip*42:(ip*42+40)] + \
    #                      param['beta3'][ip]*elec_Pca_char2[ip*42:(ip*42+40)])
    #         ax0.plot(xl, yl2, 'k', linewidth=2, alpha=.05)

    ax1 = plt.subplot(gs[1 + ip * 3])
    my_pdf1 = gaussian_kde(kde_beta2[:, ip])
    x1 = np.linspace(-8, 8, 300)
    ax1.plot(x1, my_pdf1(x1), 'k', lw=2.5, alpha=0.6)
    plt.xlim((-8, 8))
    plt.xlabel(r'$\beta2$', fontsize=15)
    plt.ylabel('Posterior Density', fontsize=12)

    ax2 = plt.subplot(gs[2 + ip * 3])
    my_pdf2 = gaussian_kde(kde_beta3[:, ip])
    x2 = np.linspace(-6, 6, 300)
    ax2.plot(x2, my_pdf2(x2), 'k', lw=2.5, alpha=0.6)
    plt.xlim((-6, 6))
    plt.xlabel(r'$\beta3$', fontsize=15)
    plt.ylabel('Posterior Density', fontsize=12)
    plt.title('Subject %s' % (ip + 1))
Пример #57
0
def get_normalized_principle_moment_ratios():
    """Plot a hexbin map of the normalized principal moment ratios.

    Loads every ``*.mol`` file matching ``../../data/coformer1/*.mol`` (in
    sorted path order), computes the ``NPR1`` and ``NPR2`` descriptors from
    ``rdkit.Chem.Descriptors3D`` for each molecule, draws them as a hexbin
    plot framed by the NPR triangle and saves the figure to
    ``../../data/figures/npr_mol1.png``. Processing stops once 10,000
    molecules have been described successfully.

    Side effects: prints progress information and modifies the global
    matplotlib ``rcParams`` (axes face colour, spines and edge colour).

    Returns:
        tuple: ``(npr1, npr2, fails)`` -- two equally long lists of
        descriptor values and the number of molecules for which the
        descriptors could not be computed.
    """
    molecules = [
        Chem.MolFromMolFile(mol)
        for mol in sorted(glob.glob("../../data/coformer1/*.mol"))
    ]

    print(len(molecules))

    # create a list of all the NPRs
    npr1 = []
    npr2 = []
    fails = 0
    n_mols = 0
    for mol in molecules:
        try:
            ratio1 = rdkit.Chem.Descriptors3D.NPR1(mol)
            ratio2 = rdkit.Chem.Descriptors3D.NPR2(mol)
        except Exception:
            # Best effort: count and report the molecule, then move on.
            fails += 1
            print(mol)
        else:
            # Append only when both descriptors succeeded so the two lists
            # stay paired (previously NPR1 could be stored without NPR2).
            npr1.append(ratio1)
            npr2.append(ratio2)
            n_mols += 1
        if n_mols == 10000:
            print("-- Truncating at 10K")
            break

    print(len(npr1))
    print(len(npr2))

    # NOTE: a Gaussian KDE over (npr1, npr2) used to be computed here, but
    # its result was never used by the plot and it could raise on degenerate
    # input before anything was drawn, so it has been removed.

    # plot the NRP on a 2D map
    fig = plt.figure(figsize=(10, 8))
    fig.patch.set_facecolor('white')
    plt.rcParams['axes.facecolor'] = 'white'
    plt.grid(False)

    plt.rcParams['axes.spines.top'] = True
    plt.rcParams['axes.spines.bottom'] = True
    plt.rcParams['axes.spines.left'] = True
    plt.rcParams['axes.spines.right'] = True

    plt.hexbin(npr1, npr2)
    cbar = plt.colorbar()
    cbar.ax.tick_params(labelsize=15)
    # The label text had been garbled to 'H**O-LUMO'; restored to 'HOMO'.
    # NOTE(review): hexbin is called without ``C``, so the colour scale shows
    # point counts -- confirm this label is still the intended one.
    cbar.set_label('HOMO-LUMO degeneracy', fontsize=16)

    plt.fill([0, 0, 0.5], [0.5, 1, 0.5], "white",
             zorder=2)  # `white out' the bottom left corner of the plot
    plt.fill([1, 1, 0.5], [0.5, 1, 0.5], "white",
             zorder=3)  # `white out' the bottom right corner of the plot
    # Outline of the NPR triangle.
    plt.plot([0, 0.5], [1, 0.5],
             color="lightsteelblue",
             linewidth=3.5,
             zorder=4)
    plt.plot([0.5, 1], [0.5, 1],
             color="lightsteelblue",
             linewidth=3.5,
             zorder=5)
    plt.plot([0, 1], [1, 1], color="lightsteelblue", linewidth=3.5, zorder=0)
    plt.ylabel("NPR2", fontsize=16)
    plt.xlabel("NPR1", fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    matplotlib.rc('axes', edgecolor='black')
    plt.ylim(0.4, 1.05)
    plt.xlim(-0.05, 1.05)
    plt.savefig("../../data/figures/npr_mol1.png",
                dpi=600,
                bbox_inches='tight')

    # return the values
    return npr1, npr2, fails
if contour:
    yeardays = raw.groupby(raw.index.dayofyear).mean()
    weights = aggregation.clusterPeriodNoOccur
    yeardays["cluster_str"] = [
        f"Cluster {i+1} ({weights[i]} days)" for i in yeardays["cluster"].values
    ]
    nbins = 100

    fig = go.Figure()
    c = sns.color_palette("cubehelix", n_colors=len(typPeriods_m.loc[:, 1, :]))

    for i, (cluster, df) in enumerate(yeardays.groupby("cluster_str")):

        x = df.PV.values
        y = df.Wind.values
        k = kde.gaussian_kde([x, y])
        xi, yi = np.mgrid[x.min() : x.max() : nbins * 1j, y.min() : y.max() : nbins * 1j]
        zi = k(np.vstack([xi.flatten(), yi.flatten()]))
        xc = np.linspace(x.min(), x.max(), nbins)
        yc = np.linspace(y.min(), y.max(), nbins)

        colorscale = [
            [0, "rgba(0,0,0,0)"],
            [0.5, f"rgba{c[i][0], c[i][1], c[i][2], 0}"],
            [1, colors[i]],
        ]
        fig.add_trace(
            go.Contour(
                x=xc,
                y=yc,
                z=zi.reshape(xi.shape),
Пример #59
0
    new_data = {k: len(v) / numberOfSearchQueries for k, v in new.items()}
    import matplotlib.pylab as plt
    plt.figure(figsize=(15, 5))
    plt.bar(new_data.keys(), new_data.values(), width=.5, color='g')
    plt.xlabel("Path Length")
    plt.ylabel("Probability")
    plt.title("Path Length vs Probability for N=" + str(i))
    plt.savefig("pathLengthProbability_" + str(i) + ".eps", format="eps")

# For every network size, plot the normal PDF fitted to the recorded path
# lengths over mean +/- 3 standard deviations and save it as an EPS file.
for i in numnodes:
    fig1 = plt.figure()
    out = list(totalData[i][0].values())
    kde = gaussian_kde(out)

    # Moments of the path-length sample.
    mu = np.mean(out)
    variance = np.var(out)
    sigma = math.sqrt(variance)

    dist_space = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)
    normal_pdf = stats.norm.pdf(dist_space, mu, sigma)
    plt.plot(dist_space, normal_pdf)

    plt.xlabel("Path Length")
    plt.ylabel("PDF")
    plt.title("Path Length PDF")
    plt.savefig("pathLengthpdf_" + str(i) + ".eps", format="eps")

print("--------------plots after deletion---------------")
mapAverageHopsMean = {}
mapAverageHopsStd = {}
mapAverageHops = {}
for key, value in totalDataAfterDeletion.items():
Пример #60
0
    def run(self, samples=None, progress=True):
        """
        Perform MCMC calibration. Returns pdfs and MCMC traces.

        Args:
          samples(integer or tuple): A tuple containing the
            number of samples, the number to burn, and the number to thin.
            If samples is an integer, burn will be 20% of the samples and
            thin will be 8.  If None, the values stored on the instance
            (``num_samples``, ``num_burn``, ``num_thin``) are used.

          progress(boolean): If True, will display a progress bar.

        Returns(tuple):
          Returns a tuple ``(cvars, k)``.
          cvars is modified to include key 'trace'
          which will be a flattened array, 'ntraces' (the number of trace
          columns) and 'pdf', a gaussian_kde of the trace.  Variables
          listed in ``self.means`` also get 'mtrace' and 'dtrace' (mean
          and deviation traces), and variables of type 'S' get 'jpdf',
          the joint KDE of the two.  ``k`` is the joint KDE over the
          traces of all variables in sorted key order.  Any KDE that
          cannot be constructed is stored/returned as None.

        Raises:
          ValueError: If ``samples`` is a tuple whose length is not 3.

        """
        if samples is None:
            num_samples = self.num_samples
            num_burn = self.num_burn
            num_thin = self.num_thin
        elif isinstance(samples, tuple):
            if len(samples) != 3:
                raise ValueError(
                    "Error: samples should be a number or tuple of length 3."
                )
            num_samples, num_burn, num_thin = samples
        else:
            num_samples = samples
            num_burn = int(samples * 0.20)
            num_thin = 8

        # The sampler is stored on the class, not on the instance.
        Calibrate.mcmc = pymc.MCMC(self.mcmc_model)
        Calibrate.mcmc.sample(iter=num_samples,
                              burn=num_burn,
                              thin=num_thin,
                              tune_interval=10000,
                              tune_throughout=True,
                              progress_bar=progress)

        # NOTE(review): the attribute was assigned just above, so this guard
        # looks unreachable; kept to preserve behaviour.
        if Calibrate.mcmc is None:
            return None

        for v in self.cvars:
            t = self.var[v].trace[:]
            # A 2-D trace has one column per component of the variable.
            if len(t.shape) == 2:
                self.cvars[v]['ntraces'] = t.shape[1]
            else:
                self.cvars[v]['ntraces'] = 1
            self.cvars[v]['trace'] = t.ravel()

        for v in self.means:
            self.cvars[v]['mtrace'] = self.means[v].trace[:]
            self.cvars[v]['dtrace'] = self.devs[v].trace[:]

        # collect all the independent variables and compute KDE
        col_count = max(self.cvars[v]['ntraces'] for v in self.cvars)
        for cv in self.cvars:
            if self.cvars[cv]['type'] == 'S':
                data = np.column_stack(
                    (self.cvars[cv]['dtrace'], self.cvars[cv]['mtrace']))
                # Best effort: a KDE that cannot be built is recorded as
                # None. ``Exception`` (not a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                try:
                    self.cvars[cv]['jpdf'] = gaussian_kde(data.T)
                except Exception:
                    self.cvars[cv]['jpdf'] = None
            # multidimensional traces get flattened and others
            # get repeated to match size.
            if self.cvars[cv]['ntraces'] == col_count:
                n = 1
            else:
                n = col_count
            try:
                self.cvars[cv]['pdf'] = gaussian_kde(
                    self.cvars[cv]['trace'].ravel())
            except Exception:
                self.cvars[cv]['pdf'] = None
            self.cvars[cv]['trace'] = self.cvars[cv]['trace'].ravel().repeat(n)

        # Joint KDE over all variables, one column per variable in sorted
        # key order.
        data = np.column_stack(
            [self.cvars[v]['trace'] for v in sorted(self.cvars.keys())])
        try:
            k = gaussian_kde(data.T)
        except Exception:
            k = None

        return (self.cvars, k)