Example #1
def Bdist(*args):
    '''binary distance matrix:
    dist(X)    -  return matrix of size (len(X),len(X)) all True!
    dist(X1,X2)-  return matrix of size (len(X1),len(X2)) with (xi==xj)'''
    # assumes a MATLAB-style repmat, e.g. `import numpy.matlib as N`

    if (len(args) == 1):
        # single argument: every element is compared with itself -> all True
        X = args[0]
        return N.ones((len(X), len(X)), dtype=bool)
    else:
        X = args[0]
        Y = args[1]

    A = N.repmat(X, 1, len(Y))
    B = N.repmat(Y.T, len(X), 1)

    rv = (A & B)

    return rv
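NumPy has no top-level repmat; the call above only works with something like import numpy.matlib as N. A minimal sketch of the same tiling with plain np.tile, using made-up boolean column vectors:

import numpy as np

X = np.array([[True], [False], [True]])   # column vector, len(X) == 3
Y = np.array([[True], [True]])            # column vector, len(Y) == 2

A = np.tile(X, (1, len(Y)))               # 3 x 2: X repeated across the columns
B = np.tile(Y.T, (len(X), 1))             # 3 x 2: Y.T repeated down the rows
print(A & B)                              # the elementwise "binary distance" matrix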
Example #2
def SIMPLEmultinomLnL(c, alpha, presTime, recTime, J, k):
    '''
    ARGS
        c: a parameter of SIMPLE
        alpha: a parameter of SIMPLE
        presTime: time separating onset of words during encoding
        recTime: time of separation during retrieval
        J: length of the list (ie, how many words?)
        k: a matrix where each row i corresponds to an output position,
            and element j in the row gives number of times item j was
            recalled at position i
    RETURNS
        lnL: log-likelihood contribution at each output position, given the recall counts in k
    '''

    lnL = np.zeros(J)
    # onset times at encoding and retrieval (MATLAB repmat -> np.full)
    Ti = np.cumsum(np.full(J, presTime))
    Tr = Ti[-1] + np.cumsum(np.full(J, recTime))

    for i in range(J):  # i indexes output + probe position
        M = np.log(Tr[i] - Ti)
        eta = np.exp(-c * np.abs(M[i] - M)**alpha)
        pall = eta / np.sum(eta)
        lnL[i] = np.sum(k[i, :] * np.log(pall))

    return lnL
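A quick smoke test of SIMPLEmultinomLnL as written above; the parameter values and the recall-count matrix k are made up purely for illustration:

import numpy as np

J = 5
k = np.eye(J) * 10 + 1   # fake counts: each item mostly recalled at its own position
lnL = SIMPLEmultinomLnL(c=10.0, alpha=1.0, presTime=1.0, recTime=1.5, J=J, k=k)
print(lnL)               # one log-likelihood term per output position
print(lnL.sum())         # total log-likelihood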
Example #4
def MeshGrid(xs, ys):
    '''imitate matlab's meshgrid to generate a 2d mesh
    '''

    xs = arr(xs)
    ys = arr(ys)
    X = repmat(xs, (ys.size, 1))
    Y = repmat(col(ys), (1, xs.size))

    return (X, Y)
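For reference, np.meshgrid with its default 'xy' indexing produces the same arrays as the tile-based construction above; a small check with made-up coordinates:

import numpy as np

xs = np.array([0.0, 1.0, 2.0])
ys = np.array([10.0, 20.0])

X = np.tile(xs, (ys.size, 1))                  # rows of xs
Y = np.tile(ys.reshape(-1, 1), (1, xs.size))   # columns of ys
Xn, Yn = np.meshgrid(xs, ys)                   # NumPy's built-in equivalent
assert np.array_equal(X, Xn) and np.array_equal(Y, Yn)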
def initializeBPmessage(BPalg, nodePot):
    G = BPalg['G']

    if (BPalg['options']['eqnStates']):
        #% when all nodes have same number of states 'maxState'
        #% both approaches are equivalent, but for speed purpose we should use this
        #% uniform distribution
        unif_msg = np.ones((G['maxState'], 2 * G['nEdges'])) / G['maxState']
        BPalg['init_message'] = copy.copy(
            unif_msg)  # use b = copy.copy(a) instead of 'a = b'

        if BPalg['options']['verbose']:
            print('Initialized messages from all nodes to their respective neighbors.')
    else:
        #%if variable number of states or would like to initiate different
        #%messages for each node
        BPalg['init_message'] = np.zeros((G['maxState'], 2 * G['nEdges']))
        nis = G['Edges'][:, 0]
        njs = G['Edges'][:, 1]
        eij = np.arange(G['nEdges'])
        eji = eij + G['nEdges']

        #% repmat creates variable size column vectors (1:nState(i)) and then rest of the column (nState(i):maxState) is zeros;
        # Mij = arrayfun(@(s) [repmat(1/s,s,1) ; zeros(G.maxState-s,1)],G.nStates(njs),'UniformOutput',false)
        # Mji = arrayfun(@(s) [repmat(1/s,s,1) ; zeros(G.maxState-s,1)],G.nStates(nis),'UniformOutput',false)
        def unif_column(s):
            # uniform over the first s states, zero-padded up to maxState
            return np.concatenate((np.full(s, 1.0 / s),
                                   np.zeros(G['maxState'] - s)))

        Mij = np.column_stack([unif_column(s) for s in G['nState'][njs]])
        Mji = np.column_stack([unif_column(s) for s in G['nState'][nis]])

        #BPalg.init_message(:,eij) =  cell2mat(Mij) #% from node i to j
        #BPalg.init_message(:,eji) =  cell2mat(Mji) #% from node j to i
        BPalg['init_message'][:, eij] = Mij
        BPalg['init_message'][:, eji] = Mji

        if BPalg['options']['verbose']:
            print('Initialized messages from all nodes to their respective neighbors.')

    BPalg['new_message'] = copy.copy(
        BPalg['init_message'])  # use b = copy.copy(a) instead of 'a = b'
    BPalg['old_message'] = copy.copy(
        BPalg['init_message'])  # use b = copy.copy(a) instead of 'a = b'

    #% When intialized this is set to zero for the iteration to run more than
    #% once.
    BPalg['convergence'] = 0

    return BPalg
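A standalone sketch of the variable-state initialization used in the else branch above; the state counts here are made up:

import numpy as np

maxState = 4
nState = np.array([2, 4, 3])   # hypothetical number of states per receiving node

def unif_column(s, maxState):
    # uniform over the first s entries, zero-padded up to maxState
    return np.concatenate((np.full(s, 1.0 / s), np.zeros(maxState - s)))

init_message = np.column_stack([unif_column(s, maxState) for s in nState])
print(init_message)   # shape (maxState, len(nState)), each column sums to 1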
Example #6
    def logprob(self, mc, S):
        # mc: array of Markov-chain objects; S: observed state sequence (0-based)
        S = np.asarray(S)
        if S.size == 0:
            return []

        if np.any(S < 0) or np.any(S != np.round(S)):
            return np.full(mc.shape, -np.inf)

        lP = np.zeros(mc.shape)
        fromS = S[:-1]
        toS = S[1:]
        for i in range(len(mc)):
            if S[0] >= len(mc[i].initialProb):
                lP[i] = -np.inf
            else:
                lP[i] = np.log(mc[i].initialProb[S[0]])

            if fromS.size > 0:
                if np.max(fromS) >= mc[i].getNStates() \
                        or S[-1] >= mc[i].transitionProb.shape[1]:
                    lP[i] = -np.inf
                else:
                    # MATLAB sub2ind + linear indexing -> direct pairwise indexing
                    lP[i] = lP[i] + np.sum(
                        np.log(mc[i].transitionProb[fromS, toS]))

        return lP
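The transition term above, written out for a single plain Markov chain; the initial distribution, transition matrix and state sequence are made up for illustration:

import numpy as np

initialProb = np.array([0.5, 0.5])
transitionProb = np.array([[0.9, 0.1],
                           [0.2, 0.8]])
S = np.array([0, 0, 1, 1])   # 0-based state sequence

fromS, toS = S[:-1], S[1:]
lP = np.log(initialProb[S[0]]) + np.sum(np.log(transitionProb[fromS, toS]))
print(lP)   # log P(S) under this chain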
Example #7
    def __init__(self, pMass):
        self.pD = dict()
        if isinstance(pMass, DiscreteD):
            self.pD['probMass'] = pMass
        else:
            if pMass.shape[0] == 1 or pMass.shape[1] == 1:
                self.pD['probMass'] = pMass
            else:
                # MATLAB repmat of the object array becomes one dict per row
                self.pD = [dict() for _ in range(pMass.shape[0])]
                for i in range(pMass.shape[0]):
                    self.pD[i]['probMass'] = pMass[i, :].T
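The multi-row branch above keeps one probability-mass entry per row of pMass; a tiny sketch with a made-up 2 x 2 matrix:

import numpy as np

pMass = np.array([[0.2, 0.8],
                  [0.5, 0.5]])
pD = [{'probMass': pMass[i, :].T} for i in range(pMass.shape[0])]
print(pD[0]['probMass'], pD[1]['probMass'])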
Example #8
def compLSsuffstats_fourier(x,y,dims,minlens,nxcirc = None,condthresh = 1e8):
	# Compute least-squares regression sufficient statistics in DFT basis
	# Python version of this NOT complete 9/15/17!
	#
	# [dd,wwnrm,Bfft] = compLSsuffstats_fourier(x,y,dims,minlens,nxcirc,condthresh)
	#
	# INPUT:
	# -----
	#           x [n x p] - stimulus, where each row vector is the spatial stim at a single time
	#        dims [m x 1] - number of coefficients along each stimulus dimension
	#     minlens [m x 1] - minimum length scale for each dimension (can be scalar)
	#      nxcirc [m x 1] - circular boundary in each stimulus dimension (minimum is dims) OPTIONAL
	#  condthresh [1 x 1] - condition number for thresholding for small eigenvalues OPTIONAL
	#
	# OUTPUT:
	# ------
	#     dd (struct) - carries sufficient statistics for linear regresion
	#  wwnrm [nf x 1] - squared "effective frequencies" in vector form for each dim
	#   Bfft  {1 x p} - cell array with DFT bases for each dimension

	# 1e8 is default value (condition number on prior covariance)


	dims = np.array(np.reshape(dims,(1,-1)))
	minlens = np.array(np.reshape(minlens,(1,-1)))

	# Set circular boundary (for n-point fft) to avoid edge effects, if needed
	if nxcirc is None:
	    #nxcirc = np.ceil(max([dims(:)'+minlens(:)'*4; dims(:)'*1.25]))'
	    nxcirc = np.ceil(np.max(np.concatenate((dims+minlens*4 ,dims*1.25), axis = 0), axis = 0))


	nd = np.size(dims) # number of filter dimensions
	if np.size(minlens) == 1: #% make vector out of minlens, if necessary
	    minlens = np.tile(minlens, (1, nd)) # repmat equivalent; keeps minlens[0][jj] valid below


	# Determine number of freqs and make Fourier basis for each dimension
	cdiagvecs = [None for x in range(nd)] # eigenvalues for each dimension
	Bfft = [None for x in range(nd)] # Fourier basis matrix for each filter dimension
	wvecs = [None for x in range(nd)] # Fourier frequencies for each filter dimension
	ncoeff = np.zeros([nd,1])


	#fprintf('\ncompLSsuffstats_fourier:\n # filter freqs per stimulus dim:');
	# Loop through dimensions
	for jj in np.arange(nd):
	    # careful here, the mkcov_ASDfactored function uses minlens and 1 as the lensc and rho params
	    prs = [[minlens[0][jj], 1], dims[0][jj]]
	    cdiagvecs[jj], Bfft[jj], wvecs[jj] = mkcovs.mkcov_ASDfactored(prs, nxcirc[jj], condthresh, compfftbasis=1)
	    ncoeff[jj] = len(cdiagvecs[jj]) # number of coeffs
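The circular-boundary default computed near the top of compLSsuffstats_fourier, evaluated in isolation with made-up dims and minlens:

import numpy as np

dims = np.array([[32, 16]], dtype=float)
minlens = np.array([[2.0, 2.0]])
nxcirc = np.ceil(np.max(np.concatenate((dims + minlens * 4, dims * 1.25), axis=0), axis=0))
print(nxcirc)   # circular boundary per dimension: 40 and 24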
Example #9
def rquad(N, k):
    # Gauss-Jacobi recurrence for weight x^k on [0, 1] (Golub-Welsch);
    # assumes numpy names (array, sqrt, diag, eig) are imported and that
    # repmat takes a tile-shape tuple, like np.tile
    k1 = k + 1
    k2 = k + 2
    n = array(range(1, N + 1))
    nnk = 2 * n + k
    A = repmat(1.0 * k**2, (1, N)) / (nnk * (nnk + 2))
    A = np.append(array([float(k) / k2]), A)
    n = array(range(1, N))
    nnk = nnk[n]
    n = n + 1
    B1 = 4 * float(k1) / (k2 * k2 * (k + 3))
    nk = n + k
    nnk2 = nnk * nnk
    B = 4.0 * (n * nk)**2 / (nnk2 * nnk2 - nnk2)
    ab = np.append(array([2**float(k1) / k1, B1]), B)
    s = sqrt(ab[1:N])
    x, v = eig(diag(A[0:N]) + diag(s, -1) + diag(s, +1))
    I, x = zip(*enumerate(x))
    x = array(x)
    x = (x + 1) / 2
    w = ((1.0 / 2)**k1) * ab[0] * v[0, I]**2
    xw = sorted(zip(x, w))
    return [array(col) for col in zip(*xw)]
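For k = 0 the recurrence above reduces to Gauss-Legendre quadrature mapped to [0, 1], so rquad(N, 0) should agree with NumPy's leggauss after a change of interval; a quick reference check (without calling rquad itself, since its helpers are imported elsewhere):

import numpy as np

xg, wg = np.polynomial.legendre.leggauss(4)   # nodes and weights on [-1, 1]
x01 = (xg + 1) / 2                            # map nodes to [0, 1]
w01 = wg / 2                                  # rescale weights
print(np.sort(x01), w01)                      # what rquad(4, 0) is expected to return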
Example #10
def extract_from_file(file='',
                      varname='zeta',
                      extraction_type='full',
                      **kwargs):
    # Check the netCDF file for existence and if the variable is in it
    if not os.path.exists(file):
        raise IOError('File %s could not be located on the filesystem' % file)
    ncf = nc.Dataset(file, mode='r')

    if varname not in ncf.variables:
        raise IOError('File %s does not have a variable named %s' %
                      (file, varname))

    # start getting data
    ncvar = ncf.variables[varname]
    dims = ncvar.dimensions
    ndims = len(dims)
    shape = ncvar.shape
    #	print('var: %s, dims: %s, shape: %s' %(varname, str(dims), str(shape)))
    if not ndims == 3 and not ndims == 4:
        raise TypeError('ndims is neither 3 nor 4')
    if not dims[0] == 'ocean_time':
        raise TypeError('first dimension is not ocean_time')
    if not shape[0] == 1:
        raise TypeError('first dimension is not of length one')

    grid = load_grid.load_grid(file)
    coords = {}
    if ndims == 3:
        if dims[1] == 'eta_rho' and dims[2] == 'xi_rho':
            #			print('G.lat %s' % str(grid['lat']))
            #			print('G.lon %s' % str(grid['lon']))
            y2 = grid['lat'][:]
            x2 = grid['lon'][:]
            mask2 = grid['mask'][:]
        elif dims[1] == 'eta_u' and dims[2] == 'xi_u':
            #			print('G.latu %s' % str(grid['lat_u']))
            #			print('G.lonv %s' % str(grid['lon_u']))
            y2 = grid['latu'][:]
            x2 = grid['lonu'][:]
            mask2 = grid['masku'][:]
        elif dims[1] == 'eta_v' and dims[2] == 'xi_v':
            #			print('G.latv %s' % str(grid['lat_v']))
            #			print('G.lonv %s' % str(grid['lon_v']))
            y2 = grid['latv'][:]
            x2 = grid['lonv'][:]
            mask2 = grid['maskv'][:]
        else:
            raise TypeError('Unable to determine which grid to use')

        if extraction_type == 'full' or extraction_type == 'surface':
            #			data = np.squeeze(ncvar[:])
            #			data[mask2==0] = np.NaN
            data = np.ma.array(np.squeeze(ncvar[:]), mask=(mask2 == 0))

            coords['ym'] = y2
            coords['xm'] = x2

        if (extraction_type == 'profile' or extraction_type == 'profiles' or
            extraction_type == 'point' or extraction_type == 'points') and \
           ('y' in kwargs and 'x' in kwargs):
            print('In profiles')
            xm = np.array(kwargs['x'])
            ym = np.array(kwargs['y'])
            if not (xm.ndim == ym.ndim) or not (xm.shape == ym.shape):
                if xm.size == 1:
                    xm = np.tile(xm, ym.shape)
                elif ym.size == 1:
                    ym = np.tile(ym, xm.shape)
#				else:
#					raise RuntimeError('The x and y chosen to extract a point or profile on this 2D variable are incompatible.')

            data2 = np.ma.array(np.squeeze(ncvar[:]), mask=(mask2 == 0))
            #			data = utils.interp_2d(lat=y2,lon=x2,data=data2,lati=ym,loni=xm)
            mask = utils.interp_2d_xy(y=y2, x=x2, data=mask2, yi=ym, xi=xm)
            data = utils.interp_2d_xy(y=y2, x=x2, data=data2, yi=ym, xi=xm)
            data = np.ma.array(data, mask=(mask < 1))
            coords['ym'] = ym
            coords['xm'] = xm

    if ndims == 4:
        xm = []
        ym = []
        zm = []

        K = shape[1]
        J = shape[2]
        I = shape[3]

        lat = grid['lat'][:]
        lon = grid['lon'][:]

        if dims[2] == 'eta_rho' and dims[3] == 'xi_rho':
            y2 = lat
            x2 = lon
            mask2 = grid['mask'][:]

        elif dims[2] == 'eta_u' and dims[3] == 'xi_u':
            y2 = grid['latu'][:]
            x2 = grid['lonu'][:]
            mask2 = grid['masku'][:]

        elif dims[2] == 'eta_v' and dims[3] == 'xi_v':
            y2 = grid['latv'][:]
            x2 = grid['lonv'][:]
            mask2 = grid['maskv'][:]
        else:
            raise TypeError(
                'Unable to determine which grid to use. dims[2] = %s, dims[3] = %s'
                % (dims[2], dims[3]))

        if dims[1] == 's_rho':
            cs = grid['cs'][:]
        elif dims[1] == 's_w':
            cs = grid['csw'][:]
        elif K == 1:
            cs = 0
        else:
            raise TypeError(
                'Unable to determine which cs to use. dims[1] = %s' % dims[1])

        data = []
        if extraction_type == 'full':
            # get zeta
            try:
                if dims[2] == 'eta_rho' and dims[3] == 'xi_rho':
                    zeta2 = ncf.variables['zeta'][:]
                else:
                    zeta2 = utils.interp_2d_xy(y=lat,
                                               x=lon,
                                               data=ncf.variables['zeta'][:],
                                               yi=y2,
                                               xi=x2)
            except Exception as e:
                print(e)
                zeta2 = np.zeros((len(y2), len(x2)))
            zeta2[zeta2 > 1000] = 0

            # get H
            if dims[2] == 'eta_rho' and dims[3] == 'xi_rho':
                H2 = grid['H'][:]
            else:
                H2 = utils.interp_2d_xy(y=lat,
                                        x=lon,
                                        data=grid['H'][:],
                                        yi=y2,
                                        xi=x2)

            x3 = np.tile(x2.reshape(1, J, I), (K, 1, 1))
            y3 = np.tile(y2.reshape(1, J, I), (K, 1, 1))
            mask3 = np.tile(mask2.reshape(1, J, I), (K, 1, 1))
            zeta3 = np.tile(zeta2.reshape(1, J, I), (K, 1, 1))
            H3 = np.tile(H2.reshape(1, J, I), (K, 1, 1))
            cs3 = np.tile(cs.reshape(K, 1, 1), (1, J, I))
            z3 = zeta3 + cs3 * (zeta3 + H3)

            zm = z3
            ym = y3
            xm = x3
            data = np.ma.array(np.squeeze(ncvar[:]), mask=(mask3 == 0))

        elif extraction_type == 'surface':
            # get zeta
            try:
                if dims[2] == 'eta_rho' and dims[3] == 'xi_rho':
                    zeta2 = ncf.variables['zeta'][:]
                else:
                    zeta2 = utils.interp_2d_xy(y=lat,
                                               x=lon,
                                               data=ncf.variables['zeta'][:],
                                               yi=y2,
                                               xi=x2)
            except Exception as e:
                print(e)
                zeta2 = np.zeros((len(y2), len(x2)))
            zeta2[zeta2 > 1000] = 0
            data = np.ma.array(np.squeeze(ncvar[0, K - 1, :, :]),
                               mask=(mask2 == 0))
            zm = zeta2
            ym = y2
            xm = x2
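The depth reconstruction in the 'full' branch above, reduced to a toy s-coordinate example with made-up sizes:

import numpy as np

K, J, I = 3, 2, 2
zeta2 = np.zeros((J, I))            # free surface
H2 = np.full((J, I), 100.0)         # bathymetry
cs = np.array([-0.9, -0.5, -0.1])   # stretching coefficients at the K levels

zeta3 = np.tile(zeta2.reshape(1, J, I), (K, 1, 1))
H3 = np.tile(H2.reshape(1, J, I), (K, 1, 1))
cs3 = np.tile(cs.reshape(K, 1, 1), (1, J, I))
z3 = zeta3 + cs3 * (zeta3 + H3)     # depths, shape (K, J, I)
print(z3[:, 0, 0])                  # -90, -50, -10 at this column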
Example #11
def conv_fourier(x, dims, minlens, nxcirc=None, condthresh=1e8):
    # Version of this NOT complete for higher dimensions 9/15/17!
    #
    # INPUT:
    # -----
    #           x [D x n x p] - stimulus, where each row vector is the spatial stim at a single time, D is number of batches
    #        dims [m x 1] - number of coefficients along each stimulus dimension
    #     minlens [m x 1] - minimum length scale for each dimension (can be scalar)
    #      nxcirc [m x 1] - circular boundary in each stimulus dimension (minimum is dims) OPTIONAL
    #  condthresh [1 x 1] - condition number for thresholding for small eigenvalues OPTIONAL
    #
    # OUTPUT:
    # ------
    #     Bx  - output data, x, in fourier domain
    #  wwnrm [nf x 1] - squared "effective frequencies" in vector form for each dim (normalized)
    #   Bfft  {1 x p} - cell array with DFT bases for each dimension (list of numpy arrays for each dimension)
    # 	1e8 is default value (condition number on prior covariance)

    dims = np.array(np.reshape(dims, (1, -1)))
    minlens = np.array(np.reshape(minlens, (1, -1)))

    # Set circular boundary (for n-point fft) to avoid edge effects, if needed
    if nxcirc is None:
        #nxcirc = np.ceil(max([dims(:)'+minlens(:)'*4; dims(:)'*1.25]))'
        nxcirc = np.ceil(
            np.max(np.concatenate((dims + minlens * 4, dims), axis=0), axis=0))

    nd = np.size(dims)  # number of filter dimensions
    if np.size(minlens) == 1 and nd != 1:  #% make vector out of minlens, if necessary
        minlens = np.tile(minlens, (1, nd))  # repmat equivalent; keeps minlens[0][jj] valid below

    # generate here a list of your
    #None of these quantities depend on the data directly
    wvecs = [
        rffb.comp_wvec(nxcirc[jj], minlens[0][jj], condthresh)
        for jj in np.arange(nd)
    ]
    #cdiagvecs = [mkcovs.mkcovdiag_ASD(minlens[jj],1,nxcirc[jj],np.square(wvecs[jj]))  for jj in np.arange(nd)]
    Bffts = [
        rffb.realfftbasis(dims[0][jj], nxcirc[jj], wvecs[jj])[0]
        for jj in np.arange(nd)
    ]

    #fprintf('\n Total # Fourier coeffs represented: %d\n\n', prod(ncoeff));

    def f(switcher):
        # switch based on stimulus dimension
        if switcher == 2:
            pass
        if switcher == 3:
            pass
        return{
        1: #% 1 dimensional stimulus
             [np.square(2*np.pi/nxcirc[0]) * np.square(wvecs[0]), #normalized wvec
             np.ones([np.size(wvecs[0]),1])==1] #indices to keep 


        # 2: % 2 dimensional stimulus

        #     % Form full frequency vector and see which to cut
        #     Cdiag = kron(cdiagvecs{2},cdiagvecs{1});
        #     ii = (Cdiag/max(Cdiag))>1/condthresh; % indices to keep

        #     % compute vector of normalized frequencies squared
        #     [ww1,ww2] = ndgrid(wvecs{1},wvecs{2});
        #     wwnrm = [(ww1(ii)*(2*pi/nxcirc(1))).^2 ...
        #         (ww2(ii)*(2*pi/nxcirc(2))).^2];

        # 3: % 3 dimensional stimulus

        #     Cdiag = kron(cdiagvecs{3},(kron(cdiagvecs{2},cdiagvecs{1})));
        #     ii = (Cdiag/max(Cdiag))>1/condthresh; % indices to keep

        #     % compute vector of normalized frequencies squared
        #     [ww1,ww2,ww3] = ndgrid(wvecs{1},wvecs{2},wvecs{3});
        #     wwnrm = [(ww1(ii)*(2*pi/nxcirc(1))).mv ^2, ...
        #         (ww2(ii)*(2*pi/nxcirc(2))).^2, ....,
        #         (ww3(ii)*(2*pi/nxcirc(3))).^2];

        # otherwise
        #     error('compLSsuffstats_fourier.m : doesn''t yet handle %d dimensional filters\n',nd);
        }[switcher]

    try:
        [wwnrm, ii] = f(nd)
    except KeyError:
        print('\n\n Does not handle values of dimension', nd, 'yet')

    # Calculate stimulus sufficient stats in Fourier domain

    # if x.shape[0] == 1:

    # 	#originally this used the transpose operation (kronmulttrp) ! !!!might be a transpositional issue.
    # 	Bx = kron_ops.kronmult(Bffts,np.transpose(x)) # convert to Fourier domain
    # 	Bx = Bx[ii] # prune unneeded freqs

    # elif x.shape[0]>1: #Batched data. when the shape of x is 3 and dims is 2, for example.

    Bx = [kron_ops.kronmult(Bffts, np.transpose(batch)) for batch in x]
    Bx = [prune[ii] for prune in Bx]
    return Bx[0], wwnrm, Bffts[0], nxcirc
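The 1-D arm of the switch above, evaluated on its own with a made-up frequency vector and circular boundary:

import numpy as np

nxcirc0 = 32.0
wvec0 = np.arange(-4, 5)                                    # hypothetical Fourier frequencies
wwnrm = np.square(2 * np.pi / nxcirc0) * np.square(wvec0)   # normalized squared frequencies
ii = np.ones([np.size(wvec0), 1]) == 1                      # keep-all index mask, as in the 1-D case
print(wwnrm.shape, ii.all())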
Example #13
def preprocessing_setup(data, analysis_settings):
    """
    Performs sanity checks on the input data and the algorithm parameter struct. Massages the data (i.e. drop outliers,
    zscore data, etc).

    **Arguments**:  
    - data: Input data matrix (total number of trials x 6 columns)  
    - analysis_settings: Struct with algorithm parameters  

    **Returns**:  
    - data: Input data matrix (if applicable, outlier free, zscored, category specific data only, etc)  
    - analysis_settings: Struct with algorithm parameters; some additional parameters are added to this struct as well  
    """

    print('********** START OF MESSAGES **********')

    # Checks if the data matrix has 6 columns
    number_of_columns = np.shape(data)[1]
    if number_of_columns != 6:
        raise ValueError(
            'Incorrect number of columns ({}) in the input matrix!'.format(
                number_of_columns))

    # Registering which column in the data matrix is carrying which piece of information
    if (not ('data_matrix_columns' in analysis_settings)) or (
            not analysis_settings['data_matrix_columns']):
        # Setting it to the default
        analysis_settings['data_matrix_columns'] = {}
        analysis_settings['data_matrix_columns']['subject_id'] = 0
        analysis_settings['data_matrix_columns']['trials'] = 1
        analysis_settings['data_matrix_columns']['category'] = 2
        analysis_settings['data_matrix_columns']['predictor_var'] = 3
        analysis_settings['data_matrix_columns']['dependent_var'] = 4
        analysis_settings['data_matrix_columns']['net_effect_clusters'] = 5

    subject_id_column = analysis_settings['data_matrix_columns']['subject_id']
    trials_column = analysis_settings['data_matrix_columns']['trials']
    category_column = analysis_settings['data_matrix_columns']['category']
    predictor_var_column = analysis_settings['data_matrix_columns'][
        'predictor_var']
    dependent_var_column = analysis_settings['data_matrix_columns'][
        'dependent_var']
    net_effect_clusters_column = analysis_settings['data_matrix_columns'][
        'net_effect_clusters']

    # Checks if the em iterations is specified; if not specified then it is set to a default of 20
    if (not ('em_iterations' in analysis_settings)) or (
            analysis_settings['em_iterations'] <= 0):
        analysis_settings['em_iterations'] = 20
        print('Missing number of iterations! It is set to a default of {}'.
              format(analysis_settings['em_iterations']))

    # Checks if the no. of particles is specified; if not specified then it is set to a default of 100,000
    if (not ('particles'
             in analysis_settings)) or (analysis_settings['particles'] <= 0):
        analysis_settings['particles'] = 100000
        print(
            'Missing number of particles! It is set to a default of {}'.format(
                analysis_settings['particles']))

    # Checks if the family of curves is specified; if not then set to 'horz_indpnt' (Refer to family of curves)
    if (not ('curve_type'
             in analysis_settings)) or (not analysis_settings['curve_type']):
        analysis_settings['curve_type'] = 'horz_indpnt'
        print('Missing family of curves! It is set to a default of {}'.format(
            analysis_settings['curve_type']))

    # Checks if the family of curves exist by fetching the number of curve parameters. This is just a sanity check
    if not isinstance(
            family_of_curves(analysis_settings['curve_type'], 'get_nParams'),
            int):
        raise ValueError(
            '{} - Does not exist! Check family_of_curves.m script'.format(
                analysis_settings['curve_type']))

    # Checks if the distribution is specified;
    # If not specified and if the dependent variable is binary it's set to 'bernoulli'; otherwise set to 'normal'
    if (not ('distribution'
             in analysis_settings)) or (not analysis_settings['distribution']):
        if len(np.unique(data[:, dependent_var_column])) == 2:
            analysis_settings['distribution'] = 'bernoulli'
        else:
            analysis_settings['distribution'] = 'normal'
        print(
            'Missing distribution! Based on the dependent variable it is set to {}'
            .format(analysis_settings['distribution']))

    # Checks if the distribution specific parameters exist
    if (not ('dist_specific_params' in analysis_settings)) or (
            not analysis_settings['dist_specific_params']):
        if analysis_settings['distribution'] == 'bernoulli':

            # For a Bernoulli dist there are no parameters so it is empty. We still need the struct to exist
            analysis_settings['dist_specific_params'] = {}

        elif analysis_settings['distribution'] == 'normal':

            # For normal distribution the additional parameter is sigma. We pass in sigma here.
            analysis_settings['dist_specific_params'] = {}
            analysis_settings['dist_specific_params'][
                'sigma'] = 1  # Default is 1
            print('Missing sigma for normal distribution! It is set to {}'.
                  format(analysis_settings['dist_specific_params']['sigma']))

    # Checks if normal distribution specific parameter is valid i.e. sigma > 0
    if (analysis_settings['distribution'] == 'normal') and (
            analysis_settings['dist_specific_params']['sigma'] <= 0):
        raise ValueError(
            'Normal distribution sigma needs to be > 0! sigma = {}'.format(
                analysis_settings['dist_specific_params']['sigma']))

    # Checks if beta_0 is specified; if not specified then it is set to a default of 0
    if not ('beta_0' in analysis_settings):
        analysis_settings['beta_0'] = 0
        print(
            'Missing initial setting for beta_0! It is set to a default of {}'.
            format(analysis_settings['beta_0']))

    # Checks if beta_1 is specified; if not specified then it is set to a default of 1
    if not ('beta_1' in analysis_settings):
        analysis_settings['beta_1'] = 1
        print(
            'Missing initial setting for beta_1! It is set to a default of {}'.
            format(analysis_settings['beta_1']))

    # Checks if tau is specified; if not specified then it is set to a default of 0.05
    if not ('tau' in analysis_settings):
        analysis_settings['tau'] = 0.05
        print('Missing initial setting for tau! It is set to a default of {}'.
              format(analysis_settings['tau']))

    # Checks if this is a bootstrap run; if not specified then it is set to a default of false
    if not ('bootstrap' in analysis_settings):
        analysis_settings['bootstrap'] = False
        print(
            'Missing initial setting for bootstrap! It is set to a default of {}'.
            format(analysis_settings['bootstrap']))

    # Checks if bootstrap flag is boolean
    if not (type(analysis_settings['bootstrap']) == bool):
        raise ValueError(
            'analysis_settings.bootstrap field will need to be boolean!')

    # Checks if this is a scramble run; if not specified then it is set to a default of false
    if not ('scramble' in analysis_settings):
        analysis_settings['scramble'] = False

    # Checks if scramble flag is boolean
    if not (type(analysis_settings['scramble']) == bool):
        raise ValueError(
            'analysis_settings.scramble field will need to be boolean!')

    # Errors if both bootstrap and scramble flags exist
    if analysis_settings['scramble'] and analysis_settings['bootstrap']:
        raise ValueError(
            'Cannot run both scramble AND bootstrap analyses at the same time! Set any one flag to be false'
        )

    # Builds a bootstrap data matrix from the original data matrix
    if analysis_settings['bootstrap'] and not (analysis_settings['scramble']):

        # We need a bootstrap sample number
        if (not ('bootstrap_run' in analysis_settings)) or (
                not analysis_settings['bootstrap_run']):
            raise ValueError(
                'Missing bootstrap sample number! set analysis_settings.bootstrap_run to a valid sample number'
            )

        bootstrap_data = []
        new_cluster_count = 1
        new_subject_count = 1

        # Get the number of subjects from the data matrix
        number_of_subjects = len(np.unique(data[:, subject_id_column]))

        # Randomly sample with replacement the number of subjects thus generating our bootstrap sample
        subj_num_with_replacement = random.choices(
            np.arange(number_of_subjects), k=number_of_subjects)

        # For each subject in our bootstrap sample gather all relevant information
        for i in range(len(subj_num_with_replacement)):
            subj_idx = np.where(
                data[:, subject_id_column] == subj_num_with_replacement[i])[0]

            # Recreate a new net effect cluster since this will need to be unique in the data matrix
            # (by repeatedly sampling subjects we could be repeating the net effect clusters)
            cluster_vector = data[subj_idx, net_effect_clusters_column]
            cluster_numbers = np.unique(cluster_vector)
            for j in range(len(cluster_numbers)):
                target_idx = np.where(
                    data[subj_idx,
                         net_effect_clusters_column] == cluster_numbers[j])
                cluster_vector[target_idx] = new_cluster_count
                new_cluster_count += 1

            # Recreate a new subject id
            # (by repeatedly sampling subjects we could be repeating the subject id's)
            # Gather all information into a bootstrap_data matrix
            # MATLAB repmat/horzcat -> np.full/np.column_stack
            bootstrap_data.append(
                np.column_stack(
                    (np.full((len(subj_idx), 1), new_subject_count),
                     data[subj_idx, trials_column:dependent_var_column + 1],
                     cluster_vector)))
            new_subject_count += 1

        # Stack the per-subject blocks into a single matrix
        bootstrap_data = np.vstack(bootstrap_data)

        # Perform some sanity checks to ensure that the bootstrap_data matrix is similar to the actual data matrix
        if not np.all(np.shape(bootstrap_data) == np.shape(data)):
            raise ValueError(
                'Size of bootstrap dataset NOT the same as original data!')
        if not (len(np.unique(data[:, net_effect_clusters_column])) == len(
                np.unique(bootstrap_data[:, net_effect_clusters_column]))):
            raise ValueError(
                'The number of clusters are not the same in the original and bootstrap sample!'
            )
        if not np.array_equal(data[:, subject_id_column],
                              bootstrap_data[:, subject_id_column]):
            raise ValueError(
                'The ordering of subjects are not the same in the original and bootstrap sample!'
            )

        # Store away the bootstrap sample subject information for future reference
        analysis_settings['bootstrap_run_subj_id'] = subj_num_with_replacement
        data = bootstrap_data

    # Checks if analysis will be performed for a specific category; if not then set to [] i.e. NOT category specific
    if not ('category' in analysis_settings):
        analysis_settings['category'] = []
        print(
            'Missing category specific analyses information! We are going to ignore the category dimension i.e. all '
            'trials from all categories will be analysed')

        # If this analysis is to be performed for a specific category then filters out data from other irrelevant categories
    if len(analysis_settings['category']) > 0:
        target_cat_idx = []
        data_cat = np.unique(data[:, category_column])
        for c in range(len(analysis_settings['category'])):
            cat_exist = np.where(
                data_cat == analysis_settings['category'][c])[0]
            if cat_exist.size == 0:
                raise ValueError(
                    'Category does not exist! You have set analysis_settings.category[{}]={}'
                    .format(c, analysis_settings['category'][c]))
            target_cat_idx = np.concatenate(
                (target_cat_idx,
                 np.where(data[:, category_column] ==
                          analysis_settings['category'][c])[0]))
        data = data[target_cat_idx, :]

    # Checks if outliers (i.e. data trials) will need to be dropped; if not specified then default to 3 standard deviations
    if not ('drop_outliers' in analysis_settings):
        analysis_settings['drop_outliers'] = 3
        print(
            'Missing drop_outliers specific information! We are dropping outliers that are {} standard deviations away from the group mean'
            .format(analysis_settings['drop_outliers']))

    # If this analysis requires the outliers dropped, then drops the data trials within std devs from the GROUP MEAN
    if analysis_settings['drop_outliers'] > 0:
        # NaN's do not qualify as outliers so we filter them out and add them at the end of this step
        nan_free_idx = np.logical_not(np.isnan(data[:, predictor_var_column]))

        # NaN free data
        nan_free_data = data[nan_free_idx, :]
        std_dev_predictor_var = np.std(
            nan_free_data[:, predictor_var_column],
            ddof=1) * analysis_settings['drop_outliers']
        mean_predictor_var = np.mean(nan_free_data[:, predictor_var_column])
        predictor_var_idx = (nan_free_data[:, predictor_var_column] >
                             (mean_predictor_var - std_dev_predictor_var)) & (
                                 nan_free_data[:, predictor_var_column] <
                                 (mean_predictor_var + std_dev_predictor_var))
        print(
            '{} trials are dropped since they are regarded as outliers'.format(
                np.shape(nan_free_data)[subject_id_column] -
                np.sum(predictor_var_idx)))
        nan_free_data_outlier_dropped = nan_free_data[predictor_var_idx, :]

        # NaN's trials
        nan_data = data[np.logical_not(nan_free_idx), :]

        # Combine the NaN data with the outlier free data
        data = np.concatenate(
            (nan_free_data_outlier_dropped, nan_data)
        ) if np.shape(nan_data)[0] > 0 else nan_free_data_outlier_dropped

    # Following the 'filter by category' and 'drop outliers', if applicable, we check if the data matrix is empty
    number_of_trials = np.shape(data)[subject_id_column]
    if number_of_trials <= 0:
        raise ValueError('No input data!')

    # Checks if we need to zscore predictor var within subjects, if not specified then it is set to default of FALSE
    if not ('zscore_within_subjects' in analysis_settings):
        analysis_settings['zscore_within_subjects'] = False
        print(
            'Missing zscore_within_subjects information! We are NOT zscoring within subjects'
        )

    # Verifies if zscore within subjects is boolean
    if not (type(analysis_settings['zscore_within_subjects']) == bool):
        raise ValueError(
            'zscore_within_subjects field will need to be boolean!')

    # Zscore the predictor variable within each subject
    if analysis_settings['zscore_within_subjects']:
        # NaN's do not qualify to be zscored
        nan_free_idx = np.logical_not(np.isnan(data[:, predictor_var_column]))
        # NaN free data
        nan_free_data = data[nan_free_idx, :]
        # Get the list of subject id's (we use this cell array in zscoring the data within each subject, if applicable)
        subject_id_list = np.unique(nan_free_data[:, subject_id_column])
        # We get the number of subjects
        number_of_subjects = len(subject_id_list)
        if number_of_subjects <= 0:
            raise ValueError('Not valid number of subjects!')
        for s in range(number_of_subjects):
            subject_idx = np.where(
                nan_free_data[:, subject_id_column] == subject_id_list[s])[0]
            nan_free_data[subject_idx, predictor_var_column] = stats.zscore(
                nan_free_data[subject_idx, predictor_var_column], ddof=1)
        print('Predictor variables within each subject are zscored!')
        # NaN's trials
        nan_data = data[np.logical_not(nan_free_idx), :]
        # Combine the NaN data with the outlier free data
        data = np.concatenate(
            (nan_free_data,
             nan_data)) if np.shape(nan_data)[0] > 0 else nan_free_data

    # Checks if resolution is specified, if not specified then set to default of 4. This translates to 1e-4 = 0.0001
    if (not ('resolution'
             in analysis_settings)) or (analysis_settings['resolution'] <= 0):
        analysis_settings['resolution'] = 4
        print('Missing resolution! It is set to a default of {}'.format(
            analysis_settings['resolution']))

    # if we have normally distributed data, we want to z-score the dependent variable
    if analysis_settings['distribution'] == 'normal':
        data[:,
             dependent_var_column] = stats.zscore(data[:,
                                                       dependent_var_column],
                                                  ddof=1)

    # We scale the predictor var to be between 0 and 1 and round it to 4 digits
    nan_free_idx = np.logical_not(np.isnan(data[:, predictor_var_column]))
    nan_free_data = data[nan_free_idx, :]
    nan_free_data[:, predictor_var_column] = np.round(
        scale_data(nan_free_data[:, predictor_var_column], 0, 1),
        analysis_settings['resolution'])
    nan_data = data[np.logical_not(nan_free_idx), :]
    data = np.concatenate(
        (nan_free_data,
         nan_data)) if np.shape(nan_data)[0] > 0 else nan_free_data

    # Scrambling the data matrix
    if analysis_settings['scramble']:
        if (not ('scramble_run' in analysis_settings)) or (
                not analysis_settings['scramble_run']):
            raise ValueError(
                'Missing scramble sample number! set analysis_settings.scramble_run to a valid sample number'
            )
        if (not ('scramble_style' in analysis_settings)) or (
                not analysis_settings['scramble_style']):
            analysis_settings[
                'scramble_style'] = 'within_subjects_within_categories'  # most conservative of all scramble techniques
            print('Missing scramble style! It is set to a default of {}'.format(
                analysis_settings['scramble_style']))

        # We get the list of subject id's
        subject_id_list = np.unique(data[:, subject_id_column])
        # We get the number of subjects in this analysis
        number_of_subjects = len(subject_id_list)
        if number_of_subjects <= 0:
            raise ValueError('Not valid number of subjects!')

        if analysis_settings[
                'scramble_style'] == 'within_subjects_within_categories':
            # Here scramble all DVs WHILE respecting the net effect boundaries, subject groupings and category groupings
            categories = np.unique(data[:, category_column])
            for s in range(number_of_subjects):
                for c in range(len(categories)):
                    subject_category_idx = np.where(
                        (data[:, subject_id_column] == subject_id_list[s])
                        & (data[:, category_column] == categories[c]))[0]
                    if len(subject_category_idx) > 1:
                        data[
                            subject_category_idx,
                            dependent_var_column] = scramble_dependent_variable(
                                data[subject_category_idx,
                                     dependent_var_column],
                                data[subject_category_idx,
                                     net_effect_clusters_column])

        elif analysis_settings[
                'scramble_style'] == 'within_subjects_across_categories':
            # Here we scramble all dependent variables WHILE respecting the net effect boundaries and subject groupings
            for s in range(number_of_subjects):
                subject_idx = np.where(
                    data[:, subject_id_column] == subject_id_list[s])[0]
                if len(subject_idx) > 1:
                    data[subject_idx,
                         dependent_var_column] = scramble_dependent_variable(
                             data[subject_idx, dependent_var_column],
                             data[subject_idx, net_effect_clusters_column])

        elif analysis_settings[
                'scramble_style'] == 'across_subjects_across_categories':
            # Here we scramble all dependent variables WHILE respecting the net effect boundaries
            all_idx = np.arange(np.shape(data)[0])
            if len(all_idx) > 1:
                data[all_idx,
                     dependent_var_column] = scramble_dependent_variable(
                         data[all_idx, dependent_var_column],
                         data[all_idx, net_effect_clusters_column])

        else:
            raise ValueError(
                'Invalid analysis_settings.scramble_style={}'.format(
                    analysis_settings['scramble_style']))

    # Our data matrix looks like data = [subject id, item, category, predictor var, dependent var, net effect cluster]
    # We verify if the subject id and dependent var columns are unique for the net effect clusters
    # Below is a example of a valid data matrix (note dependent variable is unique within net effect cluster 111)
    # data(1, :) = [24, 1, 1, 0.3333, 0, 111]
    # data(2, :) = [24, 2, 2, 0.2222, 0, 111]
    # data(3, :) = [24, 3, 1, 0.4444, 0, 111]
    # Below is a example of an invalid data matrix (note dependent variable is not unique within net effect cluster 111)
    # data(1, :) = [24, 1, 1, 0.3333, 0, 111]
    # data(2, :) = [24, 2, 2, 0.2222, 1, 111]
    # data(3, :) = [24, 3, 1, 0.4444, 0, 111]

    # Fetching the net effect clusters
    net_effect_clusters = np.unique(data[:, net_effect_clusters_column])
    analysis_settings['net_effect_clusters'] = net_effect_clusters

    # If net effect clusters exist verify if the Subject Id and dependent variable are unique for those clusters
    if len(net_effect_clusters) != np.shape(data)[0]:
        for i in range(len(net_effect_clusters)):
            cluster_idx = np.where(
                data[:,
                     net_effect_clusters_column] == net_effect_clusters[i])[0]
            if np.shape(
                    np.unique(
                        data[np.ix_(cluster_idx,
                                    [subject_id_column, dependent_var_column])],
                        axis=0))[0] != 1:
                raise ValueError(
                    'Subject Id and/or dependent variable not unique for net effect cluster {}! Check '
                    'the data matrix'.format(net_effect_clusters[i]))
    else:
        # If net effect clusters DO NOT exist then we treat each row as a net effect cluster by itself
        print(
            'Each row will be treated separately. We will NOT be computing the net effect of any rows'
        )

    # We create an analysis id unique to this analysis
    if (not ('analysis_id'
             in analysis_settings)) or (not analysis_settings['analysis_id']):
        time = datetime.datetime.now()
        analysis_settings['analysis_id'] = '{}-{}-{}-{}-{}'.format(
            time.month, time.day, time.hour, time.minute, time.second)

    # We create a results directory if no specific target directory is mentioned
    if (not ('target_dir'
             in analysis_settings)) or (not analysis_settings['target_dir']):
        results_dir = os.path.join(os.getcwd(), 'results')
        if not os.path.isdir(results_dir):
            os.mkdir(results_dir)
        analysis_settings['target_dir'] = results_dir

    # target_directory = 'results/analysis_id'
    analysis_settings['target_dir'] = os.path.join(
        analysis_settings['target_dir'], analysis_settings['analysis_id'])
    if not os.path.isdir(analysis_settings['target_dir']):
        os.mkdir(analysis_settings['target_dir'])

    # Due to memory constraints we perform two chunking tricks

    # Chunking trick I
    # In the curve fitting algorithm we need to compute the p(current iteration curves | previous
    # iteration curves). This matrix is huge when the number of particles (curves) is large, say 100,000. Even with a
    # 8 Gb RAM, dedicated to Matlab, we still get a out of memory errors. To avoid this problem we chunk the matrix
    # into smaller, more manageable matrices. Setting the chunk size to be particles x 0.05 -> 100,000 x 0.05 = 5000,
    # translates to p(current iteration curves(5000 curves at a time) | previous iteration curves).
    analysis_settings['wgt_chunks'] = analysis_settings['particles'] * 0.05
    # If the chunk size is less then 5000 we set it be the number of particles itself
    if analysis_settings['wgt_chunks'] < 5000:
        analysis_settings['wgt_chunks'] = analysis_settings['particles']

    # Chunking trick II
    if not ('particle_chunks' in analysis_settings):
        analysis_settings['particle_chunks'] = 2
        print('Missing particle chunks! It is set to a default of {}'.format(
            analysis_settings['particle_chunks']))

    # Depending on the number of particle chunks we get start, end points and the number of particles within each chunk.
    # For instance 1000 particles divided into 4 chunks will look like,
    # | 0   | 250 | 250
    # | 250	| 500 | 250
    # | 500 | 750 | 250
    # | 750 | 1000| 250
    dummy = np.arange(
        0, analysis_settings['particles'],
        analysis_settings['particles'] / analysis_settings['particle_chunks'])
    analysis_settings['ptl_chunk_idx'] = np.stack(
        (dummy, dummy +
         analysis_settings['particles'] / analysis_settings['particle_chunks'],
         np.full(
             np.shape(dummy), analysis_settings['particles'] /
             analysis_settings['particle_chunks'])),
        axis=1)

    # Storing analysis relevant information into the analysis_settings struct
    # We get the list of subject id's
    subject_id_list = np.unique(data[:, subject_id_column])

    # We get the number of subjects in this analysis
    analysis_settings['nSubjs'] = len(subject_id_list)
    if analysis_settings['nSubjs'] <= 0:
        raise ValueError('Not valid number of subjects!')

    print('********** END OF MESSAGES **********')
    return data, analysis_settings
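The particle-chunk bookkeeping above (Chunking trick II) for, say, 1000 particles in 4 chunks reproduces the table in the comment:

import numpy as np

particles, particle_chunks = 1000, 4
dummy = np.arange(0, particles, particles / particle_chunks)
ptl_chunk_idx = np.stack(
    (dummy,
     dummy + particles / particle_chunks,
     np.full(np.shape(dummy), particles / particle_chunks)),
    axis=1)
print(ptl_chunk_idx)   # rows give [start, end, chunk size]: (0, 250, 250), (250, 500, 250), ...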