Example #1
    def __init__(self,h,thid,ra,dec,zqso,plate,mjd,fid):
        qso.__init__(self,thid,ra,dec,zqso,plate,mjd,fid)

        ll = sp.array(h["loglam"][:])
        fl = sp.array(h["coadd"][:])
        iv = sp.array(h["ivar"][:])*(sp.array(h["and_mask"][:])==0)

        w=(ll>forest.lmin) & (ll<forest.lmax) & (ll-sp.log10(1+self.zqso)>forest.lmin_rest) & (ll-sp.log10(1+self.zqso)<forest.lmax_rest)
        w = w & (iv>0)
        if w.sum()==0:return
        
        ll=ll[w]
        fl=fl[w]
        iv=iv[w]

        ## rebin
        bins = ((ll-forest.lmin)/forest.dll+0.5).astype(int)
        civ=sp.bincount(bins,weights=iv)
        w=civ>0
        civ=civ[w]

        c=sp.bincount(bins,weights=ll*iv)
        c=c[w]
        ll = c/civ
        c=sp.bincount(bins,weights=fl*iv)
        c=c[w]
        fl=c/civ
        iv = civ

        self.T_dla = None
        self.ll = ll
        self.fl = fl
        self.iv = iv
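
Example #1 rebins a forest onto a regular log-wavelength grid: each pixel is mapped to an integer bin, sp.bincount with weights accumulates the ivar-weighted sums per bin, and dividing by the summed weights gives the coadded value. A minimal self-contained sketch of that pattern with hypothetical numbers, written against plain numpy (older scipy re-exported bincount at the top level as sp.bincount):

import numpy as np

# Hypothetical log-wavelengths, fluxes and inverse variances for five pixels.
ll = np.array([3.550, 3.552, 3.554, 3.556, 3.558])
fl = np.array([1.0, 1.2, 0.8, 1.1, 0.9])
iv = np.array([2.0, 1.0, 0.0, 3.0, 1.0])
lmin, dll = 3.550, 0.004                   # hypothetical grid start and bin width

bins = ((ll - lmin) / dll + 0.5).astype(int)
civ = np.bincount(bins, weights=iv)        # summed weights per output bin
cfl = np.bincount(bins, weights=fl * iv)   # ivar-weighted flux sums per output bin
w = civ > 0
fl_rebinned = cfl[w] / civ[w]              # weighted mean flux per bin
iv_rebinned = civ[w]                       # inverse variance of the rebinned pixels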
Example #2
def rebin_diff_noise(dll, ll, diff):

    crebin = 3
    if (diff.size < crebin):
        print("Warning: diff.size too small for rebin")
        return diff
    dll2 = crebin * dll

    # rebin not mixing pixels separated by masks
    bin2 = sp.floor((ll - ll.min()) / dll2 + 0.5).astype(int)

    # rebin regardless of intervening masks
    # nmax = diff.size//crebin
    # bin2 = sp.zeros(diff.size)
    # for n in range (1,nmax +1):
    #     bin2[n*crebin:] += sp.ones(diff.size-n*crebin)

    cdiff2 = sp.bincount(bin2.astype(int), weights=diff)
    civ2 = sp.bincount(bin2.astype(int))
    w = (civ2 > 0)
    if (len(civ2) == 0):
        print("Error: diff size = 0 ", diff)
    diff2 = cdiff2[w] / civ2[w] * sp.sqrt(civ2[w])
    diffout = sp.zeros(diff.size)
    nmax = len(diff) // len(diff2)
    for n in range(nmax + 1):
        lengthmax = min(len(diff), (n + 1) * len(diff2))
        diffout[n * len(diff2):lengthmax] = diff2[:lengthmax - n * len(diff2)]
        sp.random.shuffle(diff2)

    return diffout
Example #3
File: cf.py Project: ngbusca/pyLyA
def cf(data):
    xi = sp.zeros(np * nt)
    we = sp.zeros(np * nt)

    for i, d1 in enumerate(data):
        wd1 = d1.de * d1.we
        for d2 in d1.neighs:

            wd2 = d2.de * d2.we
            ang = d1 ^ d2

            rp = abs(d1.r_comov - d2.r_comov[:, None]) * sp.cos(ang / 2)
            rt = (d1.r_comov + d2.r_comov[:, None]) * sp.sin(ang / 2)
            wd12 = wd1 * wd2[:, None]
            w12 = d1.we * d2.we[:, None]

            w = (rp < rp_max) & (rt < rt_max)
            rp = rp[w]
            rt = rt[w]
            wd12 = wd12[w]
            w12 = w12[w]
            bp = (rp / rp_max * np).astype(int)
            bt = (rt / rt_max * nt).astype(int)
            bins = bt + nt * bp
            c = sp.bincount(bins, weights=wd12)
            xi[:len(c)] += c
            c = sp.bincount(bins, weights=w12)
            we[:len(c)] += c

    w = we > 0
    xi[w] /= we[w]
    return we, xi
Example #4
def fast_co(z1, r1, w1, z2, r2, w2, ang):

    rp = (r1 - r2) * sp.cos(ang / 2.)
    if not x_correlation or type_corr in ['DR', 'RD']:
        rp = sp.absolute(rp)
    rt = (r1 + r2) * sp.sin(ang / 2.)
    z = (z1 + z2) / 2.
    w12 = w1 * w2

    w = (rp >= rp_min) & (rp < rp_max) & (rt < rt_max) & (w12 > 0.)
    rp = rp[w]
    rt = rt[w]
    z = z[w]
    w12 = w12[w]

    bp = sp.floor((rp - rp_min) / (rp_max - rp_min) * np).astype(int)
    bt = (rt / rt_max * nt).astype(int)
    bins = bt + nt * bp

    cw = sp.bincount(bins, weights=w12)
    crp = sp.bincount(bins, weights=rp * w12)
    crt = sp.bincount(bins, weights=rt * w12)
    cz = sp.bincount(bins, weights=z * w12)
    cnb = sp.bincount(bins)

    return cw, crp, crt, cz, cnb
Example #5
    def __add__(self, d):

        if not hasattr(self, 'll') or not hasattr(d, 'll'):
            return self

        ll = sp.append(self.ll, d.ll)
        fl = sp.append(self.fl, d.fl)
        iv = sp.append(self.iv, d.iv)
        mmef = None  # keep mmef defined even when this forest carries no mmef
        if self.mmef is not None:
            mmef = sp.append(self.mmef, d.mmef)

        bins = sp.floor((ll - forest.lmin) / forest.dll + 0.5).astype(int)
        cll = forest.lmin + sp.arange(bins.max() + 1) * forest.dll
        cfl = sp.zeros(bins.max() + 1)
        civ = sp.zeros(bins.max() + 1)
        if mmef is not None:
            cmmef = sp.zeros(bins.max() + 1)
        ccfl = sp.bincount(bins, weights=iv * fl)
        cciv = sp.bincount(bins, weights=iv)
        if mmef is not None:
            ccmmef = sp.bincount(bins, weights=iv * mmef)
        cfl[:len(ccfl)] += ccfl
        civ[:len(cciv)] += cciv
        if mmef is not None:
            cmmef[:len(ccmmef)] += ccmmef
        w = (civ > 0.)

        self.ll = cll[w]
        self.fl = cfl[w] / civ[w]
        self.iv = civ[w]
        if mmef is not None:
            self.mmef = cmmef[w]

        return self
Example #6
def stack(data, delta=False):
    nstack = int((forest.lmax - forest.lmin) / forest.dll) + 1
    ll = forest.lmin + sp.arange(nstack) * forest.dll
    st = sp.zeros(nstack)
    wst = sp.zeros(nstack)
    for p in data:
        for d in data[p]:
            bins = ((d.ll - forest.lmin) / forest.dll + 0.5).astype(int)
            var_lss = forest.var_lss(d.ll)
            eta = forest.eta(d.ll)
            if delta:
                we = d.we
                de = d.de
            else:
                iv = d.iv / eta
                we = iv * d.co**2 / (iv * d.co**2 * var_lss + 1)
                de = d.fl / d.co
            c = sp.bincount(bins, weights=de * we)
            st[:len(c)] += c
            c = sp.bincount(bins, weights=we)
            wst[:len(c)] += c

    w = wst > 0
    st[w] /= wst[w]
    return ll, st
Example #7
    def find_neighbor_throats(self,pores,mode='union',flatten=True):
        r"""
        Returns a list of throats neighboring the given pore(s)

        Parameters
        ----------
        pores : array_like
            Indices of pores whose neighbors are sought
        flatten : boolean, optional
            If flatten is True (default) a 1D array of unique throat ID numbers
            is returned. If flatten is False the returned array contains arrays
            of neighboring throat ID numbers for each input pore, in the order
            they were sent.
        mode : string, optional
            Specifies which neighbors should be returned.  The options are:

            * 'union' : All neighbors of the input pores

            * 'intersection' : Only neighbors shared by all input pores

            * 'not_intersection' : Only neighbors not shared by any input pores

        Returns
        -------
        neighborTs : 1D array (if flatten is True) or ndarray of arrays (if
            flatten is False)

        Examples
        --------
        >>> import OpenPNM
        >>> pn = OpenPNM.Network.TestNet()
        >>> pn.find_neighbor_throats(pores=[0,1])
        array([0, 1, 2, 3, 4, 5])
        >>> pn.find_neighbor_throats(pores=[0,1],flatten=False)
        array([array([0, 1, 2]), array([0, 3, 4, 5])], dtype=object)
        """
        #Test for existence of incidence matrix
        try:
            neighborTs = self._incidence_matrix['lil'].rows[[pores]]
        except:
            temp = self.create_incidence_matrix(sprsfmt='lil')
            self._incidence_matrix['lil'] = temp
            neighborTs = self._incidence_matrix['lil'].rows[[pores]]
        if [sp.asarray(x) for x in neighborTs if x] == []:
            return sp.array([],ndmin=1)
        if flatten:
            #All the empty lists must be removed to maintain data type after hstack (numpy bug?)
            neighborTs = [sp.asarray(x) for x in neighborTs if x]
            neighborTs = sp.hstack(neighborTs)
            #Remove references to input pores and duplicates
            if mode == 'not_intersection':
                neighborTs = sp.unique(sp.where(sp.bincount(neighborTs)==1)[0])
            elif mode == 'union':
                neighborTs = sp.unique(neighborTs)
            elif mode == 'intersection':
                neighborTs = sp.unique(sp.where(sp.bincount(neighborTs)>1)[0])
        else:
            for i in range(0,sp.size(pores)):
                neighborTs[i] = sp.array(neighborTs[i])
        return sp.array(neighborTs,ndmin=1)
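
The three 'mode' options above boil down to a bincount over the concatenated neighbour lists: entries seen once belong to exactly one input pore, entries seen more than once are shared. A hedged sketch of that set logic, using the hypothetical neighbour lists from the docstring example:

import numpy as np

neighbors_pore0 = np.array([0, 1, 2])        # hypothetical throat IDs for pore 0
neighbors_pore1 = np.array([0, 3, 4, 5])     # hypothetical throat IDs for pore 1
all_ts = np.hstack([neighbors_pore0, neighbors_pore1])

counts = np.bincount(all_ts)
union = np.unique(all_ts)                    # all throats touching either pore
intersection = np.where(counts > 1)[0]       # throats shared by both pores -> [0]
not_intersection = np.where(counts == 1)[0]  # throats touching only one pore -> [1 2 3 4 5]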
Example #8
File: prep_del.py Project: joselotl/picca
def stack(data, delta=False):
    nstack = int((forest.lmax - forest.lmin) / forest.dll) + 1
    ll = forest.lmin + sp.arange(nstack) * forest.dll
    st = sp.zeros(nstack)
    wst = sp.zeros(nstack)
    for p in sorted(list(data.keys())):
        for d in data[p]:
            if delta:
                de = d.de
                we = d.we
            else:
                de = d.fl / d.co
                var_lss = forest.var_lss(d.ll)
                eta = forest.eta(d.ll)
                fudge = forest.fudge(d.ll)
                var = 1. / d.iv / d.co**2
                we = 1. / variance(var, eta, var_lss, fudge)

            bins = ((d.ll - forest.lmin) / forest.dll + 0.5).astype(int)
            c = sp.bincount(bins, weights=de * we)
            st[:len(c)] += c
            c = sp.bincount(bins, weights=we)
            wst[:len(c)] += c

    w = wst > 0
    st[w] /= wst[w]
    return ll, st, wst
Example #9
    def find_neighbor_throats(self,pnums,flatten=True,mode='union'):
        r"""
        Returns a list of throats neighboring the given pore(s)

        Parameters
        ----------
        pnums : array_like
            Indices of pores whose neighbors are sought
        flatten : boolean, optional
            If flatten is True (default) a 1D array of unique throat ID numbers
            is returned. If flatten is False the returned array contains arrays
            of neighboring throat ID numbers for each input pore, in the order
            they were sent.
        mode : string, optional
            Specifies which neighbors should be returned.  The options are: 
            
            * 'union' : All neighbors of the input pores

            * 'intersection' : Only neighbors shared by all input pores 
            
            * 'not_intersection' : Only neighbors not shared by any input pores

        Returns
        -------
        neighborTs : 1D array (if flatten is True) or ndarray of arrays (if
            flatten is False)

        Examples
        --------
        >>> pn = OpenPNM.Network.Cubic(name='doc_test').generate(divisions=[5,5,5],lattice_spacing=[1])
        >>> pn.find_neighbor_throats(pnums=[0,1])
        array([0, 1, 2, 3, 4, 5])
        >>> pn.find_neighbor_throats(pnums=[0,1],flatten=False)
        array([array([0, 1, 2]), array([0, 3, 4, 5])], dtype=object)
        """
        #Test for existence of incidence matrix
        try:
            neighborTs = self.incidence_matrix['lil']['connections'].rows[[pnums]]
        except:
            self._logger.info('Creating incidence matrix, please wait')
            self.create_incidence_matrix()
            neighborTs = self.incidence_matrix['lil']['connections'].rows[[pnums]]
        if flatten:
            #All the empty lists must be removed to maintain data type after hstack (numpy bug?)
            neighborTs = [sp.asarray(x) for x in neighborTs if x]
            neighborTs = sp.hstack(neighborTs)
            #Remove references to input pores and duplicates
            if mode == 'not_intersection':
                neighborTs = sp.unique(sp.where(sp.bincount(neighborTs)==1)[0])
            elif mode == 'union':
                neighborTs = sp.unique(neighborTs)
            elif mode == 'intersection':
                neighborTs = sp.unique(sp.where(sp.bincount(neighborTs)>1)[0])
        else:
            for i in range(0,sp.size(pnums)):
                neighborTs[i] = sp.array(neighborTs[i])
        return sp.array(neighborTs,ndmin=1)
Example #10
def worker_quality(predictions, num_classes):
    predictions = sp.atleast_2d(predictions)
    num_workers, num_objects = predictions.shape

    error_rates = sp.zeros((num_workers, num_classes, num_classes))
    diy, diz = sp.diag_indices(num_classes)
    error_rates[:, diy, diz] = 1

    while True:
        # E step
        new_predictions = sp.zeros((num_objects, num_classes))
        for i in xrange(num_objects):
            individual_predictions = predictions[:, i]
            individual_error_rates = error_rates[range(num_workers), individual_predictions, individual_predictions]
            new_predictions[i, :] = sp.bincount(individual_predictions, individual_error_rates, minlength=num_classes)

        correct_labels = sp.argmax(new_predictions, axis=1)
        count_per_label = sp.bincount(correct_labels)

        # M step
        new_error_rates = sp.zeros((num_workers, num_classes, num_classes))
        for i, label in enumerate(correct_labels):
            new_error_rates[range(num_workers), label, predictions[:, i]] += 1

        for i in xrange(num_classes):
            new_error_rates[:, :, i] /= count_per_label

        diff_error_rates = sp.absolute(new_error_rates - error_rates)
        error_rates = new_error_rates

        if sp.amax(diff_error_rates) < 0.001:
            break


    # calculate the cost of each worker
    class_priors = sp.bincount(correct_labels, minlength=num_classes) / float(num_objects)
    costs = []
    for k in xrange(num_workers):
        worker_class_priors = sp.dot(sp.atleast_2d(class_priors), error_rates[k])[0] + 0.0000001

        cost = 0
        for j in xrange(num_classes):
            soft_label = error_rates[k, :, j] * class_priors / worker_class_priors[j]

            soft_label_cost = 0.0
            for i in xrange(num_classes):
                soft_label_cost += sp.sum(soft_label[i] * soft_label)
            soft_label_cost -= sp.sum(soft_label ** 2) # subtract the diagonal entries (those costs = 0)
            cost += soft_label_cost * worker_class_priors[j]

        costs.append(cost)

    return error_rates, correct_labels, costs
Example #11
def exp_diff(file, ll):

    nexp_per_col = file[0].read_header()['NEXP'] // 2
    fltotodd = sp.zeros(ll.size)
    ivtotodd = sp.zeros(ll.size)
    fltoteven = sp.zeros(ll.size)
    ivtoteven = sp.zeros(ll.size)

    if (nexp_per_col) < 2:
        print("DBG : not enough exposures for diff")

    for iexp in range(nexp_per_col):
        for icol in range(2):
            llexp = file[4 + iexp + icol * nexp_per_col]["loglam"][:]
            flexp = file[4 + iexp + icol * nexp_per_col]["flux"][:]
            ivexp = file[4 + iexp + icol * nexp_per_col]["ivar"][:]
            mask = file[4 + iexp + icol * nexp_per_col]["mask"][:]
            bins = sp.searchsorted(ll, llexp)

            # exclude masks 25 (COMBINEREJ), 23 (BRIGHTSKY)?
            if iexp % 2 == 1:
                civodd = sp.bincount(bins, weights=ivexp * (mask & 2**25 == 0))
                cflodd = sp.bincount(bins,
                                     weights=ivexp * flexp *
                                     (mask & 2**25 == 0))
                fltotodd[:civodd.size - 1] += cflodd[:-1]
                ivtotodd[:civodd.size - 1] += civodd[:-1]
            else:
                civeven = sp.bincount(bins,
                                      weights=ivexp * (mask & 2**25 == 0))
                cfleven = sp.bincount(bins,
                                      weights=ivexp * flexp *
                                      (mask & 2**25 == 0))
                fltoteven[:civeven.size - 1] += cfleven[:-1]
                ivtoteven[:civeven.size - 1] += civeven[:-1]

    w = ivtotodd > 0
    fltotodd[w] /= ivtotodd[w]
    w = ivtoteven > 0
    fltoteven[w] /= ivtoteven[w]

    alpha = 1
    if (nexp_per_col % 2 == 1):
        n_even = (nexp_per_col - 1) // 2
        alpha = sp.sqrt(4. * n_even * (n_even + 1)) / nexp_per_col
    diff = 0.5 * (fltoteven -
                  fltotodd) * alpha  ### CHECK THE * alpha (Nathalie)

    return diff
Example #12
def region_size(im):
    r"""
    Replace each voxel with size of region to which it belongs

    Parameters
    ----------
    im : ND-array
        Either a boolean image with ``True`` indicating the features of
        interest, in which case ``scipy.ndimage.label`` will be applied to
        find regions, or a greyscale image with integer values indicating
        regions.

    Returns
    -------
    image : ND-array
        A copy of ``im`` with each voxel value indicating the size of the
        region to which it belongs.  This is particularly useful for finding
        chord sizes on the image produced by ``apply_chords``.
    """
    if im.dtype == bool:
        im = spim.label(im)[0]
    counts = sp.bincount(im.flatten())
    counts[0] = 0
    chords = counts[im]
    return chords
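
A hedged usage sketch for region_size on a tiny hypothetical boolean image: scipy.ndimage.label assigns region labels, bincount counts the voxels per label, and indexing the counts with the label image maps every voxel to the size of its region (background stays 0).

import numpy as np
import scipy.ndimage as spim

im = np.array([[1, 1, 0, 0],
               [1, 0, 0, 1],
               [0, 0, 1, 1]], dtype=bool)   # hypothetical image with two 3-voxel regions

labels = spim.label(im)[0]                  # integer label per connected region
counts = np.bincount(labels.flatten())      # voxels per label (index 0 = background)
counts[0] = 0
sizes = counts[labels]                      # same shape as im: voxel -> size of its region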
Example #13
def update_kinship(self, removed_snps, full_kinship, full_indivs, full_num_snps, retained_indivs, kinship_type='ibs',
                   snps_data_format='binary', snp_dtype='int8', dtype='single'):
    assert kinship_type == 'ibs', 'Only IBS kinships can be updated at the moment'
    #Cut full kinship
    cut_kinship = prepare_k(full_kinship, full_indivs, retained_indivs)
    num_lines = cut_kinship.shape[0]
    k_mat = sp.zeros((num_lines, num_lines), dtype=dtype)
    num_snps = len(removed_snps)
    snps_array = sp.array(removed_snps, dtype=snp_dtype)
    snps_array = snps_array.T
    if snps_data_format == 'diploid_int':
        for i in range(num_lines):
            for j in range(i):
                bin_counts = sp.bincount(sp.absolute(snps_array[j] - snps_array[i]))
                if len(bin_counts) > 1:
                    k_mat[i, j] += (bin_counts[0] + 0.5 * bin_counts[1])
                else:
                    k_mat[i, j] += bin_counts[0]
                k_mat[j, i] = k_mat[i, j]
    elif snps_data_format == 'binary':
        sm = sp.mat(snps_array * 2.0 - 1.0)
        k_mat = k_mat + sm * sm.T
    else:
        raise NotImplementedError
    if self.data_format == 'diploid_int':
        k_mat = k_mat / float(num_snps) + sp.eye(num_lines)
    elif self.data_format == 'binary':
        k_mat = k_mat / (2 * float(num_snps)) + 0.5

    updated_k = (cut_kinship * full_num_snps - k_mat * num_snps) / (full_num_snps - num_snps)
    return updated_k
Example #14
def get_phenotypes(plinkf, debug=False):
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        if debug:
            print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        if debug:
            print('Found quantitative phenotype values')
        has_phenotype = True
    return {
        'has_phenotype': has_phenotype,
        'fids': fids,
        'iids': iids,
        'phenotypes': Y,
        'num_individs': num_individs
    }
Example #15
def _hough_transform(img, angles):
    rows, cols = img.shape

    # determine the number of bins
    d = sp.ceil(sp.hypot(*img.shape))
    nr_bins = 2 * d
    bins = sp.linspace(-d, d, nr_bins)

    # create the accumulator
    out = sp.zeros((nr_bins, len(angles)), dtype=sp.float64)

    # compute the sines/cosines
    cos_theta = sp.cos(angles)
    sin_theta = sp.sin(angles)

    # construct the x and y values
    y = []
    x = []
    for i in xrange(rows):
        y += [i] * cols
        x += range(cols)
    y = sp.array(y)
    x = sp.array(x)

    # flatten image
    flattened_img = img.flatten()

    for i, (c, s) in enumerate(zip(cos_theta, sin_theta)):
        distances = x * c + y * s
        bin_indices = (sp.round_(distances) - bins[0]).astype(sp.uint8)
        bin_sums = sp.bincount(bin_indices, flattened_img)
        out[:len(bin_sums), i] = bin_sums

    return out
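
The inner loop above is a weighted histogram: for each angle, bincount sums the flattened pixel intensities that fall into each distance bin. A small hedged sketch of that accumulation step with hypothetical bin indices and pixel values:

import numpy as np

distances_binned = np.array([0, 2, 2, 5, 5, 5])    # hypothetical bin index per pixel
pixel_values = np.array([1., 0., 3., 2., 2., 1.])  # hypothetical flattened image
column = np.bincount(distances_binned, weights=pixel_values)  # [1. 0. 3. 0. 0. 5.]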
Example #16
 def _calc_ibs_kinship_(self, dtype='single', chunk_size=None):
     n_snps = self.num_snps()
     n_indivs = self.num_individs()
     if chunk_size is None:
         chunk_size = n_indivs
     #print 'Allocating K matrix'
     k_mat = sp.zeros((n_indivs, n_indivs), dtype=dtype)
     #print 'Starting calculation'
     i = 0
     snps_chunks = self.snps_chunks(chunk_size)
     for snps_chunk in snps_chunks: #FINISH!!!
         i += len(snps_chunk)
         snps_array = snps_chunk.T
         if self.data_format == 'diploid_int':
             for i in range(n_indivs):
                 for j in range(i):
                     bin_counts = sp.bincount(sp.absolute(snps_array[j] - snps_array[i]))
                     if len(bin_counts) > 1:
                         k_mat[i, j] += (bin_counts[0] + 0.5 * bin_counts[1])
                     else:
                         k_mat[i, j] += bin_counts[0]
                     k_mat[j, i] = k_mat[i, j]
         elif self.data_format == 'binary':
             sm = sp.mat(snps_array * 2.0 - 1.0)
             k_mat = k_mat + sm * sm.T
         sys.stdout.write('\b\b\b\b\b\b%0.1f%%' % (100.0 * i / n_snps))
         sys.stdout.flush()
     if self.data_format == 'diploid_int':
         k_mat = k_mat / float(n_snps) + sp.eye(n_indivs)
     elif self.data_format == 'binary':
         k_mat = k_mat / (2 * float(n_snps)) + 0.5
     return k_mat
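
In the 'diploid_int' branch above, the IBS contribution for a pair of individuals comes from bincounting the absolute genotype differences: a difference of 0 counts fully and a difference of 1 counts half. A hedged sketch for two hypothetical individuals coded 0/1/2:

import numpy as np

snps_i = np.array([0, 1, 2, 2, 0])   # hypothetical genotypes, individual i
snps_j = np.array([0, 2, 2, 1, 2])   # hypothetical genotypes, individual j

bin_counts = np.bincount(np.abs(snps_j - snps_i))   # [#identical, #differ by 1, #differ by 2]
if len(bin_counts) > 1:
    ibs = bin_counts[0] + 0.5 * bin_counts[1]
else:
    ibs = bin_counts[0]
# here bin_counts == [2, 2, 1], so ibs == 3.0 out of 5 SNPs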
Example #17
File: func.py Project: SamYoules/QSOStacks
def stack_flux(data, delta):
    '''Make a weighted sum of flux/delta values in wavelength bins.'''

    nstack = int((forest.lmax - forest.lmin) / forest.dll) + 1
    ll = forest.lmin + sp.arange(nstack) * forest.dll
    st = sp.zeros(nstack)
    wst = sp.zeros(nstack)
    data_bad_cont = []

    # Stack flux & weights, or deltas & weights
    for d in data:
        if d.bad_cont is not None:
            data_bad_cont.append(d)
            continue

        bins=((d.ll - d.lmin) / d.dll + 0.5).astype(int)
        eta = forest.eta(d.ll)
        var_lss = forest.var_lss(d.ll)
        fudge = forest.fudge(d.ll)

        if (delta == 0):
            # convert ivar into normalized ivar (going from flux units to F units)
            ivar_F = d.iv * d.co**2

            # correct this variance, adding the var_lss and eta factors
            var_F = 1./ivar_F
            var_F_tot = var_F*eta + var_lss + fudge/var_F

            # convert back to flux units
            var_flux_tot = var_F_tot * d.co**2 
            we = 1./var_flux_tot
            c = sp.bincount(bins, weights = d.fl * we)
        else:
            iv = d.iv / eta
            we = iv * d.co**2 / (iv * d.co**2 * var_lss + 1)
            c = sp.bincount(bins, weights = (d.fl/d.co - 1) * we)

        st[:len(c)] += c
        c = sp.bincount(bins, weights = we)
        wst[:len(c)] += c

    w = wst>0
    st[w] /= wst[w]
    for d in data_bad_cont:
        print ("rejected {} due to {}\n".format(d.thid,d.bad_cont))

    return ll, st, wst
Example #18
    def getNumRequestsEachBus(self):
        # Assertion
        assert Solution.totalBuses is not None

        if self._numRequestsEachBus is None:
            self._numRequestsEachBus = scipy.bincount(self.getBusEachRequest(), minlength=Solution.totalBuses)

        return self._numRequestsEachBus
Example #19
File: rf_util.py Project: wuyi0614/X-BERT
 def sorted_csr_from_coo(shape, row_idx, col_idx, val, only_topk=None):
     m = (sp.absolute(val).sum() + 1) * 3
     sorted_idx = sp.argsort(row_idx * m - val)
     row_idx[:] = row_idx[sorted_idx]
     col_idx[:] = col_idx[sorted_idx]
     val[:] = val[sorted_idx]
     indptr = sp.cumsum(sp.bincount(row_idx + 1, minlength=(shape[0] + 1)))
     if only_topk is not None and isinstance(only_topk, int):
         only_topk = max(min(1, only_topk), only_topk)
         selected_idx = (sp.arange(len(val)) - indptr[row_idx]) < only_topk
         row_idx = row_idx[selected_idx]
         col_idx = col_idx[selected_idx]
         val = val[selected_idx]
     indptr = sp.cumsum(sp.bincount(row_idx + 1, minlength=(shape[0] + 1)))
     return smat.csr_matrix((val, col_idx, indptr),
                            shape=shape,
                            dtype=val.dtype)
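
The indptr lines above use a compact trick: bincounting the row indices shifted by one (with minlength covering every row) and taking the cumulative sum yields the CSR row-pointer array, including empty rows. A hedged sketch with hypothetical COO row indices for a 3-row matrix:

import numpy as np

row = np.array([0, 0, 2, 2, 2])   # hypothetical row index of each nonzero
nrows = 3

indptr = np.cumsum(np.bincount(row + 1, minlength=nrows + 1))
# indptr == [0, 2, 2, 5]: row 0 holds entries 0..1, row 1 is empty, row 2 holds 2..4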
Example #20
    def __add__(self, d):

        if not hasattr(self, 'll') or not hasattr(d, 'll'):
            return self

        dic = {
        }  # this should contain all quantities that are to be coadded with ivar weighting

        ll = sp.append(self.ll, d.ll)
        dic['fl'] = sp.append(self.fl, d.fl)
        iv = sp.append(self.iv, d.iv)

        if self.mmef is not None:
            dic['mmef'] = sp.append(self.mmef, d.mmef)
        if self.diff is not None:
            dic['diff'] = sp.append(self.diff, d.diff)
        if self.reso is not None:
            dic['reso'] = sp.append(self.reso, d.reso)

        bins = sp.floor((ll - forest.lmin) / forest.dll + 0.5).astype(int)
        cll = forest.lmin + sp.arange(bins.max() + 1) * forest.dll
        civ = sp.zeros(bins.max() + 1)
        cciv = sp.bincount(bins, weights=iv)
        civ[:len(cciv)] += cciv
        w = (civ > 0.)
        self.ll = cll[w]
        self.iv = civ[w]

        for k, v in dic.items():
            cnew = sp.zeros(bins.max() + 1)
            ccnew = sp.bincount(bins, weights=iv * v)
            cnew[:len(ccnew)] += ccnew
            setattr(self, k, cnew[w] / civ[w])

        # recompute means of quality variables
        if self.reso is not None:
            self.mean_reso = self.reso.mean()
        err = 1. / sp.sqrt(self.iv)
        SNR = self.fl / err
        self.mean_SNR = SNR.mean()
        lam_lya = constants.absorber_IGM["LYA"]
        self.mean_z = (sp.power(10., ll[len(ll) - 1]) +
                       sp.power(10., ll[0])) / 2. / lam_lya - 1.0

        return self
Example #21
File: xcf.py Project: iprafols/picca
def fill_dmat(l1, r1, rdm1, z1, w1, r2, rdm2, z2, w2, ang, wdm, dm, rpeff,
              rteff, zeff, weff):
    rp = (r1[:, None] - r2) * sp.cos(ang / 2)
    rt = (rdm1[:, None] + rdm2) * sp.sin(ang / 2)
    z = (z1[:, None] + z2) / 2.
    w = (rp > rp_min) & (rp < rp_max) & (rt < rt_max)

    bp = ((rp - rp_min) / (rp_max - rp_min) * np).astype(int)
    bt = (rt / rt_max * nt).astype(int)
    bins = bt + nt * bp
    bins = bins[w]

    m_bp = ((rp - rp_min) / (rp_max - rp_min) * npm).astype(int)
    m_bt = (rt / rt_max * ntm).astype(int)
    m_bins = m_bt + ntm * m_bp
    m_bins = m_bins[w]

    sw1 = w1.sum()
    ml1 = sp.average(l1, weights=w1)

    dl1 = l1 - ml1

    slw1 = (w1 * dl1**2).sum()

    n1 = len(l1)
    n2 = len(r2)
    ij = sp.arange(n1)[:, None] + n1 * sp.arange(n2)
    ij = ij[w]

    we = w1[:, None] * w2
    we = we[w]
    c = sp.bincount(bins, weights=we)
    wdm[:len(c)] += c
    eta2 = sp.zeros(npm * ntm * n2)
    eta4 = sp.zeros(npm * ntm * n2)

    c = sp.bincount(m_bins, weights=we * rp[w])
    rpeff[:c.size] += c
    c = sp.bincount(m_bins, weights=we * rt[w])
    rteff[:c.size] += c
    c = sp.bincount(m_bins, weights=we * z[w])
    zeff[:c.size] += c
    c = sp.bincount(m_bins, weights=we)
    weff[:c.size] += c

    c = sp.bincount((ij - ij % n1) // n1 + n2 * m_bins,
                    weights=(w1[:, None] * sp.ones(n2))[w] / sw1)
    eta2[:len(c)] += c
    c = sp.bincount((ij - ij % n1) // n1 + n2 * m_bins,
                    weights=((w1 * dl1)[:, None] * sp.ones(n2))[w] / slw1)
    eta4[:len(c)] += c

    ubb = sp.unique(m_bins)
    for k, (ba, m_ba) in enumerate(zip(bins, m_bins)):
        dm[m_ba + npm * ntm * ba] += we[k]
        i = ij[k] % n1
        j = (ij[k] - i) // n1
        for bb in ubb:
            dm[bb + npm * ntm *
               ba] -= we[k] * (eta2[j + n2 * bb] + eta4[j + n2 * bb] * dl1[i])
Example #22
 def __init__(self, kdim, depth, algo, seed, codes):
     assert(kdim == 2)
     self.kdim = kdim
     self.depth = depth
     self.algo = algo
     self.seed = seed
     self.codes = codes
     self.indptr = sp.cumsum(sp.bincount(codes + 1, minlength=(self.nr_codes + 1)), dtype=sp.uint64)
     self.indices = sp.argsort(codes * sp.float64(self.nr_elements) + sp.arange(self.nr_elements))
Example #23
 def is_near_constant(self, pid, min_num_diff=10):
     vals = sp.array(self.phen_dict[pid]["values"])
     if sp.std(vals) > 0:
         vals = 50 * (vals - sp.mean(vals)) / sp.std(vals)
         vals = vals - vals.min() + 0.1
         b_counts = sp.bincount(sp.array(sp.around(vals), dtype="int"))
         b = b_counts.max() > len(vals) - min_num_diff
         return b
     else:
         return True
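
The test above standardises the values, shifts them positive, rounds to integers, and uses bincount to check whether a single rounded value accounts for all but roughly min_num_diff observations. A hedged sketch with hypothetical phenotype values:

import numpy as np

vals = np.array([5.0, 5.0, 5.0, 5.0, 5.0, 9.0])   # hypothetical phenotype values
min_num_diff = 2

vals = 50 * (vals - vals.mean()) / vals.std()
vals = vals - vals.min() + 0.1
b_counts = np.bincount(np.around(vals).astype(int))
near_constant = b_counts.max() > len(vals) - min_num_diff   # True: one rounded value covers 5 of 6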
Example #24
File: phenotype.py Project: timeu/PyGWAS
 def is_near_constant(self, min_num_diff=10):
     vals = sp.array(self.values)
     if sp.std(vals) > 0:
         vals = 50 * (vals - sp.mean(vals)) / sp.std(vals)
         vals = vals - vals.min() + 0.1
         b_counts = sp.bincount(sp.array(sp.around(vals), dtype='int'))
         b = b_counts.max() > len(vals) - min_num_diff
         return b
     else:
         return True
Example #25
 def is_near_constant(self, min_num_diff=10):
     vals = sp.array(self.values)
     if sp.std(vals) > 0:
         vals = 50 * (vals - sp.mean(vals)) / sp.std(vals)
         vals = vals - vals.min() + 0.1
         b_counts = sp.bincount(sp.array(sp.around(vals), dtype='int'))
         b = b_counts.max() > len(vals) - min_num_diff
         return b
     else:
         return True
Example #26
    def plot_marker_box_plot(self, pid, marker, m_accessions, m_position=None, m_chromosome=None, plot_file=None,
                plot_format='png', title=None, m_score=None):
        """
        Plots a box plot for the given binary marker and phenotype. 
        
        Assumes the marker is integer based.        
        Assumes the marker and the phenotype accessions are aligned.
        """
        phen_vals = self.get_values(pid)
        if len(m_accessions) != len(phen_vals):
            raise Exception

        nt_counts = sp.bincount(marker)
        if len(nt_counts) > 2:
            import warnings
            warnings.warn("More than 2 alleles, box-plot might be wrong?")

        allele_phen_val_dict = {}
        for nt in set(marker):
            allele_phen_val_dict[nt] = {'values':[], 'ecotypes':[]}

        for i, nt in enumerate(marker):
            allele_phen_val_dict[nt]['values'].append(phen_vals[i])
            if m_accessions:
                allele_phen_val_dict[nt]['ecotypes'].append(m_accessions[i])

        xs = []
        positions = []
        for nt in allele_phen_val_dict:
            positions.append(nt)
            xs.append(allele_phen_val_dict[nt]['values'])
        plt.figure()
        plt.boxplot(xs, positions=positions)
        min_val = min(phen_vals)
        max_val = max(phen_vals)
        val_range = max_val - min_val
        max_pos = max(positions)
        min_pos = min(positions)
        x_range = max_pos - min_pos
        plt.axis([min_pos - 0.5 * x_range, max_pos + 0.5 * x_range, min_val - val_range * 0.3, max_val + val_range * 0.3])
        plt.text(min_pos - 0.45 * x_range, min_val - 0.15 * val_range, "# of obs.: ", color='k')
        for i, (x, pos) in enumerate(it.izip(xs, positions)):
            plt.text(pos - 0.05, min_val - 0.15 * val_range, str(len(xs[i])), color='k')
        if m_score:
            plt.text(min_pos + 0.13 * x_range, max_val + 0.15 * val_range,
                '$-log_{10}$(p-value)/score: %0.2f' % m_score, color='k')
        if title:
            plt.title(title)
        elif m_chromosome and m_position:
            plt.title('%s : chromosome=%d, position=%d' % (self.get_name(pid), m_chromosome, m_position))
        if plot_file:
            plt.savefig(plot_file, format=plot_format)
        else:
            plt.show()
        plt.clf()
Example #27
    def plot_marker_box_plot(self, pid, marker, m_accessions, m_position=None, m_chromosome=None, plot_file=None,
                plot_format='png', title=None, m_score=None):
        """
        Plots a box plot for the given binary marker and phenotype. 
        
        Assumes the marker is integer based.        
        Assumes the marker and the phenotype accessions are aligned.
        """
        phen_vals = self.get_values(pid)
        if len(m_accessions) != len(phen_vals):
            raise Exception

        nt_counts = sp.bincount(marker)
        if len(nt_counts) > 2:
            import warnings
            warnings.warn("More than 2 alleles, box-plot might be wrong?")

        allele_phen_val_dict = {}
        for nt in set(marker):
            allele_phen_val_dict[nt] = {'values':[], 'ecotypes':[]}

        for i, nt in enumerate(marker):
            allele_phen_val_dict[nt]['values'].append(phen_vals[i])
            if m_accessions:
                allele_phen_val_dict[nt]['ecotypes'].append(m_accessions[i])

        xs = []
        positions = []
        for nt in allele_phen_val_dict:
            positions.append(nt)
            xs.append(allele_phen_val_dict[nt]['values'])
        plt.figure()
        plt.boxplot(xs, positions=positions)
        min_val = min(phen_vals)
        max_val = max(phen_vals)
        val_range = max_val - min_val
        max_pos = max(positions)
        min_pos = min(positions)
        x_range = max_pos - min_pos
        plt.axis([min_pos - 0.5 * x_range, max_pos + 0.5 * x_range, min_val - val_range * 0.3, max_val + val_range * 0.3])
        plt.text(min_pos - 0.45 * x_range, min_val - 0.15 * val_range, "# of obs.: ", color='k')
        for i, (x, pos) in enumerate(it.izip(xs, positions)):
            plt.text(pos - 0.05, min_val - 0.15 * val_range, str(len(xs[i])), color='k')
        if m_score:
            plt.text(min_pos + 0.13 * x_range, max_val + 0.15 * val_range,
                '$-log_{10}$(p-value)/score: %0.2f' % m_score, color='k')
        if title:
            plt.title(title)
        elif m_chromosome and m_position:
            plt.title('%s : chromosome=%d, position=%d' % (self.get_name(pid), m_chromosome, m_position))
        if plot_file:
            plt.savefig(plot_file, format=plot_format)
        else:
            plt.show()
        plt.clf()
Example #28
        def weight_angular(catalogue, nside=nside):

            self.logger.info('Angular integral constraint.')

            import healpy
            pixarea = healpy.nside2pixarea(nside, degrees=True)
            npix = healpy.nside2npix(nside)
            self.logger.info(
                'Pixels with nside = {:d}: {:.1f} square degree ({:d}).'.
                format(nside, pixarea, npix))

            #weights
            theta, phi = healpy.vec2ang(catalogue['Position'])
            ra, dec = phi / constants.degree, 90. - theta / constants.degree
            self.logger.info(
                'RA x DEC: [{:.1f}, {:.1f}] x [{:.1f}, {:.1f}].'.format(
                    ra.min(), ra.max(), dec.min(), dec.max()))
            pix = healpy.ang2pix(nside, theta, phi, nest=False)
            counts = scipy.bincount(pix, minlength=npix)
            mask = counts > 0
            nbins = mask.sum()
            self.logger.info(
                'There are {:d} pixels with an average of {:.1f} objects.'.
                format(nbins,
                       len(catalogue) * 1. / nbins))
            pixtoibin = -scipy.ones((npix), dtype=scipy.int64)
            pixtoibin[mask] = scipy.arange(nbins)

            for iaddbin in range(catalogue.attrs['naddbins']):
                mask = catalogue['iaddbin'] == iaddbin
                wcounts = scipy.bincount(pix[mask],
                                         weights=catalogue['Weight'][mask])
                catalogue['Weight'][mask] /= wcounts[pix[mask]]

            attrs = {'nside': nside, 'nbins': nbins}

            def bin(catalogue):
                theta, phi = healpy.vec2ang(catalogue['Position'])
                pix = healpy.ang2pix(nside, theta, phi, nest=False)
                return pixtoibin[pix]

            return attrs, bin
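
The per-pixel normalisation above divides each object's weight by the summed weight of all objects sharing its pixel, so every occupied pixel ends up carrying unit total weight. A hedged sketch with hypothetical pixel assignments in place of the healpy output:

import numpy as np

pix = np.array([0, 0, 1, 3, 3, 3])            # hypothetical pixel index per object
weight = np.array([1., 2., 1., 1., 1., 2.])   # hypothetical object weights
npix = 4

wcounts = np.bincount(pix, minlength=npix, weights=weight)  # [3. 1. 0. 4.]
weight /= wcounts[pix]   # per-pixel weight totals are now 1 for every occupied pixel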
Example #29
 def get_mafs(self):
     macs = []
     mafs = []
     num_nts = len(self.accessions)
     if self.data_format in ['binary', 'int']:
         for snp in self.get_snps_iterator():
             l = scipy.bincount(snp)
             mac = min(l)
             macs.append(mac)
             mafs.append(mac / float(num_nts))
     elif self.data_format == 'diploid_int':
         for snp in self.get_snps_iterator():
             bin_counts = scipy.bincount(snp, minlength=3)
             l = scipy.array([bin_counts[0], bin_counts[2]]) + bin_counts[1] / 2.0
             mac = l.min()
             macs.append(mac)
             mafs.append(mac / float(num_nts))
     else:
         raise NotImplementedError
     return {"macs":macs, "mafs":mafs}
Example #30
File: genotype.py Project: timeu/PyGWAS
 def get_mafs(self):
     macs = []
     mafs = []
     num_nts = len(self.accessions)
     if self.data_format in ['binary', 'int']:
         for snp in self.get_snps_iterator():
             l = scipy.bincount(snp)
             mac = min(l)
             macs.append(mac)
             mafs.append(mac / float(num_nts))
     elif self.data_format == 'diploid_int':
         for snp in self.get_snps_iterator():
             bin_counts = scipy.bincount(snp, minlength=3)
             l = scipy.array([bin_counts[0], bin_counts[2]]) + bin_counts[1] / 2.0
             mac = l.min()
             macs.append(mac)
             mafs.append(mac / float(num_nts))
     else:
         raise NotImplementedError
     return {"macs":macs, "mafs":mafs}
Example #31
File: rf_util.py Project: wuyi0614/X-BERT
 def coo_to_csr(coo):
     nr_rows, nr_cols, nnz, row, col, val = \
             coo.shape[0], coo.shape[1], coo.data.shape[0], coo.row, coo.col, coo.data
     indptr = sp.cumsum(sp.bincount(row + 1,
                                    minlength=(nr_rows + 1)),
                        dtype=sp.uint64)
     indices = sp.zeros(nnz, dtype=sp.uint32)
     data = sp.zeros(nnz, dtype=dtype)
     sorted_idx = sp.argsort(row * sp.float64(nr_cols) + col)
     indices[:] = col[sorted_idx]
     data[:] = val[sorted_idx]
     return indptr, indices, data
Example #32
def mc(data):
    nmc = 100
    mcont = sp.zeros(nmc)
    wcont = sp.zeros(nmc)
    ll = forest.lmin_rest + (sp.arange(nmc) + .5) * (forest.lmax_rest -
                                                     forest.lmin_rest) / nmc
    for p in data:
        for d in data[p]:
            bins = ((d.ll - forest.lmin_rest - sp.log10(1 + d.zqso)) /
                    (forest.lmax_rest - forest.lmin_rest) * nmc).astype(int)
            var_lss = forest.var_lss(d.ll)
            we = d.iv / var_lss * d.co**2 / (d.iv + d.co**2 / var_lss)
            c = sp.bincount(bins, weights=d.fl / d.co * we)
            mcont[:len(c)] += c
            c = sp.bincount(bins, weights=we)
            wcont[:len(c)] += c

    w = wcont > 0
    mcont[w] /= wcont[w]
    mcont /= mcont.mean()
    return ll, mcont
Example #33
File: xcf.py Project: iprafols/picca
def fast_xcf(z1, r1, rdm1, w1, d1, z2, r2, rdm2, w2, ang):
    if ang_correlation:
        rp = r1[:, None] / r2
        rt = ang * sp.ones_like(rp)
    else:
        rp = (r1[:, None] - r2) * sp.cos(ang / 2)
        rt = (rdm1[:, None] + rdm2) * sp.sin(ang / 2)
    z = (z1[:, None] + z2) / 2

    we = w1[:, None] * w2
    wde = (w1 * d1)[:, None] * w2

    w = (rp > rp_min) & (rp < rp_max) & (rt < rt_max)
    rp = rp[w]
    rt = rt[w]
    z = z[w]
    we = we[w]
    wde = wde[w]

    bp = ((rp - rp_min) / (rp_max - rp_min) * np).astype(int)
    bt = (rt / rt_max * nt).astype(int)
    bins = bt + nt * bp

    cd = sp.bincount(bins, weights=wde)
    cw = sp.bincount(bins, weights=we)
    crp = sp.bincount(bins, weights=rp * we)
    crt = sp.bincount(bins, weights=rt * we)
    cz = sp.bincount(bins, weights=z * we)
    cnb = sp.bincount(bins, weights=(we > 0.))

    return cw, cd, crp, crt, cz, cnb
Example #34
 def filter_mac_snps(self, min_mac=10):
     """
     Removes SNPs from the data which are have low macs.
     """
     snps_ix = []
     num_snps = self.num_snps
     for i,snp in enumerate(self.get_snps_iterator()):
         if self.data_format in ['binary', 'int']:
             l = scipy.bincount(snp)
             mac = l.min()
         elif self.data_format == 'diploid_int':
             bin_counts = scipy.bincount(snp, minlength=3)
             l = scipy.array([bin_counts[0], bin_counts[2]]) + bin_counts[1] / 2.0
             mac = l.min()
         else:
             mac=0
         if mac < min_mac:
             snps_ix.append(i)
     numRemoved = len(snps_ix)
     self.filter_snps_ix(snps_ix)
     log.info("Removed %d SNPs with mac below %d, out of %d SNPs in total." % (numRemoved, min_mac, num_snps))
     return (num_snps, numRemoved)
Example #35
    def convert_codes_to_csc_matrix(codes, depth):
        nr_codes = 1 << depth
        nr_elements = len(codes)

        indptr = sp.cumsum(sp.bincount(codes + 1, minlength=(nr_codes + 1)),
                           dtype=sp.uint64)
        indices = sp.argsort(codes * sp.float64(nr_elements) +
                             sp.arange(nr_elements))
        C = smat.csc_matrix(
            (sp.ones_like(indices, dtype=sp.float32), indices, indptr),
            shape=(nr_elements, nr_codes),
        )
        return C
Example #36
 def vertex_degrees(self):
     """Computes vertex degrees diagonal matrix
     d(v)=sum(w(e)), where e in E, v in e
     
     Returns
     -------
     d_v: sparse diagonal matrix
         sparse diagonal vertex degree matrix
     """
     return spsp.diags(
         sp.bincount(self.edge_list.flatten(),
                     weights=sp.array([[i] * self.k
                                       for i in self.weights]).flatten()))
Example #37
File: mnist.py Project: suji0131/MNIST
 def feval(self, x, average=True):
     '''I'm considering the opt problem for the parameters of each digit
     independently, i.e. I have ten opt problems to solve.
     loss_fn is a 10x1 matrix or vector.'''
     data = self.data.train
     loss_fn = self.funcEval(x, data)
     if average == True:
         n_vals = sp.bincount(self.data.train[1]).astype(
             float)  #counts no of 1's 2's ..... in the dataset
         loss_fn = sp.divide(loss_fn, sp.reshape(n_vals, sp.shape(loss_fn)))
         return loss_fn
     else:
         return loss_fn
Example #38
def calc_ibs_kinship(snps,
                     snps_data_format='binary',
                     snp_dtype='int8',
                     dtype='single',
                     chunk_size=None,
                     scaled=True):
    """
    Calculates IBS kinship
    
    data_format: two are currently supported, 'binary', and 'diploid_int'
    """
    num_snps = len(snps)
    # print 'Allocating K matrix'
    num_lines = len(snps[0])
    if chunk_size == None:
        chunk_size = num_lines
    k_mat = sp.zeros((num_lines, num_lines), dtype=dtype)
    # print 'Starting calculation'
    chunk_i = 0
    for snp_i in range(0, num_snps, chunk_size):  #FINISH!!!
        chunk_i += 1
        snps_array = sp.array(snps[snp_i:snp_i + chunk_size], dtype=snp_dtype)
        snps_array = snps_array.T
        if snps_data_format == 'diploid_int':
            for i in range(num_lines):
                for j in range(i):
                    bin_counts = sp.bincount(
                        sp.absolute(snps_array[j] - snps_array[i]))
                    if len(bin_counts) > 1:
                        k_mat[i, j] += (bin_counts[0] + 0.5 * bin_counts[1])
                    else:
                        k_mat[i, j] += bin_counts[0]
                    k_mat[j, i] = k_mat[i, j]
        elif snps_data_format == 'binary':
            sm = sp.mat(snps_array * 2.0 - 1.0)
            k_mat = k_mat + sm * sm.T
        else:
            raise NotImplementedError
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' %
                         (100.0 *
                          (min(1, ((chunk_i + 1.0) * chunk_size) / num_snps))))
        sys.stdout.flush()
    print ''
    if snps_data_format == 'diploid_int':
        k_mat = k_mat / float(num_snps) + sp.eye(num_lines)
    elif snps_data_format == 'binary':
        k_mat = k_mat / (2 * float(num_snps)) + 0.5
    if scaled:
        k_mat = scale_k(k_mat)
    return k_mat
Example #39
def calc_ibs_kinship(snps, snps_data_format='binary', snp_dtype='int8', dtype='single',
                     chunk_size=None, scaled=True):
    """
    Calculates IBS kinship
    
    data_format: two are currently supported, 'binary', and 'diploid_int'
    """
    num_snps = len(snps)
    #print 'Allocating K matrix'
    num_lines = len(snps[0])
    if chunk_size == None:
        chunk_size = num_lines
    k_mat = sp.zeros((num_lines, num_lines), dtype=dtype)
    #print 'Starting calculation'
    chunk_i = 0
    for snp_i in range(0, num_snps, chunk_size): #FINISH!!!
        chunk_i += 1
        snps_array = sp.array(snps[snp_i:snp_i + chunk_size], dtype=snp_dtype)
        snps_array = snps_array.T
        if snps_data_format == 'diploid_int':
            for i in range(num_lines):
                for j in range(i):
                    bin_counts = sp.bincount(sp.absolute(snps_array[j] - snps_array[i]))
                    if len(bin_counts) > 1:
                        k_mat[i, j] += (bin_counts[0] + 0.5 * bin_counts[1])
                    else:
                        k_mat[i, j] += bin_counts[0]
                    k_mat[j, i] = k_mat[i, j]
        elif snps_data_format == 'binary':
            sm = sp.mat(snps_array * 2.0 - 1.0)
            k_mat = k_mat + sm * sm.T
        else:
            raise NotImplementedError
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (min(1, ((chunk_i + 1.0) * chunk_size) / num_snps))))
        sys.stdout.flush()
    print ''
    if snps_data_format == 'diploid_int':
        k_mat = k_mat / float(num_snps) + sp.eye(num_lines)
    elif snps_data_format == 'binary':
        k_mat = k_mat / (2 * float(num_snps)) + 0.5
    if scaled:
        k_mat = scale_k(k_mat)
    return k_mat
Example #40
File: kinship.py Project: timeu/PyGWAS
def calc_ibs_kinship(genotype, snp_dtype='int8', dtype='single',chunk_size=None):
    """
    Calculates IBS kinship
    
    data_format: two are currently supported, 'binary', and 'diploid_int'
    """
    num_snps = genotype.num_snps
    num_lines = len(genotype.accessions)
    if chunk_size == None:
        chunk_size = num_lines
    k_mat = sp.zeros((num_lines, num_lines), dtype=dtype)
    log.info('Starting calculation of IBS kinship')
    chunk_i = 0
    snps = genotype.get_snps_iterator(is_chunked=True,chunk_size=chunk_size)
    snps_data_format = genotype.data_format
    for snps_chunk in snps:
        chunk_i += 1
        snps_array = sp.array(snps_chunk, dtype=snp_dtype)
        snps_array = snps_array.T
        if snps_data_format == 'diploid_int':
            for i in range(num_lines):
                for j in range(i):
                    bin_counts = sp.bincount(sp.absolute(snps_array[j] - snps_array[i]))
                    if len(bin_counts) > 1:
                        k_mat[i, j] += (bin_counts[0] + 0.5 * bin_counts[1])
                    else:
                        k_mat[i, j] += bin_counts[0]
                    k_mat[j, i] = k_mat[i, j]
        elif snps_data_format == 'binary':
            sm = sp.mat(snps_array * 2.0 - 1.0)
            k_mat = k_mat + sm * sm.T
        else:
            raise NotImplementedError
        log.debug('%0.2f%%' % (100.0 * (min(1, ((chunk_i + 1.0) * chunk_size) / num_snps))))
    if snps_data_format == 'diploid_int':
        k_mat = k_mat / float(num_snps) + sp.eye(num_lines)
    elif snps_data_format == 'binary':
        k_mat = k_mat / (2 * float(num_snps)) + 0.5
    log.info('Finished calculation')
    return k_mat
Example #41
    def run(self, nbins=25):
        r"""
        Computes the pore size function of the image.

        This method calculates the distance transform of the void space, then
        computes a histogram of the occurrences of each distance value.

        Parameters
        ----------
        nbins : int
            The number of bins into which the distance values should be sorted.
            The default is 25.

        """
        temp_img = spim.distance_transform_edt(self.image)
        dvals = temp_img[self.image].flatten()
        rmax = sp.amax(dvals)
        bins = sp.linspace(1, rmax, nbins)
        binned = sp.digitize(x=dvals, bins=bins)
        vals = namedtuple('PoreSizeFunction', ('distance', 'frequency'))
        vals.distance = bins
        vals.frequency = sp.bincount(binned, minlength=nbins)[1:]
        return vals
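
The method above builds its histogram by digitizing the distance values into bin edges and then counting occurrences per bin with bincount. A hedged sketch of that digitize-plus-bincount step with hypothetical distances:

import numpy as np

dvals = np.array([1.2, 2.7, 3.1, 3.3, 4.9])   # hypothetical distance-transform values
nbins = 5
bins = np.linspace(1, 5, nbins)               # hypothetical bin edges: 1, 2, 3, 4, 5

binned = np.digitize(x=dvals, bins=bins)                # bin index per value
frequency = np.bincount(binned, minlength=nbins)[1:]    # [1 1 2 1], dropping the under-range slot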
Example #42
def knn(trainpoints, traincats, testpoints, k):
    """Given training data points
    and a 1-d array of the corresponding categories of the points,
    predict category for each test point,
    using k nearest neighbors (with cosine distance).
    Return a 1-d array of predicted categories.
    """
    # TODO: fill in

    testtraindist = cdist(testpoints, trainpoints, 'cosine') # pairwise distance between every test and train point
    print 'Computed pairwise distances'

    testtrainsort = scipy.argsort(testtraindist, axis=1)[:, :k] # for each row (test), column (train) indices sorted by distance in increasing order, and take first k
    print 'Sorted distances'

    numtest, numtrain = testtraindist.shape

    predictions = scipy.zeros(numtest)
    for i in range(numtest):
        predcats = traincats[testtrainsort[i, :]]
        catcounts = scipy.bincount(predcats)
        predictions[i] = scipy.argmax(catcounts)

    return predictions
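
The per-test-point vote above is a bincount of the neighbours' categories followed by argmax. A hedged one-liner with hypothetical labels for the k nearest neighbours of a single test point:

import numpy as np

predcats = np.array([2, 0, 2, 1, 2])             # hypothetical categories of the k neighbours
prediction = np.argmax(np.bincount(predcats))    # 2, the majority vote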
Example #43
def coordinate_genot_ss(genotype_file=None,
                        hdf5_file=None,
                        genetic_map_dir=None,
                        check_mafs=False,
                        min_maf =0.01):
    """
    Assumes plink BED files.  Imputes missing genotypes.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
#        num_individs = len(gf['chrom_1']['snps'][:, 0])
#     Y = sp.array(gf['indivs']['phenotype'][...] == 'Case', dtype='int8')
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens)==1:
        print 'Unable to find phenotype values.'
        has_phenotype=False
    elif len(unique_phens)==2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins)==2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases'%(cc_bins[0], cc_bins[1])
        has_phenotype=True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype=True
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    #Figure out chromosomes and positions by looking at SNPs.  
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci] 

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)
    
    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d'%chrom
        print 'Working on chromosome: %s'%chr_str
        
        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromosome in SS dataset.'
            print 'Continuing.'
            continue

        g_sids = chrom_d['sids']
        g_sid_set = set(g_sids)
        assert len(g_sid_set) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        ss_sid_set = set(ss_sids)
        assert len(ss_sid_set) == len(ss_sids), 'Some duplicates?'

        #Figure out filters:
        g_filter = sp.in1d(g_sids,ss_sids)
        ss_filter = sp.in1d(ss_sids,g_sids)

        #Order by SNP IDs
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)

        g_indices = []
        for g_i in g_order:
            if g_filter[g_i]:
                g_indices.append(g_i)

        ss_indices = []
        for ss_i in ss_order:
            if ss_filter[ss_i]:
                ss_indices.append(ss_i)

        g_nts = chrom_d['nts']
        snp_indices = chrom_d['snp_indices']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print 'Found %d SNPs present in both datasets'%(len(g_indices))

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]
            ss_freqs_list=[]
        
        ok_indices = {'g':[], 'ss':[]}
        for g_i, ss_i in it.izip(g_indices, ss_indices):
            
            #Is the nucleotide ambiguous?
            #g_nt = [recode_dict[g_nts[g_i][0]],recode_dict[g_nts[g_i][1]]
            g_nt = [g_nts[g_i][0],g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts +=1
                tot_num_non_matching_nts += 1
                continue
            
            #First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1                
                continue

            ss_nt = ss_nts[ss_i]
            #Are the nucleotides the same?
            flip_nts = False
            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                # Opposite strand nucleotides
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                if flip_nts:
                    betas[ss_i] = -betas[ss_i]
                    log_odds[ss_i] = -log_odds[ss_i]
                    if 'freqs' in ssg.keys():
                        ss_freqs[ss_i] = 1-ss_freqs[ss_i]
                else:
#                     print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
#                         (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                        
                    continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print '%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]
        
        #Parse SNPs
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']] #Pinpoint where the SNPs are in the file.
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        print 'raw_snps.shape=', raw_snps.shape

        snp_stds = sp.sqrt(2*freqs*(1-freqs)) #sp.std(raw_snps, 1) 
        snp_means = freqs*2 #sp.mean(raw_snps, 1)

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs-(1-freqs))>0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample'%sp.sum(freq_discrepancy_snp)
                print freqs[freq_discrepancy_snp]
                print ss_freqs[freq_discrepancy_snp]
                
                #Filter freq_discrepancy_snps
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                 
        
        #Filter minor allele frequency SNPs.
        maf_filter = (freqs>min_maf)*(freqs<(1-min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum<=n_snps, "WTF?"
        if sp.sum(maf_filter)<n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
            
            
            print '%d SNPs with MAF < %0.3f were filtered'%(n_snps-maf_filter_sum,min_maf)

        print '%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom)
        
        rb_prs = sp.dot(sp.transpose(raw_snps), log_odds)
        if has_phenotype:
            print 'Normalizing SNPs'
            snp_means.shape = (len(raw_snps),1)
            snp_stds.shape = (len(raw_snps),1)
            snps = (raw_snps - snp_means) / snp_stds
            assert snps.shape==raw_snps.shape, 'Aha!'
            snp_stds = snp_stds.flatten()
            snp_means = snp_means.flatten()
            prs = sp.dot(sp.transpose(snps), betas)
            corr = sp.corrcoef(Y, prs)[0, 1]
            corr_list.append(corr)
            print 'PRS correlation for chromosome %d was %0.4f' % (chrom, corr)
            rb_corr = sp.corrcoef(Y, rb_prs)[0, 1]
            rb_corr_list.append(rb_corr)
            print 'Raw effect sizes PRS correlation for chromosome %d was %0.4f' % (chrom, rb_corr)
        
        sid_set = set(sids)
        if genetic_map_dir is not None:
            genetic_map = [] 
            with gzip.open(genetic_map_dir+'chr%d.interpolated_genetic_map.gz'%chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])
        
        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_ref', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds)
        ofg.create_dataset('snp_means_ref', data=snp_means)
        ofg.create_dataset('freqs_ref', data=freqs)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
#         print 'Sum of squared effect sizes:', sp.sum(betas ** 2)
#         print 'Sum of squared log odds:', sp.sum(log_odds ** 2)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=rb_prs)
        if has_phenotype:
            risk_scores += prs
        rb_risk_scores += rb_prs
        num_common_snps += len(betas)
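
# A minimal, self-contained sketch (not the author's code) of the allele-matching
# rule used above: a summary-statistic SNP is kept when its nucleotides match the
# genotype nucleotides directly or on the opposite strand, and the effect sign is
# reversed when they only match in flipped order.  The complement map and the
# ambiguous A/T, C/G pairs below are assumptions mirroring the usual conventions.
opp_strand = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
ambig_pairs = {('A', 'T'), ('T', 'A'), ('C', 'G'), ('G', 'C')}

def match_alleles(g_nt, ss_nt):
    """Return (keep, flip): keep=False drops the SNP, flip=True reverses the sign of beta."""
    if tuple(g_nt) in ambig_pairs:
        return False, False                       # strand-ambiguous SNP, drop it
    os_nt = [opp_strand[g_nt[0]], opp_strand[g_nt[1]]]
    if list(g_nt) == list(ss_nt) or os_nt == list(ss_nt):
        return True, False                        # same allele order, keep as-is
    if list(g_nt)[::-1] == list(ss_nt) or os_nt[::-1] == list(ss_nt):
        return True, True                         # alleles swapped, flip the effect
    return False, False                           # nucleotides do not match, drop

print(match_alleles(('A', 'G'), ('G', 'A')))      # (True, True) -> flip beta
print(match_alleles(('A', 'T'), ('A', 'T')))      # (False, False) -> ambiguous, drop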
예제 #44
0
def coordinate_genotypes_ss_w_ld_ref(genotype_file = None,
                                    reference_genotype_file = None,
                                    hdf5_file = None,
                                    genetic_map_dir=None,
                                    check_mafs=False,
                                    min_maf=0.01):
#   recode_dict = {1:'A', 2:'T', 3:'C', 4:'G'} #1K genomes recoding..
    print 'Coordinating summary statistics with genotype file: %s \nand LD reference genotype file: %s'%(genotype_file, reference_genotype_file)
    plinkf = plinkfile.PlinkFile(genotype_file)
    
    #Load the sample (individual) information
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    
    unique_phens = sp.unique(Y)
    if len(unique_phens)==1:
        print 'Unable to find phenotype values.'
        has_phenotype=False
    elif len(unique_phens)==2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins)==2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases'%(cc_bins[0], cc_bins[1])
        has_phenotype=True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype=True

    #Figure out chromosomes and positions.  
    print 'Parsing validation genotype bim file'
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci] 

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print 'Parsing LD reference genotype bim file'
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()
    
    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)
#     chr_dict_ref = _get_chrom_dict_bim_(reference_genotype_file+'.bim', chromosomes)
    
    #Open HDF5 file and prepare out data
    assert not 'iids' in hdf5_file.keys(), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    #corr_list = []
    
    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0    
    tot_num_non_matching_nts = 0
   
    #Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g':[], 'rg':[], 'ss':[]}
        
        chr_str = 'chrom_%d'%chrom
        print 'Working on chromosome: %s'%chr_str
        
        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromosome in SS dataset.'
            print 'Continuing.'
            continue

        ssg = ssf['chrom_%d' % chrom]
        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print 'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.'%(len(g_sids), len(rg_sids), len(ss_sids))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print 'Found %d SNPs on chrom %d that were common across all datasets'%(len(common_sids), chrom)

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []
        
        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid]=i

        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid]=i

        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid]=i
            
        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])
        
        #order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        #order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        #Get the other two maps
        for sid in common_sids:
            rg_snp_map.append(rg_sid_dict[sid])
        
        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])
            
        
        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
#         rg_nts_l = []
#         for nt in rg_nts_ok:
#             rg_nts_l.append([recode_dict[nt[0]],recode_dict[nt[1]]])
#         rg_nts_ok = sp.array(rg_nts_l)
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(g_nts[g_snp_map] == ss_nts[ss_snp_map])/2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map])/2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok)/2.0
        print 'Nucleotide concordance counts out of %d genotypes: g-rg: %d, g-ss: %d, rg-ss: %d'%(len(g_snp_map),g_rg_nt_concord_count, g_ss_nt_concord_count, rg_ss_nt_concord_count)
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count


        num_non_matching_nts = 0
        num_ambig_nts = 0


        #Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):
            
            #To make sure, is the SNP id the same?
            assert g_sids[g_i]==rg_sids[rg_i]==ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
            
            g_nt = g_nts[g_i]
            rg_nt = rg_nts[rg_i]
#             rg_nt = [recode_dict[rg_nts[rg_i][0]],recode_dict[rg_nts[rg_i][1]]]
            ss_nt = ss_nts[ss_i]

            #Is the nucleotide ambiguous.
            g_nt = [g_nts[g_i][0],g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts +=1
                tot_num_non_matching_nts += 1                
                continue
            
            #First check whether the nucleotides are valid
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1                
                continue
            
            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])

            flip_nts = False
            if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                    #Try flipping the SS nt
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]                        
                        log_odds[ss_i] = -log_odds[ss_i]    
                        if 'freqs' in ssg.keys():
                            ss_freqs[ss_i] = 1-ss_freqs[ss_i]
                    else:
                        print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                            (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

                    
                else:
                    # Nucleotides do not match the LD reference on either strand
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
            
           
            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)

            ok_nts.append(g_nt)
#             if flip_nts:
#                 ok_nts.append([ss_nt[1],ss_nt[0]])
#             else:
#                 ok_nts.append(ss_nt)                

                        
        #print '%d SNPs in LD references to be flipped.'%((len(ref_snp_directions)-sp.sum(ref_snp_directions))/2.0)
        print '%d SNPs had ambiguous nucleotides.' % num_ambig_nts 
        print '%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts 
        print '%d SNPs were retained on chromosome %d.' % (len(ok_indices['g']), chrom)

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
#         order = sp.argsort(positions)
#         sorted_positions = positions[order]
#         assert sp.all(sorted_positions==positions), 'Perhaps something is wrong here?'
#         ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
#         ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])

        
        #Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']] #Pinpoint where the SNPs are in the file.
        raw_snps,freqs = _parse_plink_snps_(genotype_file, snp_indices)
        
        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        snp_indices_ref = snp_indices_ref[ok_indices['rg']] #Pinpoint where the SNPs are in the file.
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file, snp_indices_ref)
        
        
        snp_stds_ref = sp.sqrt(2*freqs_ref*(1-freqs_ref)) 
        snp_means_ref = freqs_ref*2

        snp_stds = sp.sqrt(2*freqs*(1-freqs)) 
        snp_means = freqs*2
        
        betas = betas[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))
        log_odds = log_odds[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)#[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #For debugging...
#         g_sids = sp.array(chrom_d['sids'])[ok_indices['g']]
#         rg_sids = sp.array(chrom_d_ref['sids'])[ok_indices['rg']]
#         ss_sids = ssg['sids'][...][ok_indices['ss']]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'
        
        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs-(1-freqs))>0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample'%sp.sum(freq_discrepancy_snp)
#                 print freqs[freq_discrepancy_snp]
#                 print ss_freqs[freq_discrepancy_snp]
                 
                #Filter freq_discrepancy_snps
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)  #invert the boolean mask explicitly
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                #For debugging...
#         if sp.any(freq_discrepancy_snp):
#             g_sids = g_sids[ok_freq_snps]
#             rg_sids = rg_sids[ok_freq_snps]
#             ss_sids = ss_sids[ok_freq_snps]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        
        
        #Filter minor allele frequency SNPs.
        maf_filter = (freqs>min_maf)*(freqs<(1-min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum<=n_snps, "WTF?"
        if sp.sum(maf_filter)<n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
#         if sp.sum(maf_filter)<n_snps:
#             g_sids = g_sids[maf_filter]
#             rg_sids = rg_sids[maf_filter]
#             ss_sids = ss_sids[maf_filter]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'
        
        
        
        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print 'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr)

        genetic_map = [] 
        if genetic_map_dir is not None:
            sid_set = set(sids)  #sid_set was not defined in this function; build it from the retained SNP IDs
            with gzip.open(genetic_map_dir+'chr%d.interpolated_genetic_map.gz'%chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])
        
        
        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref', data=raw_ref_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)
#         print 'Sum betas', sp.sum(betas ** 2)
        #ofg.create_dataset('prs', data=prs)
        
        
        #risk_scores += prs
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
예제 #45
0
    def find_neighbor_pores(self,pnums,flatten=True,mode='union',excl_self=False):
        r"""
        Returns a list of pores neighboring the given pore(s)

        Parameters
        ----------
        pnums : array_like
            ID numbers of pores whose neighbors are sought.
        flatten : boolean, optional
            If flatten is True (default) a 1D array of unique pore ID numbers
            is returned with the input pores (pnums) removed. If flatten is
            False the returned array contains arrays of neighboring pores for
            each input pore, in the order they were sent.
        excl_self : bool, optional
            If this is True the input pores are not included in the returned
            list (the default is False).  This option only applies when input pores
            are in fact neighbors to each other, otherwise they are not
            part of the returned list.  
        mode : string, optional
            Specifies which neighbors should be returned.  The options are: 
            
            * 'union' : All neighbors of the input pores

            * 'intersection' : Only neighbors shared by all input pores 
            
            * 'not_intersection' : Only neighbors not shared by any input pores

        Returns
        -------
        neighborPs : 1D array (if flatten is True) or ndarray of ndarrays (if flatten is False)

        Examples
        --------
        >>> pn = OpenPNM.Network.TestNet()
        >>> pn.find_neighbor_pores(pnums=[0,2])
        array([ 1,  3,  5,  7, 25, 27])
        >>> pn.find_neighbor_pores(pnums=[0,1]) #Find all neighbors, excluding selves (default behavior)
        array([ 2,  5,  6, 25, 26])
        >>> pn.find_neighbor_pores(pnums=[0,2],flatten=False)
        array([array([ 1,  5, 25]), array([ 1,  3,  7, 27])], dtype=object)
        >>> pn.find_neighbor_pores(pnums=[0,2],mode='intersection') #Find only common neighbors
        array([1], dtype=int64)
        >>> pn.find_neighbor_pores(pnums=[0,2],mode='not_intersection') #Exclude common neighbors
        array([ 3,  5,  7, 25, 27], dtype=int64)
        >>> pn.find_neighbor_pores(pnums=[0,1],mode='union') #Find all neighbors, including selves
        array([ 0,  1,  2,  5,  6, 25, 26])
        """
        #Count neighboring pores
        try:
            neighborPs = self.adjacency_matrix['lil']['connections'].rows[[pnums]]
        except:
            self._logger.info('Creating adjacency matrix, please wait')
            self.create_adjacency_matrix()
            neighborPs = self.adjacency_matrix['lil']['connections'].rows[[pnums]]
        if flatten:
            #All the empty lists must be removed to maintain data type after hstack (numpy bug?)
            neighborPs = [sp.asarray(x) for x in neighborPs if x]
            neighborPs = sp.hstack(neighborPs)
            #neighborPs = sp.concatenate((neighborPs,pnums))
            #Remove references to input pores and duplicates
            if mode == 'not_intersection':
                neighborPs = sp.unique(sp.where(sp.bincount(neighborPs)==1)[0])
            elif mode == 'union':
                neighborPs = sp.unique(neighborPs)
            elif mode == 'intersection':
                neighborPs = sp.unique(sp.where(sp.bincount(neighborPs)>1)[0])
            if excl_self:
                neighborPs = neighborPs[~sp.in1d(neighborPs,pnums)]
        else:
            for i in range(0,sp.size(pnums)):
                neighborPs[i] = sp.array(neighborPs[i])
        return sp.array(neighborPs,ndmin=1)
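
# Hedged illustration (toy data, not part of OpenPNM) of how the 'union',
# 'intersection' and 'not_intersection' modes above work: the neighbor lists of
# all input pores are concatenated, and a bincount separates IDs that occur
# once (belong to a single input pore) from IDs that occur more than once (shared).
import numpy as np

neighbors_per_pore = [np.array([1, 5, 25]), np.array([1, 3, 7, 27])]
flat = np.hstack(neighbors_per_pore)

union = np.unique(flat)                                            # [ 1  3  5  7 25 27]
intersection = np.unique(np.where(np.bincount(flat) > 1)[0])       # [1]
not_intersection = np.unique(np.where(np.bincount(flat) == 1)[0])  # [ 3  5  7 25 27]
print(union, intersection, not_intersection)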
예제 #46
0
    def _find_neighbors(self, pores, element, mode, flatten, excl_self):
        r"""
        Private method for finding the neighboring pores or throats connected
        directly to given set of pores.

        Parameters
        ----------
        pores : array_like
            The list of pores whose neighbors are sought
        element : string, either 'pore' or 'throat'
            Whether to find neighboring pores or throats
        mode : string
            Controls how the neighbors are filtered.  Options are:

            **'union'** : All neighbors of the input pores

            **'intersection'** : Only neighbors shared by all input pores

            **'not_intersection'** : Only neighbors not shared by any input
            pores

        flatten : boolean
            If flatten is True (default) a 1D array of unique neighbors is
            returned. If flatten is False the returned array contains arrays
            of neighboring throat ID numbers for each input pore, in the order
            they were sent.
        excl_self : bool
            When True the input pores are not included in the returned list of
            neighboring pores.  This option only applies when input pores are
            in fact neighbors to each other, otherwise they are not part of the
            returned list anyway.  This is ignored when element is
            'throat'.

        See Also
        --------
        find_neighbor_pores
        find_neighbor_throats
        num_neighbors

        """
        element = self._parse_element(element=element, single=True)
        pores = self._parse_locations(pores)
        if sp.size(pores) == 0:
            return sp.array([], ndmin=1, dtype=int)

        # Test for existence of incidence or adjacency matrix
        if element == 'pore':
            try:
                neighbors = self._adjacency_matrix['lil'].rows[[pores]]
            except:
                temp = self.create_adjacency_matrix(sprsfmt='lil')
                self._adjacency_matrix['lil'] = temp
                neighbors = self._adjacency_matrix['lil'].rows[[pores]]
        elif element == 'throat':
            try:
                neighbors = self._incidence_matrix['lil'].rows[[pores]]
            except:
                temp = self.create_incidence_matrix(sprsfmt='lil')
                self._incidence_matrix['lil'] = temp
                neighbors = self._incidence_matrix['lil'].rows[[pores]]

        if flatten:
            # Convert rows of lil into single flat list
            neighbors = itertools.chain.from_iterable(neighbors)
            if element == 'pore':  # Add input pores to list
                neighbors = itertools.chain.from_iterable([neighbors, pores])
            # Convert list to numpy array
            neighbors = sp.fromiter(neighbors, dtype=int)
            if mode == 'not_intersection':
                neighbors = sp.unique(sp.where(sp.bincount(neighbors) == 1)[0])
            elif mode == 'union':
                neighbors = sp.unique(neighbors)
            elif mode == 'intersection':
                neighbors = sp.unique(sp.where(sp.bincount(neighbors) > 1)[0])
            if excl_self and element == 'pore':  # Remove input pores from list
                neighbors = neighbors[~sp.in1d(neighbors, pores)]
            return sp.array(neighbors, ndmin=1, dtype=int)
        else:
            # Convert lists in array to numpy arrays
            neighbors = [sp.array(neighbors[i]) for i in range(0, len(pores))]
            return sp.array(neighbors, ndmin=1)
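
# Sketch (assumed toy network, not the OpenPNM API) of why the 'lil' adjacency
# matrix makes pore-neighbor lookups cheap in the method above: row i of a LIL
# matrix stores the column indices with nonzero entries, i.e. the pores
# connected to pore i, so neighbors can be read straight off the .rows attribute.
import numpy as np
import scipy.sparse as sprs

conns = np.array([[0, 1], [1, 2], [0, 3]])           # hypothetical throat list
n_pores = 4
adj = sprs.coo_matrix((np.ones(len(conns)), (conns[:, 0], conns[:, 1])),
                      shape=(n_pores, n_pores))
adj = (adj + adj.T).tolil()                           # make symmetric, convert to LIL
print(adj.rows[0])    # neighbors of pore 0 -> [1, 3]
print(adj.rows[1])    # neighbors of pore 1 -> [0, 2]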
예제 #47
0
파일: 1kg_afs.py 프로젝트: bvilhjal/natsel
def gen_anc_afs_plot(ancestry = 'AFR', plot_prefix = '/Users/bjv/Dropbox/Cloud_folder/tmp/1kg_AFS_all', 
                     outfile_prefix='/Users/bjv/Dropbox/Cloud_folder/tmp/1kg_AFS_all',
                     data_filter=1, chunk_size = 10000):
    h5f = h5py.File('%s1k_genomes_hg.hdf5'%kg_dir)
    ancestries = sp.unique(h5f['indivs']['ancestry'][...])
    #ancestries = sp.unique(h5f['indivs']['continent'][...])
    
    for ancestry in ancestries:
        anc_filter = h5f['indivs']['ancestry'][...]==ancestry
        #anc_filter = h5f['indivs']['continent'][...]==ancestry
        num_indivs = sp.sum(anc_filter)
        print "%d individuals with %s ancestry are used"%(num_indivs,ancestry)
        
        sids_list = []
        acs = []
        for chrom in range(1,23):
            print 'Working on chromosome %d'%chrom
            chr_str = 'chr%d'%chrom
            num_snps = len(h5f[chr_str]['calldata']['snps'])
            assert num_snps==len(h5f[chr_str]['variants']['ID']), 'WTF?'
            for start_i in range(0,num_snps,chunk_size):
                snps  = sp.array(h5f[chr_str]['calldata']['snps'][start_i:start_i+chunk_size],dtype='int8')
                sids  = h5f[chr_str]['variants']['ID'][start_i:start_i+chunk_size]
                snps = snps[:,anc_filter]
                if data_filter<1:
                    rand_filt = sp.random.random(len(snps))
                    rand_filt = rand_filt<data_filter
                    snps = snps[rand_filt]
                    sids = sids[rand_filt]
                (m,n) = snps.shape
                ac = sp.sum(snps,1)
                flip_filter = ac>n
                ac[flip_filter]=2*n-ac[flip_filter]
                #Plotting filter 
                acs.extend(ac)
                sids_list.extend(sids)
                if start_i%1000000==0:
                    print 'Parsed %d SNPs'%start_i
            print '%d SNPs loaded and filtered'%num_snps
    
        print '%d ACs and SIDs found'%len(acs)
        print 'Storing the AFS'
        with open('%s_%s.txt'%(outfile_prefix,ancestry),'w') as f:
            f.write('# %d individuals used\n'%num_indivs)
            f.write('SID    AC\n')
            for sid, ac in izip(sids_list,acs):
                f.write('%s    %d\n'%(sid,ac))
    
        print 'Plot things'
        acs = sp.array(acs,dtype='int')
        acs = acs[acs>0]
        acs = acs[acs<30]
        min_ac = acs.min()
        max_ac = acs.max()
        sp.bincount(acs)
        plt.clf()
        plt.hist(acs, bins=sp.arange(min_ac-0.5, max_ac + 1.5, 1))
        plt.title('%s AFS'%ancestry)
        plt.savefig('%s_%s.png'%(plot_prefix,ancestry))
        
    h5f.close()
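
# A small, hedged sketch of the allele-count folding used above: genotype rows
# (0/1/2 per individual) are summed to alternate-allele counts, and any count
# larger than n (the number of individuals) is folded to 2n - count so the
# spectrum is expressed in terms of the minor allele.  The genotype matrix is a toy example.
import numpy as np

snps = np.array([[0, 1, 2, 2],          # 2 SNPs x 4 individuals (hypothetical)
                 [2, 2, 2, 1]], dtype='int8')
n = snps.shape[1]
ac = snps.sum(axis=1)                   # alternate-allele counts: [5, 7]
ac = np.where(ac > n, 2 * n - ac, ac)   # fold to minor-allele counts: [3, 1]
print(np.bincount(ac))                  # histogram of minor-allele counts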
예제 #48
0
    def find_neighbor_pores(self, pores, mode='union', flatten=True, excl_self=True):
        r"""
        Returns a list of pores neighboring the given pore(s)

        Parameters
        ----------
        pores : array_like
            ID numbers of pores whose neighbors are sought.
        flatten : boolean, optional
            If flatten is True  a 1D array of unique pore ID numbers is
            returned. If flatten is False the returned array contains arrays
            of neighboring pores for each input pore, in the order they were
            sent.
        excl_self : bool, optional (default is True)
            If this is True then the input pores are not included in the
            returned list.  This option only applies when input pores
            are in fact neighbors to each other, otherwise they are not
            part of the returned list anyway.
        mode : string, optional
            Specifies which neighbors should be returned.  The options are:

            **'union'** : All neighbors of the input pores

            **'intersection'** : Only neighbors shared by all input pores

            **'not_intersection'** : Only neighbors not shared by any input pores

        Returns
        -------
        neighborPs : 1D array (if flatten is True) or ndarray of ndarrays (if
        flatten is False)

        Examples
        --------
        >>> import OpenPNM
        >>> pn = OpenPNM.Network.TestNet()
        >>> pn.find_neighbor_pores(pores=[0, 2])
        array([ 1,  3,  5,  7, 25, 27])
        >>> pn.find_neighbor_pores(pores=[0, 1])
        array([ 2,  5,  6, 25, 26])
        >>> pn.find_neighbor_pores(pores=[0, 1], mode='union', excl_self=False)
        array([ 0,  1,  2,  5,  6, 25, 26])
        >>> pn.find_neighbor_pores(pores=[0, 2], flatten=False)
        array([array([ 1,  5, 25]), array([ 1,  3,  7, 27])], dtype=object)
        >>> pn.find_neighbor_pores(pores=[0, 2], mode='intersection')
        array([1])
        >>> pn.find_neighbor_pores(pores=[0, 2], mode='not_intersection')
        array([ 3,  5,  7, 25, 27])
        """
        pores = self._parse_locations(pores)
        allowed_modes = ['union', 'intersection', 'not_intersection']
        mode = self._parse_mode(mode, allowed=allowed_modes, single=True)
        if sp.size(pores) == 0:
            return sp.array([], ndmin=1, dtype=int)
        # Test for existence of incidence matrix
        try:
            neighborPs = self._adjacency_matrix['lil'].rows[[pores]]
        except:
            temp = self.create_adjacency_matrix(sprsfmt='lil')
            self._adjacency_matrix['lil'] = temp
            neighborPs = self._adjacency_matrix['lil'].rows[[pores]]
        if flatten:
            # Convert rows of lil into single flat list
            neighborPs = itertools.chain.from_iterable(neighborPs)
            # Add input pores to list
            neighborPs = itertools.chain.from_iterable([neighborPs, pores])
            # Convert list to numpy array
            neighborPs = sp.fromiter(neighborPs, dtype=int)
            # Apply logic to include/exclude items of the set
            if mode == 'not_intersection':
                temp = sp.where(sp.bincount(neighborPs) == 1)[0]
                neighborPs = sp.unique(temp)
            elif mode == 'union':
                neighborPs = sp.unique(neighborPs)
            elif mode == 'intersection':
                temp = sp.where(sp.bincount(neighborPs) > 1)[0]
                neighborPs = sp.unique(temp)
            if excl_self:
                neighborPs = neighborPs[~sp.in1d(neighborPs, pores)]
            return sp.array(neighborPs, ndmin=1, dtype=int)
        else:
            # Convert lists in array to numpy arrays
            neighborPs = [sp.array(neighborPs[i]) for i in range(0, len(pores))]
            return sp.array(neighborPs, ndmin=1)
예제 #49
0
def load_eigenstrat_genotypes(in_file_prefix='eigenstrat_file_prefix',
                              out_file_prefix='hdf5_file_prefix',
                              impute_type='mode',
                              filter_monomorphic_snps=True,
                              missing_val_thr=0.1):
    """
    Parses EIGENSTRAT-formatted genotype files into an HDF5 file.  It requires the h5py and scipy packages.
    
    Ideally the genotypes are imputed a priori; otherwise a rough imputation 
    (the most common genotype) is used for missing genotypes.
    
    Notes: 
        Assumes the files are in diploid format!
    
    """
    import h5py
    import scipy as sp
    import os
    import sys
    
    data_file_prefix = '%s_mv%0.2f_imp_%s.' % (out_file_prefix, missing_val_thr, impute_type)    
     
    genotype_data = {}
    
    # Setting the HDF5 file up
    h5py_file_name = data_file_prefix + 'h5py'
    if os.path.isfile(h5py_file_name):
        print 'Overwriting: %s' % h5py_file_name
        os.remove(h5py_file_name)
    h5py_file = h5py.File(h5py_file_name)
    genotype_data['h5py_file'] = h5py_file_name
        
    
    # Fill out individuals data, if available
    i_filename = '%sind' % (in_file_prefix)
    if os.path.isfile(i_filename):
        iids = []
        phens = []
        genders = []
        with open(i_filename) as f:
            for line in f:
                l = (line.strip()).split()
                iids.append(l[0])
                genders.append(l[1])
                phens.append(l[2])
        ind_group = h5py_file.create_group('indivs')
        ind_group.create_dataset('indiv_ids', data=iids)
        ind_group.create_dataset('sex', data=genders)
        ind_group.create_dataset('phenotype', data=phens)
    else:
        print 'Individual information file not found: %s' % i_filename
        
    tot_num_snps = 0
    tot_num_duplicated_snps_removed = 0
    tot_num_missing_val_snps_removed = 0
    tot_num_monomorphic_snps_removed = 0
    
    
    # Open the genotype files.
    s_filename = '%ssnp' % (in_file_prefix) 
    g_filename = '%sgeno' % (in_file_prefix)
    print 'Starting to parse files:\n\t %s \n\t %s' % (s_filename, g_filename)
    sf = open(s_filename) 
    gf = open(g_filename) 
    

    # Figure out sample size, number of SNPs, etc. 
    # Initialize HDF5 file.

    # Setting up containers.
    curr_chrom = 1
    curr_hdf5_group = h5py_file.create_group('chrom_%d' % curr_chrom)
    snps_mat = []
    positions = []
    sids = []
    nts_list = []
    nt_counts_list = []
    missing_counts = []
    freqs = []
    num_missing_removed = 0
    num_monomorphic_removed = 0
    num_duplicated_snps_removed = 0

    print 'Starting to parse SNP files'
    for s_line in sf:
        g_line = gf.next()
        sl = s_line.split()
        pos = int(sl[3])
        chrom = int(sl[1])
        sid = sl[0]

        if chrom != curr_chrom:
            # Report statistics and store stuff
            print 'Finished with Chromosome %d' % curr_chrom
            print 'Number of SNPs removed due to too many missing values: %d' % num_missing_removed
            print 'Number of duplicated SNPs removed: %d' % num_duplicated_snps_removed
            print 'Number of monomorphic SNPs removed: %d' % num_monomorphic_removed
            print 'Number of SNPs retained: %d' % len(positions)
            snps = sp.array(snps_mat, dtype='int8')
            curr_hdf5_group.create_dataset('raw_snps', compression='lzf', data=snps)
            h5py_file.flush()
            print 'Raw SNPs stored'
            snps = snps.T
            snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
            curr_hdf5_group.create_dataset('snps', compression='lzf', data=snps.T)
            h5py_file.flush()
            print 'Normalized SNPs stored'
            del snps
            del snps_mat
            curr_hdf5_group.create_dataset('positions', compression='lzf', data=positions)
            curr_hdf5_group.create_dataset('nts', compression='lzf', data=nts_list)
            curr_hdf5_group.create_dataset('nt_counts', compression='lzf', data=sp.array(nt_counts_list))
            curr_hdf5_group.create_dataset('missing_counts', compression='lzf', data=missing_counts)
            curr_hdf5_group.create_dataset('freqs', compression='lzf', data=freqs)
            curr_hdf5_group.create_dataset('snp_ids', compression='lzf', data=sids)        
            h5py_file.flush()
            sys.stdout.flush()

            # Reset containers
            curr_chrom = chrom
            curr_hdf5_group = h5py_file.create_group('chrom_%d' % curr_chrom)
            snps_mat = []
            positions = []
            sids = []
            nts_list = []
            nt_counts_list = []
            missing_counts = []
            freqs = []
            num_missing_removed = 0
            num_monomorphic_removed = 0
            num_duplicated_snps_removed = 0
            
        
        # Debug filter
                    
        nt = (sl[4], sl[5])

        snp = sp.array(map(int, g_line.strip()), dtype='int8')
        num_indiv = len(snp)
        bin_counts = sp.bincount(snp)
#        print bin_counts
        missing_count = bin_counts[9] if len(bin_counts) > 9 else 0  #missing calls are coded as 9; bin_counts[-1] miscounts when nothing is missing

        # Filtering SNPs with too many missing values
        if missing_count > missing_val_thr * 2 * num_indiv:
            num_missing_removed += 1
            tot_num_missing_val_snps_removed += 1
            continue

        nt_counts = list(bin_counts[:3])
        # Imputing the SNPs roughly by replacing missing values with the mode value.
        if impute_type == 'mode':
            v = sp.argmax(nt_counts)
            snp[snp == 9] = v
        else:
            raise Exception('Imputation type is unknown')

        bin_counts = sp.bincount(snp)
        nt_counts = list(bin_counts[:3])
        # Removing monomorphic SNPs
        if max(nt_counts) == sum(nt_counts):
            num_monomorphic_removed += 1
            tot_num_monomorphic_snps_removed += 1
            continue
        if len(nt_counts) == 2:
            nt_counts.append(0)
            
#        assert len(nt_counts) == 3, 'ARrrg'                    

        # Is this position already there?
        if len(positions) > 0 and pos == positions[-1]:
            num_duplicated_snps_removed += 1
            tot_num_duplicated_snps_removed += 1
            continue
        
        freq = sp.mean(snp) / 2.0            
        snps_mat.append(snp)
        positions.append(pos)
        sids.append(sid)
        nts_list.append(nt)
        nt_counts_list.append(nt_counts)
        missing_counts.append(missing_count)
        freqs.append(freq)

        tot_num_snps += 1
        


    # Report statistics and store stuff
    print 'Number of SNPs removed due to too many missing values: %d' % num_missing_removed
    print 'Number of duplicated SNPs removed: %d' % num_duplicated_snps_removed
    print 'Number of monomorphic SNPs removed: %d' % num_monomorphic_removed
    print 'Number of SNPs retained: %d' % len(positions)
    snps = sp.array(snps_mat, dtype='int8')
    curr_hdf5_group.create_dataset('raw_snps', compression='lzf', data=snps)
    h5py_file.flush()
    print 'Raw SNPs stored'
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    curr_hdf5_group.create_dataset('snps', compression='lzf', data=snps.T)
    h5py_file.flush()
    print 'Normalized SNPs stored'
    del snps
    del snps_mat
    curr_hdf5_group.create_dataset('positions', compression='lzf', data=positions)
    curr_hdf5_group.create_dataset('nts', compression='lzf', data=nts_list)
    curr_hdf5_group.create_dataset('nt_counts', compression='lzf', data=sp.array(nt_counts_list))
    curr_hdf5_group.create_dataset('missing_counts', compression='lzf', data=missing_counts)
    curr_hdf5_group.create_dataset('freqs', compression='lzf', data=freqs)
    curr_hdf5_group.create_dataset('snp_ids', compression='lzf', data=sids)        
    
                
    gf.close()
    sf.close()
    
    print 'Genotypes for %d individuals were parsed.' % num_indiv
    print 'Total number of SNPs parsed successfully was: %d' % tot_num_snps
    print 'Total number of SNPs removed due to too many missing values: %d' % tot_num_missing_val_snps_removed
    print 'Total number of SNPs removed due to monomorphicity: %d' % tot_num_monomorphic_snps_removed
    print 'Total number of duplicated SNPs removed: %d' % tot_num_duplicated_snps_removed
    h5py_file.close()
    sys.stdout.flush()
    
    print 'Done parsing genotypes.'
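
# Minimal sketch of the 'mode' imputation used above, assuming the same coding
# as the parser (genotypes 0/1/2, missing calls coded as 9): missing values are
# replaced by the most common observed genotype before frequencies are computed.
import numpy as np

snp = np.array([0, 1, 9, 2, 1, 9, 1], dtype='int8')
counts = np.bincount(snp[snp != 9], minlength=3)   # counts of 0/1/2 -> [1, 3, 1]
snp[snp == 9] = np.argmax(counts)                  # impute with the mode (1)
freq = snp.mean() / 2.0                            # alternate-allele frequency
print(snp, freq)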
예제 #50
0
def parse_plink_tped_file(file_prefix, imputation_type='simple', return_kinship=False):
    """
    Requires a .tped file in 12 format.
    
    - Converts (on-the-fly) to a integer format. 
    - Imputes missing data.
    """
    tped_filename = file_prefix + '.tped'
    tped_pickled_filename = tped_filename + '.imputed.pickled'
    tfam_filename = file_prefix + '.tfam'
    tfam_pickled_filename = tfam_filename + '.pickled'

    if os.path.isfile(tfam_pickled_filename):
        print 'Loading pickled tfam file'
        individs, sex_list = cPickle.load(open(tfam_pickled_filename))
        print 'Pickled tfam file was loaded.'
    else:
        individs = []
        sex_list = []
        with open(tfam_filename) as f:
            for line in f:
                l = map(str.strip, line.split())
                individs.append(l[1])
                sex_list.append(int(l[4]))
        cPickle.dump((individs, sex_list), open(tfam_pickled_filename, 'wb'), protocol=2)
    num_individs = len(individs)


#    k_mat = sp.zeros((num_individs, num_individs))
    if os.path.isfile(tped_pickled_filename):
        print 'Loading pickled tped file'
        chrom_pos_snp_dict = cPickle.load(open(tped_pickled_filename))
        print 'Pickled tped file was loaded.'
    else:
        chrom_pos_snp_dict = {}
        with open(tped_filename) as f:
            cur_chrom = -1
            for line_i, line in enumerate(f):
                if line_i % 1000 == 0:
                    print line_i
                l = map(str.strip, line.split())
                chrom = int(l[0])
                if chrom != cur_chrom:
                    chrom_pos_snp_dict[chrom] = {'positions':[], 'snps':[]}
                    cur_chrom = chrom
                chrom_pos_snp_dict[chrom]['positions'].append(int(l[3]))
                snp = sp.zeros(num_individs, dtype='int8')
                j = 0
                w_missing = False
                for i in range(4, 2 * num_individs + 4, 2):
                    nt1 = int(l[i])
                    nt2 = int(l[i + 1])
                    if nt1 == 0  or nt2 == 0:
                        snp[j] = 3
                        w_missing = True
                    elif nt1 == 2 and nt2 == 2:
                        snp[j] = 2
                    elif nt1 != 1  or nt2 != 1:
                        snp[j] = 1
#                    #Calculating K
#                    for ind_i in range(j):
#                        if snp[j] != 3 and snp[ind_i] != 3:
#                            k_mat[ind_i, j] = int(snp[j] == snp[ind_i]) + 0.5 * int(sp.absolute(snp[j] - snp[ind_i]) == 1)
#                            k_mat[ind_i, j] += 1
                    j += 1
#                print k_mat

                bin_counts = sp.bincount(snp)
                if w_missing:

                    if imputation_type == 'simple':
                        mean = (bin_counts[1] + 2.0 * bin_counts[2]) / (bin_counts[0] + bin_counts[1] + bin_counts[2])  #use float division (Python 2 would truncate)
                        snp[snp == 3] = round(mean)
                    if imputation_type == 'simple2':
                        snp[snp == 3] = sp.argmax(bin_counts[:-1])



                chrom_pos_snp_dict[chrom]['snps'].append(snp)
        cPickle.dump(chrom_pos_snp_dict, open(tped_pickled_filename, 'wb'), protocol=2)

    chromosomes = sorted(chrom_pos_snp_dict.keys())
    snpsds = []
    for chrom in chromosomes:
        snps = chrom_pos_snp_dict[chrom]['snps']
        positions = chrom_pos_snp_dict[chrom]['positions']
        snpsds.append(SNPsData(snps, positions, accessions=individs, chromosome=chrom))
    sd = SNPsDataSet(snpsds, chromosomes, data_format='diploid_int')
    print 'SNPsDataSet constructed!'

    if return_kinship:
        print 'Loading the kinship matrix'
        ibs_filename = file_prefix + '.mibs'
        ibs_pickled_filename = ibs_filename + '.pickled'
        if os.path.isfile(ibs_pickled_filename):
            print 'Loading pickled IBS kinship file'
            l = cPickle.load(open(ibs_pickled_filename))
            K = l[0]
            print 'Pickled IBS kinship was loaded.'
        else:
            print 'Loading K...'
            K = sp.zeros((num_individs, num_individs), dtype='double')
            with open(ibs_filename) as f:
                for i, line in enumerate(f):
                    K[i] = map(float, line.split())
            cPickle.dump([K, individs], open(ibs_pickled_filename, 'wb'), protocol=2)
            print 'K was loaded.'
        return sd, K
    return sd
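
# Toy sketch of the 'simple' imputation above (missing genotypes coded as 3 are
# replaced by the rounded mean of the observed 0/1/2 calls); the data are made up.
import numpy as np

snp = np.array([0, 1, 2, 3, 2, 3], dtype='int8')
counts = np.bincount(snp, minlength=4)                    # [1, 1, 2, 2]
mean = (counts[1] + 2.0 * counts[2]) / counts[:3].sum()   # 5/4 = 1.25
snp[snp == 3] = int(round(mean))                          # missing calls -> 1
print(snp)                                                # [0 1 2 1 2 1]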
예제 #51
0
def parse_single_12tped_to_hdf5(in_file_prefix='/home/bv25/data/Ls154/Ls154_12',
                         out_file_prefix='/home/bv25/data/Ls154/Ls154_12',
                         impute_type='mode', filter_monomorphic_snps=True,
                         missing_val_thr=0.1):
    """
    Parses a plink 12-formatted tped file and stores it in an HDF5 file.  It requires the h5py and scipy packages.
    
    Ideally the genotypes are imputed a priori; otherwise a rough imputation 
    (the most common genotype) is used for missing genotypes.

    Notes: 
        Assumes the files are in diploid format!
    
    """
     
    print 'Starting to parse genotypes'
    genotype_data = {}
    h5py_file = h5py.File(out_file_prefix + '.hdf5')
    genotype_data['hdf5p_file'] = h5py_file
    genot_group = h5py_file.create_group('genot_data')
    indiv_group = h5py_file.create_group('indiv_data')
            
            
    tot_num_snps = 0
    tot_num_missing_val_snps_removed = 0
    tot_num_ambiguous_loc_removed = 0
    curr_chrom = 1
    print 'Working on chromosome %d' % curr_chrom
    
    g_filename = '%s.tped' % (in_file_prefix) 
    s_filename = '%s.bim' % (in_file_prefix)
    i_filename = '%s.tfam' % (in_file_prefix)  

        
    
    indiv_ids = []
    phenotypes = [] 
    sex = []
    print 'Parsing individuals file: %s' % i_filename
    with open(i_filename) as f:
        for line in f:
            l = line.split()
            iid = l[0]
            indiv_ids.append(iid)
            sex.append(int(l[4]))
            phenotypes.append(float(l[5]))
    tot_num_indiv = len(indiv_ids) 
    
    print 'Storing individual data in individ. group'
    indiv_group.create_dataset('indiv_ids', data=indiv_ids)
    indiv_group.create_dataset('sex', data=sex)
    indiv_group.create_dataset('phenotypes', data=phenotypes)
    
    
            
    num_indiv = len(indiv_ids)
    print 'Found %d Individuals' % (num_indiv)

    print 'Parsing nucleotide map'
    nt_map = {}
    chromosomes = []
    curr_chrom = 0
    with open(s_filename) as f:
        for line in f:
            l = line.split()
            chrom = l[0]
            if chrom != curr_chrom:
                chromosomes.append(chrom)
                curr_chrom = chrom
            nt_map[l[1]] = (l[4], l[5])
    assert len(chromosomes) == len(set(chromosomes)), 'Chromosomes need to be in order.'
    curr_chrom = chromosomes[0]
        
    position = -1
    # Initializing containers.
    snps_mat = [] 
    positions = []
    sids = []
    nts_list = []
    nt_counts_list = []
    missing_counts = []
    freqs = []
    num_missing_removed = 0
    num_monomorphic_removed = 0
    num_ambiguous_loc_removed = 0
    t0 = time.time()

    print 'Starting to parse SNP files'
    gf = open(g_filename)
    for g_line in gf:
#        if random.random() > 0.01:
#            continue
        gl = g_line.split()
        chrom = gl[0]
        if chrom != curr_chrom:
            
            # Store everything and reset.
            print 'Number of SNPs removed due to too many missing values: %d' % num_missing_removed
            print 'Number of SNPs removed due to ambiguous location: %d' % num_ambiguous_loc_removed
            print 'Number of monomorphic SNPs removed: %d' % num_monomorphic_removed
            print 'Number of SNPs retained: %d' % len(positions)
            print 'Number of individuals: %d' % num_indiv
            snps = sp.array(snps_mat, dtype='int8')
            h5py_chrom_group = genot_group.create_group('chrom_%s' % curr_chrom)
            h5py_chrom_group.create_dataset('raw_snps', compression='lzf', data=snps)
            h5py_chrom_group.create_dataset('positions', compression='lzf', data=positions)
            h5py_chrom_group.create_dataset('nts', compression='lzf', data=nts_list)
            h5py_chrom_group.create_dataset('nt_counts', compression='lzf', data=nt_counts_list)
            h5py_chrom_group.create_dataset('missing_counts', compression='lzf', data=missing_counts)
            h5py_chrom_group.create_dataset('freqs', compression='lzf', data=freqs)
            h5py_chrom_group.create_dataset('snp_ids', compression='lzf', data=sids)        
            tot_num_snps += len(positions)
            tot_num_missing_val_snps_removed += num_missing_removed
            tot_num_ambiguous_loc_removed += num_ambiguous_loc_removed
            h5py_file.flush()         
            t1 = time.time()
            t = t1 - t0
            print 'It took %d minutes and %0.2f seconds to parse Chromosome %s.' % (t / 60, t % 60, curr_chrom)
            t0 = time.time()

            

            # Reset containers
            snps_mat = [] 
            positions = []
            sids = []
            nts_list = []
            nt_counts_list = []
            missing_counts = []
            freqs = []
            num_missing_removed = 0
            num_ambiguous = 0
            num_monomorphic_removed = 0
            num_ambiguous_loc_removed = 0
               
            curr_chrom = chrom

        sid = gl[1]
        prev_position = position
        position = int(gl[3])

        # Skipping unmappable locations
        if position == prev_position:
            num_ambiguous_loc_removed += 1
            continue
        if position == 0:
            num_ambiguous_loc_removed += 1
            continue

        nt = nt_map[sid]
                
        snp0 = sp.array(map(int, (g_line.strip()).split()[4:]), 'int8')
        a = sp.arange(tot_num_indiv * 2)
        even_map = a % 2 == 0
        odd_map = a % 2 == 1
        snp = snp0[even_map] + snp0[odd_map] - 2
        snp[snp < 0] = 9
                   
        bin_counts = sp.bincount(snp)
        

        if len(bin_counts) > 3:
            missing_count = bin_counts[-1]
            # Filtering SNPs with too many missing values
            if missing_count > missing_val_thr * 2 * num_indiv:
                num_missing_removed += 1
                continue
            elif impute_type == 'mode':
                nt_counts = bin_counts[:3]                    
                v = sp.argmax(nt_counts)
                snp[snp == 9] = v
                bin_counts = sp.bincount(snp)
            else:
                raise Exception('Imputation type is unknown')
        else:
            missing_count = 0

        assert len(bin_counts) < 4, 'Issues with nucleotides.'
        nt_counts = bin_counts[:3]                    
        if len(nt_counts) == 2:
            nt_counts = sp.array([nt_counts[0], nt_counts[1], 0])
        elif len(nt_counts) == 1:
            nt_counts = sp.array([nt_counts[0], 0, 0])
            

        # Removing monomorphic SNPs
        if filter_monomorphic_snps:
            if max(nt_counts) == sum(nt_counts):
                num_monomorphic_removed += 1
                continue
        
        freq = sp.mean(snp) / 2.0            
        snps_mat.append(snp)
        positions.append(position)
        sids.append(sid)
        nts_list.append(nt)
        nt_counts_list.append(nt_counts)
        missing_counts.append(missing_count)
        freqs.append(freq) 

    # Store everything and reset.
    print 'Number of SNPs removed due to too many missing values: %d' % num_missing_removed
    print 'Number of SNPs removed due to ambiguous location: %d' % num_ambiguous_loc_removed
    print 'Number of monomorphic SNPs removed: %d' % num_monomorphic_removed
    print 'Number of SNPs retained: %d' % len(positions)
    print 'Number of individuals: %d' % num_indiv
    snps = sp.array(snps_mat, dtype='int8')
    h5py_chrom_group = genot_group.create_group('chrom_%s' % chrom)
    h5py_chrom_group.create_dataset('raw_snps', compression='lzf', data=snps)
    h5py_chrom_group.create_dataset('positions', compression='lzf', data=positions)
    h5py_chrom_group.create_dataset('nts', compression='lzf', data=nts_list)
    h5py_chrom_group.create_dataset('nt_counts', compression='lzf', data=nt_counts_list)
    h5py_chrom_group.create_dataset('missing_counts', compression='lzf', data=missing_counts)
    h5py_chrom_group.create_dataset('freqs', compression='lzf', data=freqs)
    h5py_chrom_group.create_dataset('snp_ids', compression='lzf', data=sids)        
    tot_num_snps += len(positions)
    tot_num_missing_val_snps_removed += num_missing_removed
    tot_num_ambiguous_loc_removed += num_ambiguous_loc_removed
    h5py_file.create_dataset('num_snps', data=sp.array(tot_num_snps))
    h5py_file.flush()         
    t1 = time.time()
    t = t1 - t0
    print 'It took %d minutes and %0.2f seconds to parse chromosome %s.' % (t / 60, t % 60, chrom)

    
    gf.close()
    
    print 'Total number of SNPs parsed successfully was: %d' % tot_num_snps
    print 'Total number of SNPs removed due to too many missing values: %d' % tot_num_missing_val_snps_removed
    print 'Total number of SNPs removed due to ambiguous locations: %d' % tot_num_ambiguous_loc_removed
    h5py_file.close()
    
    print 'Done parsing genotypes.'
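
# Sketch (with a made-up tped line) of how the two '12'-coded allele columns per
# individual are collapsed into a single 0/1/2 genotype above: the two allele
# codes (1 or 2) sum to 2..4, subtracting 2 gives 0/1/2, and any pair containing
# a 0 (missing allele) goes negative and is recoded to 9.
import numpy as np

allele_cols = np.array([1, 1,  1, 2,  2, 2,  0, 0], dtype='int8')  # 4 individuals
a = np.arange(allele_cols.size)
snp = allele_cols[a % 2 == 0] + allele_cols[a % 2 == 1] - 2
snp[snp < 0] = 9
print(snp)   # [0 1 2 9]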
예제 #52
0
    def find_neighbor_throats(self, pores, mode='union', flatten=True):
        r"""
        Returns a list of throats neighboring the given pore(s)

        Parameters
        ----------
        pores : array_like
            Indices of pores whose neighbors are sought
        flatten : boolean, optional
            If flatten is True (default) a 1D array of unique throat ID numbers
            is returned. If flatten is False the returned array contains arrays
            of neighboring throat ID numbers for each input pore, in the order
            they were sent.
        mode : string, optional
            Specifies which neighbors should be returned.  The options are:

            * 'union' : All neighbors of the input pores

            * 'intersection' : Only neighbors shared by all input pores

            * 'not_intersection' : Only neighbors not shared by any input pores

        Returns
        -------
        neighborTs : 1D array (if flatten is True) or ndarray of arrays (if
            flatten is False)

        Examples
        --------
        >>> import OpenPNM
        >>> pn = OpenPNM.Network.TestNet()
        >>> pn.find_neighbor_throats(pores=[0, 1])
        array([0, 1, 2, 3, 4, 5])
        >>> pn.find_neighbor_throats(pores=[0, 1],flatten=False)
        array([array([0, 1, 2]), array([0, 3, 4, 5])], dtype=object)
        """
        pores = sp.array(pores, ndmin=1)
        if pores.dtype == bool:
            pores = self.toindices(pores)
        if sp.size(pores) == 0:
            return sp.array([], ndmin=1, dtype=int)
        # Test for existence of incidence matrix
        try:
            neighborTs = self._incidence_matrix['lil'].rows[[pores]]
        except:
            temp = self.create_incidence_matrix(sprsfmt='lil')
            self._incidence_matrix['lil'] = temp
            neighborTs = self._incidence_matrix['lil'].rows[[pores]]
        if [sp.asarray(x) for x in neighborTs if x] == []:
            return sp.array([], ndmin=1)
        if flatten:
            # All the empty lists must be removed to maintain data type after
            # hstack (numpy bug?)
            neighborTs = [sp.asarray(x) for x in neighborTs if x]
            neighborTs = sp.hstack(neighborTs)
            # Remove references to input pores and duplicates
            if mode == 'not_intersection':
                neighborTs = sp.unique(sp.where(sp.bincount(neighborTs) == 1)[0])
            elif mode == 'union':
                neighborTs = sp.unique(neighborTs)
            elif mode == 'intersection':
                neighborTs = sp.unique(sp.where(sp.bincount(neighborTs) > 1)[0])
        else:
            for i in range(0, sp.size(pores)):
                neighborTs[i] = sp.array(neighborTs[i])
        return sp.array(neighborTs, ndmin=1)
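
The mode handling above hinges on sp.bincount over the concatenated neighbor lists: a throat that shows up once touches only one of the input pores, while a throat that shows up more than once is shared. A minimal standalone sketch of that counting trick, using plain numpy (which the sp.* aliases resolve to) and made-up neighbor lists rather than real OpenPNM data:

import numpy as np

# Hypothetical neighbor-throat lists for two pores
neighborTs = [np.array([0, 1, 2]), np.array([0, 3, 4, 5])]
flat = np.hstack(neighborTs)
counts = np.bincount(flat)

union = np.unique(flat)                      # every throat seen at least once
intersection = np.where(counts > 1)[0]       # throats shared by both input pores
not_intersection = np.where(counts == 1)[0]  # throats touching exactly one input pore

print(union)             # [0 1 2 3 4 5]
print(intersection)      # [0]
print(not_intersection)  # [1 2 3 4 5]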
예제 #53
0
    def find_neighbor_throats(self, pores, mode='union', flatten=True):
        r"""
        Returns a list of throats neighboring the given pore(s)

        Parameters
        ----------
        pores : array_like
            Indices of pores whose neighbors are sought
        flatten : boolean, optional
            If flatten is True (default) a 1D array of unique throat ID numbers
            is returned. If flatten is False the returned array contains arrays
            of neighboring throat ID numbers for each input pore, in the order
            they were sent.
        mode : string, optional
            Specifies which neighbors should be returned.  The options are:

            **'union'** : All neighbors of the input pores

            **'intersection'** : Only neighbors shared by all input pores

            **'not_intersection'** : Only neighbors not shared by any input pores

        Returns
        -------
        neighborTs : 1D array (if flatten is True) or ndarray of arrays (if
            flatten is False)

        Examples
        --------
        >>> import OpenPNM
        >>> pn = OpenPNM.Network.TestNet()
        >>> pn.find_neighbor_throats(pores=[0, 1])
        array([0, 1, 2, 3, 4, 5])
        >>> pn.find_neighbor_throats(pores=[0, 1],flatten=False)
        array([array([0, 1, 2]), array([0, 3, 4, 5])], dtype=object)
        """
        pores = self._parse_locations(pores)
        if sp.size(pores) == 0:
            return sp.array([], ndmin=1, dtype=int)
        # Test for existence of incidence matrix
        try:
            neighborTs = self._incidence_matrix['lil'].rows[[pores]]
        except:
            temp = self.create_incidence_matrix(sprsfmt='lil')
            self._incidence_matrix['lil'] = temp
            neighborTs = self._incidence_matrix['lil'].rows[[pores]]
        if flatten:
            # Convert rows of lil into single flat list
            neighborTs = itertools.chain.from_iterable(neighborTs)
            # Convert list to numpy array
            neighborTs = sp.fromiter(neighborTs, dtype=int)
            if mode == 'not_intersection':
                neighborTs = sp.unique(sp.where(sp.bincount(neighborTs) == 1)[0])
            elif mode == 'union':
                neighborTs = sp.unique(neighborTs)
            elif mode == 'intersection':
                neighborTs = sp.unique(sp.where(sp.bincount(neighborTs) > 1)[0])
            return sp.array(neighborTs, ndmin=1, dtype=int)
        else:
            # Convert lists in array to numpy arrays
            neighborTs = [sp.array(neighborTs[i]) for i in range(0, len(pores))]
            return sp.array(neighborTs, ndmin=1)
예제 #54
0
파일: 2b.py 프로젝트: jpdiazp/Tarea1
import scipy as sy

lista2=[]
c=0

# Simulate 1000 trials of 33 random draws from {1, 2} and count how many
# trials contain exactly 18 ones.
for z in range(1000):
    lista = []
    for i in range(33):
        x = sy.random.random_integers(2)  # uniform draw from {1, 2}
        lista.append(x)
    lista2.append(lista)
    if sy.bincount(lista)[1] == 18:  # bincount(lista)[1] is the number of ones drawn
        c += 1

print(c)
print(float(c) / 1000)  # observed proportion of trials with exactly 18 ones
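
For reference, the quantity this loop estimates (the chance of exactly 18 ones in 33 fair draws from {1, 2}) has a closed form; a minimal check with scipy.stats:

from scipy.stats import binom

# P(exactly 18 ones out of 33 draws with p = 0.5), i.e. the value the Monte Carlo loop above estimates
print(binom.pmf(18, 33, 0.5))  # roughly 0.12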
예제 #55
0
    def find_neighbor_pores(self, pores, mode='union', flatten=True, excl_self=True):
        r"""
        Returns a list of pores neighboring the given pore(s)

        Parameters
        ----------
        pores : array_like
            ID numbers of pores whose neighbors are sought.
        flatten : boolean, optional
            If flatten is True (default) a 1D array of unique pore ID numbers is
            returned. If flatten is False the returned array contains arrays
            of neighboring pores for each input pore, in the order they were
            sent.
        excl_self : bool, optional (Default is True)
            If this is True then the input pores are not included in the
            returned list.  This option only applies when input pores
            are in fact neighbors to each other, otherwise they are not
            part of the returned list anyway.
        mode : string, optional
            Specifies which neighbors should be returned.  The options are:

            * 'union' : All neighbors of the input pores

            * 'intersection' : Only neighbors shared by all input pores

            * 'not_intersection' : Only neighbors not shared by any input pores

        Returns
        -------
        neighborPs : 1D array (if flatten is True) or ndarray of ndarrays (if
        flatten is False)

        Examples
        --------
        >>> import OpenPNM
        >>> pn = OpenPNM.Network.TestNet()
        >>> pn.find_neighbor_pores(pores=[0, 2])
        array([ 1,  3,  5,  7, 25, 27])
        >>> pn.find_neighbor_pores(pores=[0, 1])
        array([ 2,  5,  6, 25, 26])
        >>> pn.find_neighbor_pores(pores=[0, 1], mode='union', excl_self=False)
        array([ 0,  1,  2,  5,  6, 25, 26])
        >>> pn.find_neighbor_pores(pores=[0, 2],flatten=False)
        array([array([ 1,  5, 25]), array([ 1,  3,  7, 27])], dtype=object)
        >>> pn.find_neighbor_pores(pores=[0, 2],mode='intersection')
        array([1])
        >>> pn.find_neighbor_pores(pores=[0, 2],mode='not_intersection')
        array([ 3,  5,  7, 25, 27])
        """
        pores = sp.array(pores, ndmin=1)
        if pores.dtype == bool:
            pores = self.toindices(pores)
        if sp.size(pores) == 0:
            return sp.array([], ndmin=1, dtype=int)
        # Test for existence of incidence matrix
        try:
            neighborPs = self._adjacency_matrix['lil'].rows[[pores]]
        except:
            temp = self.create_adjacency_matrix(sprsfmt='lil')
            self._adjacency_matrix['lil'] = temp
            neighborPs = self._adjacency_matrix['lil'].rows[[pores]]
        if [sp.asarray(x) for x in neighborPs if x] == []:
            return sp.array([], ndmin=1)
        if flatten:
            # All the empty lists must be removed to maintain data type after
            # hstack (numpy bug?)
            neighborPs = [sp.asarray(x) for x in neighborPs if x]
            neighborPs = sp.hstack(neighborPs)
            neighborPs = sp.concatenate((neighborPs, pores))
            # Remove references to input pores and duplicates
            if mode == 'not_intersection':
                neighborPs = sp.array(sp.unique(sp.where(
                    sp.bincount(neighborPs) == 1)[0]), dtype=int)
            elif mode == 'union':
                neighborPs = sp.array(sp.unique(neighborPs), int)
            elif mode == 'intersection':
                neighborPs = sp.array(sp.unique(sp.where(
                    sp.bincount(neighborPs) > 1)[0]), dtype=int)
            if excl_self:
                neighborPs = neighborPs[~sp.in1d(neighborPs, pores)]
        else:
            for i in range(0, sp.size(pores)):
                neighborPs[i] = sp.array(neighborPs[i], dtype=int)
        return sp.array(neighborPs, ndmin=1)
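
A small aside on the excl_self flag in the flattened branch above: after the mode logic runs, the input pores themselves are dropped with sp.in1d. A minimal illustration using the numbers from the docstring example:

import numpy as np

neighborPs = np.array([0, 1, 2, 5, 6, 25, 26])  # union result that still contains the inputs
pores = np.array([0, 1])
print(neighborPs[~np.in1d(neighborPs, pores)])  # [ 2  5  6 25 26]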
def itemfreq(a):
    # For each unique value in a: index of first occurrence, the value itself, and its count
    items, ind, inv = sp.unique(a, return_index=True, return_inverse=True)
    freq = sp.bincount(inv)
    return sp.array([ind, items, freq]).T
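
A quick usage sketch for itemfreq, assuming sp is the scipy namespace used throughout these snippets (where sp.unique, sp.bincount and sp.array are the numpy re-exports); each row holds the index of first occurrence, the unique value, and its count:

import scipy as sp

a = sp.array([1, 1, 2, 3, 3, 3])
print(itemfreq(a))
# [[0 1 2]
#  [2 2 1]
#  [3 3 3]]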