Пример #1
0
def loadData(bfile,
             extractSim,
             phenoFile,
             missingPhenotype='-9',
             loadSNPs=False,
             standardize=True):
    """Open ``bfile`` with ``Bed`` and optionally subset, align and load it.

    Args:
        bfile: Path handed to ``Bed``.
        extractSim: Optional path to a CSV file; the first field of every
            non-empty row is taken as a SNP id to keep. ``None`` keeps all
            SNPs.
        phenoFile: Optional phenotype file. When given, it is passed to
            ``loadPheno`` together with the reader and ``missingPhenotype``.
        missingPhenotype: Forwarded unchanged to ``loadPheno``.
        loadSNPs: When true, ``read()`` is called on the reader.
        standardize: When true (and ``loadSNPs`` is true), ``standardize()``
            is called on the data that was read.

    Returns:
        A ``(bed, phe)`` tuple. ``phe`` is ``None`` when ``phenoFile`` is
        ``None``.
    """
    bed = Bed(bfile)

    if extractSim is not None:
        # ``with`` closes the handle even if parsing raises part-way through.
        with open(extractSim) as f:
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on row[0].
            extractSnpsSet = {row[0] for row in csv.reader(f) if row}
        # enumerate() replaces the Python-2-only xrange() index loop.
        keepSnpsInds = [
            i for i, sid in enumerate(bed.sid) if sid in extractSnpsSet
        ]
        bed = bed[:, keepSnpsInds]

    phe = None
    if phenoFile is not None:
        bed, phe = loadPheno(bed, phenoFile, missingPhenotype)

    if loadSNPs:
        bed = bed.read()
        if standardize:
            bed = bed.standardize()

    return bed, phe
Пример #2
0
def load_data(snp_file, pheno_file, covar_file):
    """Load SNPs, phenotype and optional covariates and build a kinship matrix.

    The SNP reader, the phenotype and (when ``covar_file`` is given) the
    covariates are passed through ``srutil.intersect_apply`` before the SNPs
    are read and standardized.

    Note that ``Y`` is ``pheno['vals']`` centered and scaled with in-place
    operators, so the returned ``pheno`` presumably carries the normalized
    values as well (assuming ``pheno['vals']`` is a NumPy array).

    Returns:
        ``(snp_data, pheno, covar, X, Y, K)`` where ``covar`` is ``None`` when
        no covariate file was supplied, ``X`` is the rescaled SNP matrix and
        ``K = X X^T``.
    """
    reader = Bed(snp_file)
    pheno = pysnptools.util.pheno.loadPhen(pheno_file)

    # Restrict all inputs to a common set of samples.
    if covar_file is None:
        reader, pheno = srutil.intersect_apply([reader, pheno])
        covar = None
    else:
        covar_data = pysnptools.util.pheno.loadPhen(covar_file)
        reader, pheno, covar_data = srutil.intersect_apply(
            [reader, pheno, covar_data])
        covar = covar_data['vals']

    snp_data = reader.read().standardize()

    # Center and scale the phenotype column-wise, in place.
    Y = pheno['vals']
    Y -= Y.mean(0)
    Y /= Y.std(0)

    # Rescale the genotypes by the root of (sum of squares / sample count).
    mean_square = (snp_data.val**2).sum() / float(snp_data.iid_count)
    X = 1. / np.sqrt(mean_square) * snp_data.val
    K = X.dot(X.T)  # TODO use symmetric dot to speed this up

    assert np.all(pheno['iid'] == snp_data.iid), "the samples are not sorted"

    return snp_data, pheno, covar, X, Y, K
def write_grm(K, out_file, num_snps=500000, K_index=None, bed_nold_file=None):
    """Write the lower triangle (including the diagonal) of ``K`` as a GRM table.

    The output is a gzip-compressed, tab-separated file without header or
    index. Each row holds ``i``, ``j`` (1-based row/column indices),
    ``num_SNPs`` and the matrix entry ``K[i-1, j-1]``.

    Args:
        K: Square 2-D array.
        out_file: Destination path; always written with gzip compression.
        num_snps: Value written to the ``num_SNPs`` column when
            ``bed_nold_file`` is not given.
        K_index: Sequence mapping each row of ``K`` to an individual id found
            in the second ``iid`` column of ``bed_nold_file``. Required when
            ``bed_nold_file`` is given.
        bed_nold_file: Optional Bed file. When given, the ``num_SNPs`` column
            is replaced by the number of SNPs that are non-missing in both
            individuals of each pair.

    Raises:
        AssertionError: If ``bed_nold_file`` is given without ``K_index``.
    """
    n = K.shape[0]

    # Floor division keeps the row count an int; under Python 3 `/` yields a
    # float, which NumPy rejects as an array dimension.
    num_pairs = n * (n - 1) // 2 + n
    grm = np.zeros((num_pairs, 4))
    tril_indices = np.tril_indices(n)
    grm[:, 0] = tril_indices[0] + 1
    grm[:, 1] = tril_indices[1] + 1
    grm[:, 2] = num_snps
    grm[:, 3] = K[tril_indices]

    # Optionally replace the constant SNP count with per-pair counts of
    # jointly non-missing SNPs.
    if bed_nold_file is not None:
        assert K_index is not None, 'K_index not provided'
        from pysnptools.snpreader.bed import Bed
        bed_nold = Bed(bed_nold_file, count_A1=True).read()
        # The np.float / np.int aliases were removed from NumPy; the builtins
        # are what they aliased.
        notNan = (~np.isnan(bed_nold.val)).astype(float)
        notNan_K = notNan.dot(notNan.T)
        id2ind = {
            ind: ind_i
            for ind_i, ind in enumerate(bed_nold.iid[:, 1].astype(int))
        }
        rows_nold = np.array(
            [id2ind[K_index[ind]] for ind in tril_indices[0]], dtype=int)
        cols_nold = np.array(
            [id2ind[K_index[ind]] for ind in tril_indices[1]], dtype=int)
        # Index with two separate arrays: a list of lists is no longer
        # interpreted as a (rows, cols) pair by NumPy.
        grm[:, 2] = notNan_K[rows_nold, cols_nold]

    pd_grm = pd.DataFrame(grm, columns=['i', 'j', 'num_SNPs', 'K'])
    pd_grm['i'] = pd_grm['i'].astype(int)
    pd_grm['j'] = pd_grm['j'].astype(int)
    pd_grm.to_csv(out_file, compression='gzip', header=False, index=False,
                  sep='\t', float_format='%0.6e')
Пример #4
0
def loadData(bfile, extractSim, phenoFile, missingPhenotype='-9', loadSNPs=False, standardize=True):
    """Open ``bfile`` with ``Bed`` and optionally filter SNPs, attach phenotypes and read.

    Args:
        bfile: Path handed to ``Bed``.
        extractSim: Optional CSV path; the first field of each non-empty row
            is a SNP id to keep. ``None`` disables the filter.
        phenoFile: Optional phenotype file, processed by ``loadPheno``.
        missingPhenotype: Forwarded unchanged to ``loadPheno``.
        loadSNPs: When true, the reader is materialized via ``read()``.
        standardize: When true (and ``loadSNPs`` is true), the read data is
            passed through ``standardize()``.

    Returns:
        ``(bed, phe)``; ``phe`` is ``None`` if ``phenoFile`` is ``None``.
    """
    bed = Bed(bfile)

    if extractSim is not None:
        # Context manager: the file is closed even when csv parsing fails.
        with open(extractSim) as f:
            # Blank lines come back as empty rows; ignore them rather than
            # raising IndexError on row[0].
            extractSnpsSet = {row[0] for row in csv.reader(f) if row}
        # enumerate() works on both Python 2 and 3, unlike xrange().
        keepSnpsInds = [i for i, sid in enumerate(bed.sid) if sid in extractSnpsSet]
        bed = bed[:, keepSnpsInds]

    phe = None
    if phenoFile is not None:
        bed, phe = loadPheno(bed, phenoFile, missingPhenotype)

    if loadSNPs:
        bed = bed.read()
        if standardize:
            bed = bed.standardize()

    return bed, phe
Пример #5
0
def _snps_fixup(snp_input, iid_if_none=None,count_A1=None):
    if isinstance(snp_input, str):
        return Bed(snp_input,count_A1=count_A1)

    if isinstance(snp_input, dict):
        return SnpData(iid=snp_input['iid'],sid=snp_input['header'],val=snp_input['vals'])

    if snp_input is None:
        assert iid_if_none is not None, "snp_input cannot be None here"
        return SnpData(iid_if_none, sid=np.empty((0),dtype='str'), val=np.empty((len(iid_if_none),0)),pos=np.empty((0,3)),name="") #todo: make a static factory method on SnpData

    return snp_input
Пример #6
0
def getChromosome(bfile, chrom):
    """Read and standardize only the SNPs whose first ``pos`` column equals ``chrom``."""
    reader = Bed(bfile)
    # Boolean mask over SNPs, built from column 0 of the position table.
    on_chrom = reader.pos[:, 0] == chrom
    return reader[:, on_chrom].read().standardize()
Пример #7
0
def _fixupBed(bed):
    if isinstance(bed, str):
        return Bed(bed).read().standardize()
    else:
        return bed
Пример #8
0
def getChromosome(bfile, chrom):
    """Return the read, standardized SNPs of ``bfile`` located on ``chrom``.

    A SNP is selected when column 0 of the reader's ``pos`` table equals
    ``chrom``.
    """
    source = Bed(bfile)
    chrom_mask = (source.pos[:, 0] == chrom)
    subset = source[:, chrom_mask]
    loaded = subset.read()
    return loaded.standardize()
Пример #9
0
def getExcludedChromosome(bfile, chrom):
    """Read and standardize every SNP whose first ``pos`` column differs from ``chrom``."""
    reader = Bed(bfile)
    # Mask is True for SNPs that are NOT on the given chromosome.
    off_chrom = reader.pos[:, 0] != chrom
    return reader[:, off_chrom].read().standardize()
Пример #10
0
import pandas as pd
import numpy as np
import leap.leapUtils as leapUtils
import leap.leapMain as leapMain
from pysnptools.snpreader.bed import Bed

# Analysis configuration: dataset file prefix (passed to Bed and to the
# leapUtils helpers), the phenotype file derived from it, the chromosomes to
# iterate over (1 through 10), and a prevalence value.
# NOTE(review): prevalence and phenoFile are not used in the visible part of
# this script; presumably they are consumed further down.
bfile = 'dataset1/dataset1'
phenoFile = bfile + '.phe'
chromosomes = range(1, 11)
prevalence = 0.001

# Find individuals to exclude to eliminate relatedness (kinship coeff > 0.05).
# The full dataset is read and standardized once for this step.
bed = Bed(bfile).read().standardize()
indsToKeep = leapUtils.findRelated(bed, cutoff=0.05)

# Iterate over each chromosome.
# NOTE(review): frame_list is never appended to in the visible code;
# presumably per-chromosome results are collected into it later.
frame_list = []
for chrom in chromosomes:
    print()
    print('Analyzing chromosome', chrom, '...')

    # Create a bed object excluding SNPs from the current chromosome
    bedExclude = leapUtils.getExcludedChromosome(bfile, chrom)

    # Create a bed object including only SNPs from the current chromosome
    bedTest = leapUtils.getChromosome(bfile, chrom)

    # Compute the eigendecomposition from the SNPs outside the current
    # chromosome; the same output file name is reused on every iteration.
    eigenFile = 'temp_eigen.npz'
    eigen = leapMain.eigenDecompose(bedExclude, outFile=eigenFile)
Пример #11
0
def getExcludedChromosome(bfile, chrom):
    """Return the read, standardized SNPs of ``bfile`` that are not on ``chrom``.

    The file is opened with ``count_A1=True``. A SNP is kept when column 0 of
    the reader's ``pos`` table differs from ``chrom``.
    """
    source = Bed(bfile, count_A1=True)
    keep_mask = source.pos[:, 0] != chrom
    remaining = source[:, keep_mask]
    return remaining.read().standardize()