Пример #1
0
def readFiles(files, fileType='beagle', chrom=None):
    """Read genotype data for several populations from parallel files.

    Arguments:
    - `files`: list of file names, one per population
    - `fileType`: 'beagle' or 'tped'; any other value writes an error to
      stderr and exits the interpreter with status 1
    - `chrom`: tped input only; rows whose first label column differs from
      this value are skipped (None keeps every row)
    Returns:
    - `pops`: list with one numpy array per input file, built from the
      per-SNP values the reader yields for that file
    - `nPops`: size of the second axis of each array in `pops`
    - `subjects`: for beagle input, the first element of the first record
      yielded by the reader; for tped input, one array per .tfam file
      holding '<id>_a' and '<id>_b' for the first column of every line
    - `nSNPs`: number of SNP rows kept
    - `snpPos`: SNP positions as ints
    - `snpNames`: SNP names
    """
    nFiles = len(files)
    if fileType == 'beagle':
        files = fileReader.concurrentFileReader(*files)
        # Builtin next() works on Python 2.6+ and 3; the .next() method is
        # Python 2 only.
        subjects = next(files)[0]
    elif fileType == 'tped':
        subjects = []
        for tfamName in [f.replace('.tped', '.tfam') for f in files]:
            names = []
            for line in fileReader.openfile(tfamName):
                # NOTE(review): column 0 of a .tfam is presumably the family
                # ID rather than the individual ID -- confirm this is the
                # intended subject label.
                subjectId = line.split(None, 1)[0]
                names.append(subjectId + '_a')
                names.append(subjectId + '_b')
            subjects.append(np.asarray(names))
        files = fileReader.concurrentFileReader(*files,
                                                nHeaders=0,
                                                key=[0, 1],
                                                nLabels=4)
    else:
        sys.stderr.write('ERROR: Filetype has to be either beagle or tped\n')
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    snpNames = []
    snpPos = []
    pops = [[] for _ in range(nFiles)]
    for s, l in files:
        if fileType == 'tped':
            if chrom is not None and chrom != s[0]:
                continue
            # Reduce the four tped label columns to [name, position].
            s = [s[1], s[3]]
        snpNames.append(s[0])
        snpPos.append(int(s[1]))
        for i in range(nFiles):
            pops[i].append(l[i])
    nSNPs = len(snpNames)
    # A real list (not a lazy map) so `pops` is still usable by the caller
    # after `nPops` has been computed from it.
    pops = [np.asarray(pop) for pop in pops]
    nPops = [pop.shape[1] for pop in pops]
    return pops, nPops, subjects, nSNPs, snpPos, snpNames
Пример #2
0
def readFiles(fileNames, isBeagle=True):
    """Read haplotype values for several populations from parallel files.

    Arguments:
    - `fileNames`: list of file names, one per population
    - `isBeagle`: True reads the files as beagle input; False reads them as
      .tped files and takes subject names from the matching .tfam files
    Returns:
    - `subjects`: for beagle input, the first element of the first record
      yielded by the reader; otherwise one array per .tfam file holding
      '<id>_a' and '<id>_b' for the second column of every line
    - `labels`: array holding, for every entry of every element of
      `subjects`, the index of the file it came from
    - `snpNames`: list of SNP names
    - `snpLocations`: array of SNP positions as floats
    - `vals`: transpose of the array built from one
      fileReader.nucleotides2Haplotypes result per SNP
    """
    snpNames = []
    snpLocations = []  # physical location of each SNP
    vals = []  # converted genotype values, one entry per SNP

    if isBeagle:
        files = fileReader.concurrentFileReader(*fileNames, key=0)
        # Builtin next() works on Python 2.6+ and 3; .next() is Python 2 only.
        subjects = next(files)[0]
    else:
        subjects = []
        for tfamName in [f.replace('.tped', '.tfam') for f in fileNames]:
            names = []
            for line in fileReader.openfile(tfamName):
                subjectId = line.split(None, 2)[1]
                names.append(subjectId + '_a')
                names.append(subjectId + '_b')
            subjects.append(np.asarray(names))
        files = fileReader.concurrentFileReader(*fileNames,
                                                nHeaders=0,
                                                key=[0, 1],
                                                nLabels=4)
    # One population index per subject entry, built in a single linear pass.
    labels = np.asarray(
        [i for i, sub in enumerate(subjects) for _ in range(len(sub))])
    for snpInfo, snps in files:
        if isBeagle:
            snpLocations.append(float(snpInfo[1]))
            snpNames.append(snpInfo[0])
        else:
            # tped label columns: name is column 1, position is column 3.
            snpLocations.append(float(snpInfo[3]))
            snpNames.append(snpInfo[1])
        # sum(snps, []) concatenates the per-file lists for this SNP.
        vals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))
    vals = np.asarray(vals).T
    snpLocations = np.asarray(snpLocations)
    return subjects, labels, snpNames, snpLocations, vals
Пример #3
0
def readFiles(fileNames, isBeagle=True):
    """Load SNP names, positions and haplotype values from parallel files.

    Arguments:
    - `fileNames`: list of file names, one per population
    - `isBeagle`: if True the files are read as beagle input; if False they
      are read as .tped files with subject names taken from the
      corresponding .tfam files
    Returns:
    - `subjects`: beagle: first element of the first record yielded by the
      reader; tped: one array per .tfam file with '<id>_a', '<id>_b' built
      from the second column of each line
    - `labels`: array with the source-file index of every entry in
      `subjects`
    - `snpNames`: list of SNP names
    - `snpLocations`: float array of SNP positions
    - `vals`: transposed array of the per-SNP results of
      fileReader.nucleotides2Haplotypes
    """
    snpNames = []
    snpLocations = []  # stores physical location from files
    vals = []  # stores converted genotype values

    if isBeagle:
        files = fileReader.concurrentFileReader(*fileNames, key=0)
        # next(files) instead of files.next(): the method form only exists
        # on Python 2 iterators.
        subjects = next(files)[0]
    else:
        tfamNames = [f.replace('.tped', '.tfam') for f in fileNames]
        subjects = []
        for tfamName in tfamNames:
            haplotypeNames = []
            for line in fileReader.openfile(tfamName):
                # Split once per line and reuse the ID for both haplotypes.
                sampleId = line.split(None, 2)[1]
                haplotypeNames.extend([sampleId + '_a', sampleId + '_b'])
            subjects.append(np.asarray(haplotypeNames))
        files = fileReader.concurrentFileReader(*fileNames, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    labels = np.asarray(
        [i for i, sub in enumerate(subjects) for _ in range(len(sub))])
    for snpInfo, snps in files:
        if isBeagle:
            snpLocations.append(float(snpInfo[1]))
            snpNames.append(snpInfo[0])
        else:
            snpLocations.append(float(snpInfo[3]))
            snpNames.append(snpInfo[1])
        # Concatenate the per-file value lists before conversion.
        vals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))
    vals = np.asarray(vals).T
    snpLocations = np.asarray(snpLocations)
    return subjects, labels, snpNames, snpLocations, vals
Пример #4
0
def classify(fileNames,
             smoother,
             classifier=regionClassifier.SVMpymvpa(C),
             win_size=100,
             CHR='chr1',
             mapFile='data/hapmap2/genetic_map_%(CHR)s_b36.txt'):
    """Deconvolves ancestry in last file based on ancestral
    populations in first files.

    The SNPs are processed in consecutive windows of `win_size` SNPs; the
    final window may be shorter.

    Arguments:
    - `fileNames`: list of fileNames; the last one is treated as the test
      (admixed) set, all earlier ones as training sets
    - `smoother`: callable invoked once as
      smoother(mapLocations, ancestralSuccess, admixedClassPre) and
      expected to return a pair (admixedClass, p)
    - `classifier`: callable invoked per window as
      classifier(trainVals, trainLabels, testVals) and expected to return
      a pair (ancestral, admixed)
    - `win_size`: number of snps in each window (default=100)
    - `CHR`: chromosome name, substituted into `mapFile`
    - `mapFile`: %-style template of the genetic map file name
    Returns:
    - `admixedClassPre`: array of the per-window classifier output for the
      test samples, before smoothing
    - `admixedClass`: first value returned by `smoother`
    - `p`: second value returned by `smoother`
    - subjects of the last file
    - `snpLocations`: per-window lists of SNP positions (floats)
    - `snpNames`: per-window lists of SNP names
    """
    from itertools import islice  # local import keeps the block self-contained

    snpLocations = []  # physical locations, one list per window
    snpNames = []  # snp names, one list per window
    ancestralSuccess = []  # first classifier output per window
    admixedClass = []  # second classifier output per window
    files = fileReader.concurrentFileReader(*fileNames, key=1)

    # Builtin next() works on Python 2.6+ and 3; .next() is Python 2 only.
    subjects = next(files)[0]
    # Plain sum over a generator: np.sum(map(...)) breaks on Python 3.
    nTrain = sum(len(sub) for sub in subjects[:-1])
    nTest = len(subjects[-1])
    labelsTrain = [i for i, sub in enumerate(subjects[:-1])
                   for _ in range(len(sub))]
    vals = np.zeros((nTrain + nTest, win_size))  # reused window buffer
    while True:
        # Pull at most one window of rows.  An empty window means the input
        # ended exactly on a window boundary (or had no SNP rows); stopping
        # here avoids re-classifying stale data forever, which the previous
        # index-based termination test did in that situation.
        window = list(islice(files, win_size))
        if not window:
            break
        rsIds = []
        pos = []
        for col, ([snpName, snpLocation], snps) in enumerate(window):
            pos.append(float(snpLocation))
            rsIds.append(snpName)
            vals[:, col] = fileReader.nucleotides2Haplotypes(sum(snps, []))
        nCols = len(window)
        snpLocations.append(pos)
        snpNames.append(rsIds)
        # Only the first nCols columns hold data for this window.
        ancestral, admixed = classifier(vals[:nTrain, :nCols], labelsTrain,
                                        vals[-nTest:, :nCols])
        ancestralSuccess.append(ancestral)
        admixedClass.append(admixed)
        if nCols < win_size:
            # Short window: the input is exhausted.
            break
    admixedClassPre = np.array(admixedClass)

    # Figure out mapLocations.  locals() is kept as the substitution mapping
    # so that any parameter name (e.g. CHR) can appear in the template.
    gm = popgen.geneticMap(mapFile % locals())
    mapLocations = gm.pos2gm(np.hstack(snpLocations))
    admixedClass, p = smoother(mapLocations, ancestralSuccess, admixedClassPre)
    return admixedClassPre, admixedClass, p, subjects[
        -1], snpLocations, snpNames
Пример #5
0
def fstFile(file1, file2, isNorm=True):
    """Calculates fst based on data stored in two tab delimited files.
    Each file must contain the following columns: snp names; snp position; snp values.

    Arguments:
    - `file1`, `file2`: names of the two files to compare
    - `isNorm`: passed through unchanged as the third argument of `fst`
    Returns:
    - the result of `fst` applied to the columns belonging to the first
      file and the remaining columns
    """
    import fileReader
    files = fileReader.concurrentFileReader(file1, file2, key=1)
    # Builtin next() works on Python 2.6+ and 3; .next() is Python 2 only.
    subjects = next(files)[0]
    vals = []
    # The key is unpacked (and discarded) so a malformed row still fails
    # loudly instead of being silently accepted.
    for ([snpName, snpLocation], snps) in files:
        vals.append(nucleotides2Haplotypes(sum(snps, [])))
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy; the builtin yields the same dtype.
    vals = np.asarray(vals, float)
    # The first len(subjects[0]) columns belong to file1, the rest to file2.
    nSamples = len(subjects[0])
    return fst(vals[:, :nSamples], vals[:, nSamples:], isNorm)
Пример #6
0
def fstFile(file1, file2, isNorm=True):
    """Calculates fst based on data stored in two tab delimited files.
    Each file must contain the following columns: snp names; snp position; snp values.

    Arguments:
    - `file1`, `file2`: names of the two input files
    - `isNorm`: forwarded as the third argument of `fst`
    Returns:
    - whatever `fst` returns for the value columns of the first file versus
      the value columns of the second file
    """
    import fileReader
    reader = fileReader.concurrentFileReader(file1, file2, key=1)
    # next(reader) rather than reader.next(): the method only exists on
    # Python 2 iterators.
    subjects = next(reader)[0]
    # One converted row per SNP; the two-element key is unpacked only to
    # keep the original shape check on each record.
    rows = [nucleotides2Haplotypes(sum(snps, []))
            for (_snpName, _snpLocation), snps in reader]
    # Builtin float replaces the removed np.float alias (same dtype).
    vals = np.asarray(rows, float)
    # Columns are split at the number of subjects in the first file.
    nSamples = len(subjects[0])
    return fst(vals[:, :nSamples], vals[:, nSamples:], isNorm)
Пример #7
0
def readFiles(files, fileType='beagle', chrom=None):
    """Load per-population genotype arrays from parallel beagle/tped files.

    Arguments:
    - `files`: list of file names, one per population
    - `fileType`: either 'beagle' or 'tped'; anything else prints an error
      on stderr and exits with status 1
    - `chrom`: only used for tped input; when not None, rows whose first
      label column is different are skipped
    Returns:
    - `pops`: list of numpy arrays, one per input file, each built from the
      values the reader yields for that file at every kept SNP
    - `nPops`: second-axis length of each array in `pops`
    - `subjects`: beagle: first element of the reader's first record;
      tped: one array per .tfam file with '<id>_a', '<id>_b' derived from
      the first column of each line
    - `nSNPs`: number of SNPs kept
    - `snpPos`: list of int positions
    - `snpNames`: list of SNP names
    """
    nFiles = len(files)
    if fileType == 'beagle':
        files = fileReader.concurrentFileReader(*files)
        # next(files) is portable; files.next() exists on Python 2 only.
        subjects = next(files)[0]
    elif fileType == 'tped':
        tfamNames = [f.replace('.tped', '.tfam') for f in files]
        subjects = []
        for tfamName in tfamNames:
            haplotypeNames = []
            for line in fileReader.openfile(tfamName):
                # Split each line once and reuse the ID for both haplotypes.
                sampleId = line.split(None, 1)[0]
                haplotypeNames.extend([sampleId + '_a', sampleId + '_b'])
            subjects.append(np.asarray(haplotypeNames))
        files = fileReader.concurrentFileReader(*files,
                                                nHeaders=0,
                                                key=[0, 1],
                                                nLabels=4)
    else:
        sys.stderr.write('ERROR: Filetype has to be either beagle or tped\n')
        # Signal the failure through the exit status as well.
        sys.exit(1)
    snpNames = []
    snpPos = []
    pops = [[] for _ in range(nFiles)]
    for s, l in files:
        if fileType == 'tped':
            if chrom is not None and chrom != s[0]:
                continue
            # Keep only [name, position] out of the four tped label columns.
            s = [s[1], s[3]]
        snpNames.append(s[0])
        snpPos.append(int(s[1]))
        for i in range(nFiles):
            pops[i].append(l[i])
    nSNPs = len(snpNames)
    # Materialise as a list: a lazy map would be exhausted by the nPops
    # computation on Python 3 and returned empty.
    pops = [np.asarray(pop) for pop in pops]
    nPops = [pop.shape[1] for pop in pops]
    return pops, nPops, subjects, nSNPs, snpPos, snpNames
Пример #8
0
def classify(fileNames, smoother, classifier=regionClassifier.SVMpymvpa(C), win_size=100, CHR='chr1', mapFile='data/hapmap2/genetic_map_%(CHR)s_b36.txt'):
    """Deconvolves ancestry in last file based on ancestral
    populations in first files.

    SNPs are consumed in consecutive windows of `win_size`; the last window
    may contain fewer SNPs.

    Arguments:
    - `fileNames`: list of fileNames; the last file is the test (admixed)
      set and the preceding files are the training sets
    - `smoother`: callable called as
      smoother(mapLocations, ancestralSuccess, admixedClassPre); must
      return a pair (admixedClass, p)
    - `classifier`: callable called for each window as
      classifier(trainVals, trainLabels, testVals); must return a pair
      (ancestral, admixed)
    - `win_size`: number of snps in each window (default=100)
    - `CHR`: chromosome name inserted into `mapFile`
    - `mapFile`: %-style template for the genetic map file name
    Returns:
    - `admixedClassPre`: array of per-window classifications of the test
      samples before smoothing
    - `admixedClass`: first value returned by `smoother`
    - `p`: second value returned by `smoother`
    - subjects of the last file
    - `snpLocations`: list of per-window position lists (floats)
    - `snpNames`: list of per-window SNP name lists
    """
    from itertools import islice  # local import keeps the block self-contained

    snpLocations = []      # physical locations, one list per window
    snpNames = []          # snp names, one list per window
    ancestralSuccess = []  # first classifier output for each window
    admixedClass = []      # second classifier output for each window
    files = fileReader.concurrentFileReader(*fileNames, key=1)

    # next(files) is portable; files.next() exists on Python 2 only.
    subjects = next(files)[0]
    # Builtin sum over a generator; np.sum(map(...)) fails on Python 3.
    nTrain = sum(len(sub) for sub in subjects[:-1])
    nTest = len(subjects[-1])
    labelsTrain = [i for i, sub in enumerate(subjects[:-1]) for _ in range(len(sub))]
    vals = np.zeros((nTrain + nTest, win_size))  # buffer reused by every window
    while True:
        # Read up to win_size rows.  Getting none means the data ended on a
        # window boundary (or there were no rows at all); previously that
        # case left the loop index stale and the loop never terminated.
        window = list(islice(files, win_size))
        if not window:
            break
        rsIds = []
        pos = []
        for col, ([snpName, snpLocation], snps) in enumerate(window):
            pos.append(float(snpLocation))
            rsIds.append(snpName)
            vals[:, col] = fileReader.nucleotides2Haplotypes(sum(snps, []))
        nCols = len(window)
        snpLocations.append(pos)
        snpNames.append(rsIds)
        # Slice to nCols so a short final window ignores leftover columns.
        ancestral, admixed = classifier(vals[:nTrain, :nCols], labelsTrain, vals[-nTest:, :nCols])
        ancestralSuccess.append(ancestral)
        admixedClass.append(admixed)
        if nCols < win_size:
            # A short window can only be the last one.
            break
    admixedClassPre = np.array(admixedClass)

    # Figure out mapLocations.  locals() stays as the mapping so templates
    # may reference any parameter name (e.g. CHR).
    gm = popgen.geneticMap(mapFile % locals())
    mapLocations = gm.pos2gm(np.hstack(snpLocations))
    admixedClass, p = smoother(mapLocations, ancestralSuccess, admixedClassPre)
    return admixedClassPre, admixedClass, p, subjects[-1], snpLocations, snpNames
Пример #9
0
import pylab, fileReader, sys, numpy as np
from scipy.linalg import svd

# Read all files named on the command line in lockstep.
files = fileReader.concurrentFileReader(*sys.argv[1:])

# The first record carries the subjects of each file.  Builtin next() works
# on Python 2.6+ and 3; the .next() method is Python 2 only.
subjects = next(files)[0]
snpLabels = []     # stores snp labels from in files
snpLocations = []  # stores physical location from files
snpVals = []       # one converted haplotype row per SNP
for snpInfo, snps in files:
    snpLabels.append(snpInfo[0])
    snpLocations.append(float(snpInfo[1]))
    # sum(snps, []) concatenates the per-file lists for this SNP.
    snpVals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))

snps = np.asarray(snpVals)

[u, s, vt] = svd(snps, 0)

# A real list, so the sizes stay available on Python 3 where map() is lazy.
nPops = [len(sub) for sub in subjects]
# One colour per input file; the last file is drawn in red.
colors = pylab.cm.copper(np.linspace(0, 1, len(subjects)))
colors[-1, :] = [1, 0, 0, 1]

# Samples are laid out file after file along the columns of vt, so each
# file occupies the slice idx0:idx1.
idx0 = 0
for i, sub in enumerate(subjects):
    idx1 = len(sub) + idx0
    # NOTE(review): rows 1 and 2 of vt are plotted and row 0 is skipped.
    # The data is not mean-centred before the SVD, so row 0 presumably
    # captures the mean -- confirm that the PC1/PC2 labels are intended.
    pylab.plot(vt[1, idx0:idx1], vt[2, idx0:idx1], '.', markersize=10, color=colors[i, :])
    idx0 = idx1
pylab.xlabel('PC1')
pylab.ylabel('PC2')
# Legend entries are the file names up to their first dot.
pylab.legend([l.split('.')[0] for l in sys.argv[1:]])
pylab.show()
Пример #10
0
#Create dictionary from name to population
popDict = {}
# Context managers close both label files as soon as they have been read.
with open(POPLABEL_FILE) as labelFile:
    for line in labelFile:
        # Comma-separated: column 2 is the key, column 7 the value.
        fields = line.split(',')
        popDict[fields[2]] = fields[7]
with open(QATARLABEL_FILE) as labelFile:
    for line in labelFile:
        # Tab-separated: everything after the first column must be exactly
        # two fields (individual, population).
        ind, pop = line.strip().split('\t')[1:]
        popDict[ind] = pop
# Drop the entry created by a header row (raises KeyError if it is absent).
popDict.pop('Sample ID')

###############################################
#  Perform PCA
###############################################
#Read monolithic file containing all populations
snpLabels = []     # stores snp labels from in files
snpLocations = []  # stores physical location from files
snpVals = []
for i in range(1, 23):
    CHR = 'chr%i' % i
    # The file-name templates are filled from the current namespace (CHR).
    files = fileReader.concurrentFileReader(HGDP_FILE % locals(), QATARFILES % locals())
    # Builtin next() works on Python 2.6+ and 3; .next() is Python 2 only.
    subjects = next(files)[0]
    # No index here: the previous inner loop reused `i` and shadowed the
    # chromosome counter.
    for (snpName, snpLocation), snps in files:
        snpLabels.append(snpName)
        snpLocations.append(float(snpLocation))
        snpVals.append(fileReader.nucleotides2SNPs(sum(snps, [])))
# `subjects` is the value left over from the last chromosome read.
subjects = np.hstack(subjects)
snpVals = np.asarray(snpVals)
# Every second subject, with its last two characters removed, is looked up
# in popDict (None when missing).
popLabels = [popDict.get(s[:-2]) for s in subjects[::2]]
nSNPs, nSamples = snpVals.shape
#Normalize markers
snpVals = snpVals - snpVals.mean(1)[:, np.newaxis]  # Mean center each row
# NOTE(review): a row that is constant across samples has zero norm after
# centring and becomes NaN here -- confirm such markers are filtered upstream.
for i in range(nSNPs):
    snpVals[i, :] = snpVals[i, :] / np.sqrt(np.dot(snpVals[i, :], snpVals[i, :]))  # Scale row to unit norm

#Compute SVD and store results
U, S, Vt = svd(snpVals, 0)
Пример #11
0
import pylab, fileReader, sys, numpy as np
from scipy.linalg import svd

# Read every file given on the command line in parallel.
files = fileReader.concurrentFileReader(*sys.argv[1:])

# First record holds the subjects of each file.  next(files) is portable;
# files.next() exists on Python 2 only.
subjects = next(files)[0]
snpLabels = []  #stores snp labels from in files
snpLocations = []  #stores physical location from files
snpVals = []  #one converted haplotype row per SNP
for snpInfo, snps in files:
    snpLabels.append(snpInfo[0])
    snpLocations.append(float(snpInfo[1]))
    # sum(snps, []) joins the per-file lists for this SNP into one list.
    snpVals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))

snps = np.asarray(snpVals)

[u, s, vt] = svd(snps, 0)

# List comprehension instead of map(): map() is lazy on Python 3.
nPops = [len(sub) for sub in subjects]
# One colour per input file; the last file is drawn in red.
colors = pylab.cm.copper(np.linspace(0, 1, len(subjects)))
colors[-1, :] = [1, 0, 0, 1]

# Each file's samples occupy the contiguous column range idx0:idx1 of vt.
idx0 = 0
for i, sub in enumerate(subjects):
    idx1 = len(sub) + idx0
    # NOTE(review): rows 1 and 2 of vt are plotted, row 0 is skipped; the
    # data is not mean-centred before the SVD -- confirm this is intended.
    pylab.plot(vt[1, idx0:idx1],
               vt[2, idx0:idx1],
               '.',
               markersize=10,
               color=colors[i, :])
    idx0 = idx1