def readFiles(files, fileType='beagle', chrom=None):
    """Read genotype files in 'beagle' or 'tped' format.

    Arguments:
    - `files`: list of file names, one per population
    - `fileType`: either 'beagle' or 'tped'
    - `chrom`: (tped only) if given, keep only SNPs whose chromosome
      column equals this value

    Returns (pops, nPops, subjects, nSNPs, snpPos, snpNames).
    Exits the process with status 1 on an unknown `fileType`.
    """
    nFiles = len(files)
    if fileType == 'beagle':
        files = fileReader.concurrentFileReader(*files)
        subjects = files.next()[0]  # first yield carries the subject header
    elif fileType == 'tped':
        # Subject ids come from the companion .tfam files; each subject
        # contributes two haplotypes, suffixed '_a' and '_b'.
        tfams = [fileReader.openfile(f.replace('.tped', '.tfam'))
                 for f in files]
        subjects = []
        for f in tfams:
            subs = [[l.split(None, 1)[0] + '_a', l.split(None, 1)[0] + '_b']
                    for l in f]
            subjects.append(np.asarray(sum(subs, [])))
        files = fileReader.concurrentFileReader(*files, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    else:
        # BUG FIX: exit with a non-zero status on bad input (the old code
        # exited with status 0) and terminate the message with a newline.
        sys.stderr.write('ERROR: Filetype has to be either beagle or tped\n')
        sys.exit(1)
    snpNames = []
    snpPos = []
    pops = [[] for i in range(nFiles)]
    for s, l in files:
        if fileType == 'tped':
            if chrom is not None and chrom != s[0]:
                continue  # SNP is on a different chromosome
            s = [s[1], s[3]]  # normalize to [snp name, physical position]
        snpNames.append(s[0])
        snpPos.append(int(s[1]))
        for i in range(nFiles):
            pops[i].append(l[i])
    nSNPs = len(snpNames)
    # List comprehension instead of map(): identical on Python 2, and avoids
    # returning a one-shot iterator on Python 3 that the nPops comprehension
    # below would silently exhaust before `pops` is returned.
    pops = [np.asarray(p) for p in pops]
    nPops = [p.shape[1] for p in pops]
    return pops, nPops, subjects, nSNPs, snpPos, snpNames
def readFiles(fileNames, isBeagle=True):
    """Load genotype data from beagle or tped files.

    Arguments:
    - `fileNames`: list of input file names, one per population
    - `isBeagle`: True for beagle format, False for tped

    Returns (subjects, labels, snpNames, snpLocations, vals) where
    `labels` assigns each subject the index of its source file and
    `vals` holds one haplotype-coded column per subject.
    """
    snpNames = []      # snp identifiers, in read order
    snpLocations = []  # physical locations from the files
    vals = []          # haplotype-coded genotype rows
    if isBeagle:
        files = fileReader.concurrentFileReader(*fileNames, key=0)
        subjects = files.next()[0]
    else:
        # Subject ids live in the companion .tfam files; every subject
        # yields two haplotype columns ('_a' and '_b').
        subjects = []
        for tfam in [fileReader.openfile(f.replace('.tped', '.tfam'))
                     for f in fileNames]:
            pairs = [[line.split(None, 2)[1] + '_a',
                      line.split(None, 2)[1] + '_b'] for line in tfam]
            subjects.append(np.asarray(sum(pairs, [])))
        files = fileReader.concurrentFileReader(*fileNames, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    # Population label = index of the file each subject came from.
    labels = np.asarray([idx for idx, group in enumerate(subjects)
                         for _ in range(len(group))])
    for snpInfo, snps in files:
        if isBeagle:
            snpLocations.append(float(snpInfo[1]))
            snpNames.append(snpInfo[0])
        else:
            snpLocations.append(float(snpInfo[3]))
            snpNames.append(snpInfo[1])
        vals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))
    vals = np.asarray(vals).T
    snpLocations = np.asarray(snpLocations)
    return subjects, labels, snpNames, snpLocations, vals
def readFiles(fileNames, isBeagle=True):
    """Read beagle or tped genotype files.

    Arguments:
    - `fileNames`: list of input files, one per population
    - `isBeagle`: selects the input format

    Returns (subjects, labels, snpNames, snpLocations, vals).
    """
    snpNames = []
    snpLocations = []  # stores physical location from files
    vals = []          # stores values of genotypes
    if isBeagle:
        files = fileReader.concurrentFileReader(*fileNames, key=0)
        subjects = files.next()[0]
    else:
        tfamNames = [name.replace('.tped', '.tfam') for name in fileNames]
        subjects = []
        for handle in [fileReader.openfile(name) for name in tfamNames]:
            # Each .tfam row contributes a pair of haplotype ids.
            twoPerLine = [[row.split(None, 2)[1] + '_a',
                           row.split(None, 2)[1] + '_b'] for row in handle]
            subjects.append(np.asarray(sum(twoPerLine, [])))
        files = fileReader.concurrentFileReader(*fileNames, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    labels = np.asarray(sum([[fileIdx] * len(group)
                             for fileIdx, group in enumerate(subjects)], []))
    # Column positions of (name, location) differ between the two formats.
    nameCol, locCol = (0, 1) if isBeagle else (1, 3)
    for snpInfo, snps in files:
        snpLocations.append(float(snpInfo[locCol]))
        snpNames.append(snpInfo[nameCol])
        vals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))
    vals = np.asarray(vals).T
    snpLocations = np.asarray(snpLocations)
    return subjects, labels, snpNames, snpLocations, vals
def classify(fileNames, smoother, classifier=regionClassifier.SVMpymvpa(C),
             win_size=100, CHR='chr1',
             mapFile='data/hapmap2/genetic_map_%(CHR)s_b36.txt'):
    """Deconvolves ancestry in last file based on ancestral populations in
    first files.

    Arguments:
    - `fileNames`: list of fileNames; the last file holds the admixed samples
    - `smoother`: callable applied to the raw per-window classifications
    - `classifier`: window classifier (default: SVM via pymvpa)
    - `win_size`: number of snps in each window (default=100)
    - `CHR`: chromosome label interpolated into `mapFile`
    - `mapFile`: genetic-map file template, expanded with %(CHR)s

    Returns:
    - `admixedClassPre`: classification of admixed samples before hmm filter
    - `admixedClass`: classification of admixed samples after hmm filter
    - `p`: smoother probabilities
    - subjects of the admixed (last) file, snpLocations, snpNames
    """
    snpLocations = []      # per-window physical locations
    snpNames = []          # per-window snp ids
    ancestralSuccess = []  # cross-validation success in ancestral populations
    admixedClass = []      # per-window classification of test subjects
    files = fileReader.concurrentFileReader(*fileNames, key=1)
    subjects = files.next()[0]
    nTrain = np.sum(map(len, subjects[:-1]))  # samples in training set
    nTest = len(subjects[-1])
    labelsTrain = sum([[i] * len(sub)
                       for i, sub in enumerate(subjects[:-1])], [])
    vals = np.zeros((nTrain + nTest, win_size))  # per-window genotype buffer
    while True:
        rsIds = []
        pos = []
        nRead = 0  # snps actually read into this window
        for i, ([snpName, snpLocation], snps) in enumerate(files):
            pos.append(float(snpLocation))
            rsIds.append(snpName)
            vals[:, i] = fileReader.nucleotides2Haplotypes(sum(snps, []))
            nRead = i + 1
            if nRead == win_size:
                break
        if nRead == 0:
            # BUG FIX: when the input is exhausted exactly at a window
            # boundary the for-loop body never runs; the old code then reused
            # the stale loop index `i` (unbound for an empty input) and
            # re-classified the previous window forever.
            break
        snpLocations.append(pos)
        snpNames.append(rsIds)
        ancestral, admixed = classifier(vals[:nTrain, :nRead], labelsTrain,
                                        vals[-nTest:, :nRead])
        ancestralSuccess.append(ancestral)
        admixedClass.append(admixed)
        if nRead < win_size:  # partial final window -> input exhausted
            break
    admixedClassPre = np.array(admixedClass)
    # Convert physical positions to genetic-map coordinates for the smoother.
    gm = popgen.geneticMap(mapFile % locals())
    mapLocations = gm.pos2gm(np.hstack(snpLocations))
    admixedClass, p = smoother(mapLocations, ancestralSuccess, admixedClassPre)
    return admixedClassPre, admixedClass, p, subjects[-1], snpLocations, snpNames
def fstFile(file1, file2, isNorm=True):
    """Calculates fst based on data stored in two tab delimited files.

    Each file must contain the following columns: snp names; snp
    position; snp values.

    Arguments:
    - `file1`, `file2`: input file names, one population each
    - `isNorm`: passed through to fst()

    Returns the value of fst() over the two populations' haplotypes.
    """
    import fileReader
    files = fileReader.concurrentFileReader(file1, file2, key=1)
    subjects = files.next()[0]
    vals = []
    for ([snpName, snpLocation], snps) in files:
        # BUG FIX: qualify the call with the locally imported module; the
        # bare name would be a NameError unless the enclosing module happens
        # to re-export nucleotides2Haplotypes (not visible here — confirm).
        vals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))
    # Use builtin float: np.float was only an alias for it and has been
    # removed from modern NumPy (1.24+).
    vals = np.asarray(vals, float)
    nSamples = len(subjects[0])
    return fst(vals[:, :nSamples], vals[:, nSamples:], isNorm)
def readFiles(files, fileType='beagle', chrom=None):
    """Read genotype files in 'beagle' or 'tped' format.

    Arguments:
    - `files`: list of file names, one per population
    - `fileType`: either 'beagle' or 'tped'
    - `chrom`: (tped only) if given, keep only SNPs on this chromosome

    Returns (pops, nPops, subjects, nSNPs, snpPos, snpNames).
    Exits the process with status 1 on an unknown `fileType`.
    """
    nFiles = len(files)
    if fileType == 'beagle':
        files = fileReader.concurrentFileReader(*files)
        subjects = files.next()[0]  # first yield is the subject header
    elif fileType == 'tped':
        # Subject ids come from the companion .tfam files; each subject
        # contributes an '_a' and a '_b' haplotype.
        tfams = [f.replace('.tped', '.tfam') for f in files]
        tfams = [fileReader.openfile(f) for f in tfams]
        subjects = []
        for f in tfams:
            subs = [[l.split(None, 1)[0] + '_a', l.split(None, 1)[0] + '_b']
                    for l in f]
            subjects.append(np.asarray(sum(subs, [])))
        files = fileReader.concurrentFileReader(*files, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    else:
        # BUG FIX: non-zero exit status on error (old code exited with 0),
        # plus a terminating newline on the message.
        sys.stderr.write('ERROR: Filetype has to be either beagle or tped\n')
        sys.exit(1)
    snpNames = []
    snpPos = []
    pops = [[] for i in range(nFiles)]
    for s, l in files:
        if fileType == 'tped':
            if chrom is not None and chrom != s[0]:
                continue  # SNP lies on a different chromosome
            s = [s[1], s[3]]  # normalize to [snp name, physical position]
        snpNames.append(s[0])
        snpPos.append(int(s[1]))
        for i in range(nFiles):
            pops[i].append(l[i])
    nSNPs = len(snpNames)
    # BUG FIX (future-proofing): a list comprehension is identical to map()
    # under Python 2 but, unlike map() on Python 3, does not hand back a
    # one-shot iterator that the nPops comprehension would exhaust.
    pops = [np.asarray(p) for p in pops]
    nPops = [p.shape[1] for p in pops]
    return pops, nPops, subjects, nSNPs, snpPos, snpNames
def classify(fileNames, smoother, classifier=regionClassifier.SVMpymvpa(C),
             win_size=100, CHR='chr1',
             mapFile='data/hapmap2/genetic_map_%(CHR)s_b36.txt'):
    """Deconvolves ancestry in last file based on ancestral populations in
    first files.

    Arguments:
    - `fileNames`: list of fileNames; the last file holds the admixed samples
    - `smoother`: callable applied to the raw per-window classifications
    - `classifier`: window classifier (default: SVM via pymvpa)
    - `win_size`: number of snps in each window (default=100)
    - `CHR`: chromosome label interpolated into `mapFile`
    - `mapFile`: genetic-map file template, expanded with %(CHR)s

    Returns:
    - `admixedClassPre`: classification of admixed samples before hmm filter
    - `admixedClass`: classification of admixed samples after hmm filter
    - `p`: smoother probabilities
    - subjects of the admixed (last) file, snpLocations, snpNames
    """
    snpLocations = []      # per-window physical locations
    snpNames = []          # per-window snp ids
    ancestralSuccess = []  # success of ancestral classification
    admixedClass = []      # classification of test subjects
    files = fileReader.concurrentFileReader(*fileNames, key=1)
    subjects = files.next()[0]
    nTrain = np.sum(map(len, subjects[:-1]))  # number of training samples
    nTest = len(subjects[-1])
    labelsTrain = sum([[i] * len(sub)
                       for i, sub in enumerate(subjects[:-1])], [])
    vals = np.zeros((nTrain + nTest, win_size))  # window genotype buffer
    while True:
        rsIds = []
        pos = []
        nRead = 0  # snps read into the current window
        for i, ([snpName, snpLocation], snps) in enumerate(files):
            pos.append(float(snpLocation))
            rsIds.append(snpName)
            vals[:, i] = fileReader.nucleotides2Haplotypes(sum(snps, []))
            nRead = i + 1
            if nRead == win_size:
                break
        if nRead == 0:
            # BUG FIX: if the snp stream ended exactly on a window boundary,
            # the original code fell through with a stale `i` (unbound for an
            # empty input) and looped forever re-classifying the old window.
            break
        snpLocations.append(pos)
        snpNames.append(rsIds)
        ancestral, admixed = classifier(vals[:nTrain, :nRead], labelsTrain,
                                        vals[-nTest:, :nRead])
        ancestralSuccess.append(ancestral)
        admixedClass.append(admixed)
        if nRead < win_size:  # last (partial) window processed
            break
    admixedClassPre = np.array(admixedClass)
    # Map physical positions onto the genetic map for the smoother.
    gm = popgen.geneticMap(mapFile % locals())
    mapLocations = gm.pos2gm(np.hstack(snpLocations))
    admixedClass, p = smoother(mapLocations, ancestralSuccess, admixedClassPre)
    return admixedClassPre, admixedClass, p, subjects[-1], snpLocations, snpNames
import pylab, fileReader, sys, numpy as np
from scipy.linalg import svd

# Read all input files in lock-step; the first yield is the subject header.
files = fileReader.concurrentFileReader(*sys.argv[1:])
subjects = files.next()[0]

snpLabels = []     # snp ids from the input files
snpLocations = []  # physical locations
snpVals = []       # one haplotype-coded row per snp
for snpInfo, snps in files:
    snpLabels.append(snpInfo[0])
    snpLocations.append(float(snpInfo[1]))
    snpVals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))

snps = np.asarray(snpVals)
u, s, vt = svd(snps, 0)
nPops = map(len, subjects)

# One copper-scale colour per input file; the last file is highlighted red.
colors = pylab.cm.copper(np.linspace(0, 1, len(subjects)))
colors[-1, :] = [1, 0, 0, 1]

start = 0
for fileIdx, group in enumerate(subjects):
    stop = start + len(group)
    pylab.plot(vt[1, start:stop], vt[2, start:stop], '.',
               markersize=10, color=colors[fileIdx, :])
    start = stop
pylab.xlabel('PC1')
pylab.ylabel('PC2')
pylab.legend([l.split('.')[0] for l in sys.argv[1:]])
pylab.show()
# Create dictionary from sample name to population
popDict = dict([(l.split(',')[2], l.split(',')[7])
                for l in open(POPLABEL_FILE)])
for ind, pop in (l.strip().split('\t')[1:] for l in open(QATARLABEL_FILE)):
    popDict[ind] = pop
popDict.pop('Sample ID')  # discard the header-row entry

###############################################
# Perform PCA
###############################################
# Read monolithic file containing all populations
snpLabels = []     # snp labels from the input files
snpLocations = []  # physical locations
snpVals = []
for i in range(1, 23):  # autosomes only
    CHR = 'chr%i' % i   # NOTE: name is referenced via %locals() below
    files = fileReader.concurrentFileReader(HGDP_FILE % locals(),
                                            QATARFILES % locals())
    subjects = files.next()[0]
    for [snpName, snpLocation], snps in files:
        snpLabels.append(snpName)
        snpLocations.append(float(snpLocation))
        snpVals.append(fileReader.nucleotides2SNPs(sum(snps, [])))

subjects = np.hstack(subjects)
snpVals = np.asarray(snpVals)
# Strip the '_a'/'_b' haplotype suffix; one label per individual.
popLabels = [popDict.get(s[:-2]) for s in subjects[::2]]
nSNPs, nSamples = snpVals.shape

# Normalize markers: mean-center each snp, then scale it to unit length.
snpVals = snpVals - np.tile(snpVals.mean(1), (nSamples, 1)).T
for i in range(nSNPs):
    snpVals[i, :] = snpVals[i, :] / np.sqrt(np.dot(snpVals[i, :],
                                                   snpVals[i, :]))

# Compute SVD and store results
U, S, Vt = svd(snpVals, 0)
import pylab, fileReader, sys, numpy as np
from scipy.linalg import svd

# Open every file named on the command line in lock-step.
files = fileReader.concurrentFileReader(*sys.argv[1:])
subjects = files.next()[0]  # subject header row

snpLabels = []     # stores snp labels from in files
snpLocations = []  # stores physical location from files
snpVals = []
for i, (snpInfo, snps) in enumerate(files):
    snpLabels.append(snpInfo[0])
    snpLocations.append(float(snpInfo[1]))
    snpVals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))

snps = np.asarray(snpVals)
u, s, vt = svd(snps, 0)
nPops = map(len, subjects)

# Colour each input file along the copper map, last file forced to red.
colors = pylab.cm.copper(np.linspace(0, 1, len(subjects)))
colors[-1, :] = [1, 0, 0, 1]

lo = 0
for i, sub in enumerate(subjects):
    hi = lo + len(sub)
    pylab.plot(vt[1, lo:hi], vt[2, lo:hi], '.',
               markersize=10, color=colors[i, :])
    lo = hi