def checkRef(name):
    """Compare every SNP's ref/alt allele with the hg19 base at its position.

    Loads the tables ``<name>RefT`` and ``<name>AltT`` through ``glob.json``
    and the hg19 genome through worldbase.  Each key of the ref table is
    split on ``'pos'``: the part before it indexes the genome and the part
    after it is the position (decremented by one before indexing, so it is
    presumably 1-based -- confirm against the code that builds the keys).

    SNPs whose hg19 base equals the ref allele are left alone, SNPs whose
    hg19 base equals the alt allele are recorded as flips, and all others
    are recorded as errors.  Both lists are written with ``glob.dump`` to
    ``<name>flips`` and ``<name>errors``.

    Returns:
        ``[flip, errors]`` -- two lists of SNP keys.
    """
    reffile = name + 'RefT'
    altfile = name + 'AltT'

    genome = worldbase.Bio.Seq.Genome.HUMAN.hg19(download=True)
    print("Loaded hg19")
    ref = glob.json(reffile)
    print("Loaded Ref")
    alt = glob.json(altfile)
    print("Loaded Alt")

    flip = []
    errors = []
    for snppos in ref.keys():
        print(snppos + '\t' + name)
        parts = snppos.split('pos')
        chrom_seq = genome[parts[0]]
        genome_base = str(chrom_seq[int(parts[1]) - 1]).upper()
        ref_base = ref[snppos].upper()
        alt_base = alt[snppos].upper()

        if genome_base == ref_base:
            # Already oriented like the reference genome.
            continue
        if genome_base == alt_base:
            flip.append(snppos)
            continue
        print("Error: Neither Ref nor Alt of SNP corresponds to hg19 sequence")
        errors.append(snppos)

    glob.dump(flip, name + 'flips')
    glob.dump(errors, name + 'errors')
    return [flip, errors]
def main(argv):
    """Run the processing steps selected by the command-line options.

    Options are parsed with ``getopt`` and handled in the order given:

    -a REPORT  extract the array SNPs for REPORT, check them against the
               reference, flip/remove SNPs accordingly, filter the
               frequencies and write the tab-separated table.
    -g         call ``processgenotypes()``.
    -h         call ``processhapmap()`` (this is not a help flag).
    -c NAMES   combine the genotype files NAMES (``names1KG`` when NAMES is
               the empty string) using the SNPs loaded from 'Array251Msnps'.
    -i         append 'hapmap' to the module-level ``names`` list.

    Args:
        argv: argument list without the program name.

    Raises:
        getopt.GetoptError: on an unknown option or a missing option argument.
    """
    # NOTE(review): the long options --report/--genonames are accepted by the
    # parser but no branch below handles them; confirm whether they were meant
    # as aliases for -a / -c.
    opts, _unused_args = getopt.getopt(
        argv, "a:ghc:i", ["report=", "genonames="])

    for opt, arg in opts:
        if opt == '-a':
            report = arg
            print("processing array, getting snps {0}".format(report))
            getarraysnps(report)
            flips, errors = parsegenotypes.checkRef(report)
            # flipArray needs the error list as well: it deletes those SNPs
            # from the frequency table.  Omitting it raised a TypeError.
            parsegenotypes.flipArray(report, flips, errors)
            parsegenotypes.filterzeros(report)
            parsegenotypes.printtabarray(report)
        elif opt == '-g':
            processgenotypes()
        elif opt == '-h':
            processhapmap()
        elif opt == '-c':
            snps = glob.json('Array251Msnps')
            genofiles = names1KG if arg == '' else arg
            combinegenos(genofiles, snps)
        elif opt == "-i":
            # list.append returns None, so binding its result was useless;
            # only the in-place mutation of ``names`` has an effect.
            names.append('hapmap')
def getpoollines(genofile, pool, out="poolgenotype"):
    """Write the genotype columns of the pooled individuals to ``out``.

    Input layout, as read by this function: the first line of ``genofile``
    is ``<label>\\t<name1>,<name2>,...`` and every following line is
    ``<snp>\\t<g1>,<g2>,...`` with integer genotype values.

    Output layout: a header line with the pooled names joined by commas
    (no leading label), then one line ``<snp>,<g...>`` per kept SNP.  A SNP
    is kept when at least one genotype on its line -- over all individuals
    in the file, not only the pooled ones -- is non-zero.

    Args:
        genofile: path of the genotype table to read.
        pool: name passed to ``glob.json`` to load the list of individual
            names to keep.
        out: path of the file to write.

    Raises:
        ValueError: if a pooled name is missing from the header, or a
            genotype value is not an integer.
    """
    pool_names = glob.json(pool)

    with open(genofile) as source:
        lines = source.readlines()

    all_names = [n.strip('\n') for n in lines[0].split('\t')[1].split(',')]
    print(pool_names)
    poolinds = [all_names.index(n) for n in pool_names]

    # The output is opened only after every input was read successfully, so
    # a failure above does not leave a truncated output file behind.
    with open(out, 'w') as output:
        output.write(','.join(all_names[i] for i in poolinds) + '\n')
        for line in lines[1:]:
            if not line.strip():
                continue  # tolerate blank (e.g. trailing) lines
            fields = line.split('\t')
            genos = [g.strip('\n') for g in fields[1].split(',')]
            # Convert every value so malformed rows are always reported,
            # regardless of where the first non-zero value sits.
            values = [int(g) for g in genos]
            if not any(v != 0 for v in values):
                continue
            # Emit the columns in the same order as the header.  Walking the
            # columns in file order misaligned header and data whenever the
            # pool list was ordered differently from the file.
            picked = ','.join(genos[i] for i in poolinds)
            output.write(fields[0] + ',' + picked + '\n')
def printtabarray(arrayname):
    """Write the SNP frequency table as tab-separated text.

    Loads ``<arrayname>freq`` through ``glob.json`` and writes one line
    ``<snp>\\t<frequency>`` per entry to ``<arrayname>Rinput``.  According
    to the original author the output is analyzed by R to find cell line
    frequencies.

    Args:
        arrayname: prefix shared by the input and output file names.
    """
    # Load first so a missing/invalid input does not truncate the output.
    freq = glob.json(arrayname + 'freq')
    # The context manager guarantees the file is flushed and closed; the
    # handle was previously left open.
    with open(arrayname + 'Rinput', 'w') as output:
        for snp, value in freq.items():
            output.write(snp + '\t' + str(value) + '\n')
def filterzeros(arrayname):
    """Remove SNPs whose stored frequency is 0 or NaN.

    The table ``<arrayname>freq`` is loaded with ``glob.json``, pruned in
    memory and written back to the same name with ``glob.dump``.
    """
    freqfile = arrayname + 'freq'
    freq = glob.json(freqfile)

    # Decide what to drop first, then delete, so the table is never
    # modified while it is being walked.
    unwanted = [snp for snp in freq.keys()
                if freq[snp] == 0 or math.isnan(freq[snp])]
    for snp in unwanted:
        del freq[snp]

    glob.dump(freq, freqfile)
def parsehapmapchrom(chrom):
    """Extract alt-allele counts for selected cell lines from a HapMap file.

    Reads the space-separated file ``../genotypes/hapmapchr<chrom>`` (per
    the original author, downloaded from the hapmap3 site).  Its header must
    contain the columns ``rs#`` and ``alleles``; the ``alleles`` field is
    ``<ref>/<alt>``.  The cell line IDs looked for are hard-coded below;
    those absent from the header are ignored.

    For every SNP whose rsID is present in the table loaded from
    'rsid2poshash', a line ``<snppos>\\t<count>,<count>,...,`` (note the
    trailing comma) is written to ``<hapmapfile>genotype``, where each count
    is the number of characters in that person's genotype equal to the alt
    allele.

    Args:
        chrom: chromosome identifier appended to the input file name.

    Returns:
        ``[refhash, althash]`` -- dicts mapping SNP position to the
        upper-cased ref and alt allele of every SNP written.
    """
    hapmapfile = '../genotypes/hapmapchr' + str(chrom)

    if not os.path.exists('rsid2poshash'):
        # presumably makerhash() writes 'rsid2poshash' -- TODO confirm.
        makerhash()
    # Load the table in both cases; previously it stayed undefined after
    # makerhash() and the first lookup failed with a NameError.
    rshash = glob.json('rsid2poshash')

    people = ['NA19140', 'NA19154', 'NA19173', 'NA19203',
              'NA19206', 'NA19211', 'NA19222']

    with open(hapmapfile) as f:
        lines = f.readlines()

    # Strip the line ending so a column name in last position still matches.
    header = lines[0].rstrip('\r\n').split(' ')
    rsidi = header.index('rs#')
    snpi = header.index('alleles')
    peoplewithgenos = [p for p in people if p in header]
    print(peoplewithgenos)
    peoplei = [header.index(p) for p in peoplewithgenos]

    refhash = {}
    althash = {}
    with open(hapmapfile + 'genotype', 'w') as out:
        for line in lines[1:]:
            if not line.strip():
                continue  # tolerate blank (e.g. trailing) lines
            fields = line.rstrip('\r\n').split(' ')
            try:
                snppos = rshash[fields[rsidi]]
            except KeyError:
                # SNP without a known position: deliberately skipped.
                continue

            alleles = fields[snpi].split('/')
            ref = alleles[0].upper()
            alt = alleles[1].upper()
            # Character-wise comparison: a multi-character alt allele never
            # matches and therefore counts 0, as before.
            genocount = [sum(1 for base in fields[i].upper() if base == alt)
                         for i in peoplei]

            counts = ''.join(str(c) + ',' for c in genocount)
            out.write(snppos + '\t' + counts + '\n')
            refhash[snppos] = ref
            althash[snppos] = alt

    return [refhash, althash]
def flipArray(arrayname, flip, error):
    """Flip and prune the stored array SNP frequencies.

    Loads the table ``<arrayname>freq`` with ``glob.json``, replaces the
    frequency of every SNP in ``flip`` by ``1 - frequency``, deletes every
    SNP in ``error`` and writes the table back with ``glob.dump``.  Per the
    original author's note, the frequency table is constructed by
    getarraysnps().

    Args:
        arrayname: prefix of the frequency table name.
        flip: iterable of SNP keys whose frequency must be inverted.
        error: iterable of SNP keys to remove from the table.

    Raises:
        Exception: whatever ``glob.json`` raises when the table cannot be
            loaded (re-raised after printing a message).
        KeyError: if a SNP in ``flip`` or ``error`` is not in the table.
    """
    freqfile = arrayname + 'freq'
    try:
        arrayfreq = glob.json(freqfile)
    except Exception:
        # The message used to be a bare string expression that was never
        # shown, and the failure resurfaced later as a NameError.  Report
        # it and let the real error propagate.
        print("No array snp frequency file")
        raise

    for snp in flip:
        arrayfreq[snp] = 1 - arrayfreq[snp]
    for snp in error:
        del arrayfreq[snp]

    glob.dump(arrayfreq, freqfile)
def filterSNPs(name):
    """Drop SNPs whose alt allele is the ref allele or its complement.

    Loads ``<name>Ref`` and ``<name>Alt`` with ``glob.json``.  A SNP is
    removed from both tables when its upper-cased alt allele equals either
    the upper-cased ref allele or ``glob.compl`` of it.  The pruned tables
    are written with ``glob.dump`` to ``<name>RefT`` and ``<name>AltT``.

    Returns:
        The list of SNP keys that were removed.
    """
    reffile = name + 'Ref'
    altfile = name + 'Alt'
    ref = glob.json(reffile, '')
    alt = glob.json(altfile, '')
    print("Loaded Ref {0}, Alt {1}".format(len(ref), len(alt)))

    complsnps = []
    # Iterate over a snapshot of the keys because entries are deleted below.
    for snppos in list(ref.keys()):
        ref_base = ref[snppos].upper()
        complement = glob.compl[ref_base]
        alt_base = alt[snppos].upper()
        if complement != alt_base and ref_base != alt_base:
            continue
        complsnps.append(snppos)
        del ref[snppos]
        del alt[snppos]

    print(len(ref))
    print(len(alt))
    glob.dump(ref, reffile + 'T')
    glob.dump(alt, altfile + 'T')
    return complsnps
# Run every processing step whose command-line flag was given.  The checks
# are independent, so several steps can be requested in one invocation.

# Array report: extract SNPs, orient them against the reference, then
# filter the frequencies and write the tab-separated table.
if args.a:
    print("processing array, getting snps {0}".format(args.a))
    report = args.a
    getarraysnps(report)
    flips, errors = parsegenotypes.checkRef(report)
    parsegenotypes.flipArray(report, flips, errors)
    parsegenotypes.filterzeros(report)
    parsegenotypes.printtabarray(report)

if args.g:
    processgenotypes()

if args.hapmap:
    processhapmap()

# Combine the 1KG genotype files using the array SNP list.
if args.init1KG:
    # Previously used SNP list: glob.json('25M1.1snps')
    snps = glob.json('MKReportbySNP1.txtsnps')
    # Previously used output name: 'Genos1kgArray25M'
    combinegenos(names1KG, snps, 'Genos1kgArrayOmni')

if args.inithapmap:
    snps = glob.json('Array25M1snps')
    combinegenos('hapmap', snps, 'hapmapGenosArray25M', 1)

# Extract the genotype columns of the individuals in pool1.
if args.pool:
    # Earlier invocations, kept for reference:
    #   getpoollines('intercomb', 'pool1', 'pool1genotype')         (new arrays)
    #   getpoollines('Genos1kgArray25M', 'pool1', 'pool1genotype')  (old arrays)
    getpoollines('Genos1kgArrayOmni', 'pool1', 'pool1genotypeOmni')