import glob
import gzip
import itertools
import sys


def makeclust(ID, datatype, WORK):
    """ build clusters from the two-tier usearch hit tables and write
    them to a gzipped .clust file """

    " load tier 2 hits (name, direction) into a dict with seeds as keys "
    Uin = open(WORK+"prefix/cat.u")
    Fseeds = {}
    for line in [line.split("\t") for line in Uin.readlines()]:
        if line[1] not in Fseeds:
            Fseeds[line[1]] = [(line[0], line[4])]
        else:
            Fseeds[line[1]].append((line[0], line[4]))
    Uin.close()

    " load tier 1 hits (name, direction) into a dict with seeds as keys "
    FS = glob.glob(WORK+"prefix/cat.u_*")
    Useeds = {}
    for f in FS:
        infile = open(f)
        for line in [line.split("\t") for line in infile.readlines()]:
            if line[1] not in Useeds:
                Useeds[line[1]] = [(line[0], line[4])]
            else:
                Useeds[line[1]].append((line[0], line[4]))
        infile.close()

    " merge Fseeds and Useeds into one dictionary keyed on the Fseeds "
    D = {}
    for seed in Fseeds:
        ## direct matches to this seed
        Fhits = Useeds.get(seed)
        ## matches to the hits of this seed
        Mhits = []
        for hit in Fseeds[seed]:
            Mhits.append(hit)
            ugh = Useeds.get(hit[0])
            if ugh:
                if hit[1] == "-":
                    ## hit is reversed, so flip the orientation of its children
                    for child in ugh:
                        Mhits += [(child[0], flip(child[1]))]
                else:
                    Mhits += ugh
        if Fhits:
            D[(seed, 's')] = Fhits+Mhits
        else:
            D[(seed, 's')] = Mhits

    " load seeds of tier 2 into D with their tier 1 hits "
    f = open(WORK+"prefix/cat._tempU")
    for line in f.readlines():
        if ">" in line:
            name = line.strip()[1:]
            if (name, 's') not in D:
                if Useeds.get(name):
                    D[(name, 's')] = Useeds.get(name)
    f.close()

    " load .consens files into a dict of {name: sequence} "
    FS = glob.glob(WORK+"clust"+ID+"/cat.consens_*.gz")
    Seqs = {}
    for f in FS:
        with gzip.open(f) as ff:
            ## read the fasta file two lines (one record) at a time
            k = itertools.izip(*[iter(ff)]*2)
            while 1:
                try:
                    a = k.next()
                except StopIteration:
                    break
                Seqs[a[0].strip()] = a[1].strip()

    " write clust file "
    outfile = gzip.open(WORK+"prefix/cat.clust_.gz", 'w')
    for i in D:
        seed = i[0]
        outfile.write(">"+seed+'\n'+Seqs[">"+seed].upper()+'\n')
        ## track written names so reverse-strand hits cannot be written twice
        written = set([seed])
        for m in D[i]:
            if m[0] not in written:
                written.add(m[0])
                if m[1] == "-":
                    ## reverse hit: write its reverse complement
                    outfile.write(">"+m[0]+'\n'+comp(Seqs[">"+m[0]].upper())[::-1]+'\n')
                else:
                    outfile.write(">"+m[0]+'\n'+Seqs[">"+m[0]].upper()+'\n')
        outfile.write("//\n")
    outfile.close()
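## makeclust() depends on two helpers defined elsewhere: comp(), used to
## reverse-complement sequences (the caller reverses with [::-1]), and
## flip(), used to toggle the +/- orientation flags read from the .u hit
## tables. The sketches below are minimal assumptions about their
## contracts, not the originals; a production comp() would also map IUPAC
## ambiguity codes (R<->Y, K<->M, etc.), which consensus sequences can
## contain.

import string

def comp(seq):
    " complement a DNA string (sketch: plain ACGTN only) "
    return seq.translate(string.maketrans("ACGTN", "TGCAN"))

def flip(x):
    " toggle a +/- orientation flag (assumed helper) "
    return "-" if x == "+" else "+"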
def rawedit(WORK, infile, CUT, pN, trimkeep, strict, Q, datatype):
    """ two functions:
    (1) replaces low quality base calls with Ns, and
    (2) checks for adapter sequence if strict is set to 1 or 2 """

    if "," in CUT:
        CUT1, CUT2 = CUT.split(',')
    else:
        CUT1 = CUT2 = CUT

    if ".gz" in infile:
        f = gzip.open(infile, 'r')
    else:
        f = open(infile, 'r')

    " remove name suffix "
    n = str(infile.split('/')[-1]).replace("_R1.", ".")
    while n.split(".")[-1] in ["fastq", "fastQ", "gz", "fq", "FastQ"]:
        n = n.replace('.'+n.split(".")[-1], "")

    " read infile 4 lines (one fastq record) at a time; set up counters "
    k = itertools.izip(*[iter(f)]*4)
    writing_r = []
    writing_c = []
    orig = keep = keepcut = 0
    handle = WORK+'edits/'+str(n)+".edit"

    while 1:
        try:
            d = k.next()
        except StopIteration:
            break
        orig += 1
        SS = d[1].strip()

        " convert the quality string to phred scores "
        ph = map(ord, d[3].strip('\n'))
        offset = int(Q)
        phred = map(lambda x: x-offset, ph)

        seq = ["N"]*len(phred)
        for base in range(len(phred)):
            if base >= len(CUT1):          ## don't quality check cut site
                if phred[base] >= 20:      ## quality threshold
                    try:
                        seq[base] = SS[base]
                    except IndexError:
                        pass
                else:
                    seq[base] = "N"
            else:
                ## restore the cut site itself
                if unambar(CUT1):
                    seq[base] = unambar(CUT1)[0][base]
                else:
                    seq[base] = CUT1[base]

        " flush to file every 5000 reads "
        if not orig % 5000:
            with open(handle, 'a') as outfile:
                " full length reads, plus fragment reads if trimkeep "
                outfile.write("".join(writing_r))
                if trimkeep:
                    outfile.write("".join(writing_c))
            writing_r = []
            writing_c = []

        s = "".join(seq)
        wheretocut1 = None
        if strict:
            wheretocut1 = Afilter(comp(CUT2)[::-1], s, strict)
            s = s[:wheretocut1]

        if datatype == 'merged':
            " remove extra forward base so forwards match reverse length "
            s = s[:-1]

        if s.count("N") <= pN:                ## max allowed Ns
            if len(s) >= max(32, trimkeep):   ## if trimmed, must be minlen long
                if wheretocut1:               ## read was adapter-trimmed
                    writing_c.append(">"+n+"_"+str(keepcut)+"_c1"+"\n"+s+"\n")
                    keepcut += 1
                else:
                    writing_r.append(">"+n+"_"+str(keep)+"_r1"+"\n"+s+"\n")
                    keep += 1

    " flush any remaining reads "
    with open(handle, 'a') as outfile:
        outfile.write("".join(writing_r))
        if trimkeep:
            outfile.write("".join(writing_c))
    writing_r = []
    writing_c = []

    f.close()
    sys.stderr.write(".")
    if not trimkeep:
        keepcut = 0
    return [handle.split("/")[-1].replace(".edit", ""),
            str(orig), str(keep), str(keepcut)]
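## rawedit() above calls two helpers defined elsewhere: Afilter(), which
## returns the position at which a read should be truncated when adapter
## sequence is detected (not sketched here, since its matching rules are
## not shown in this section), and unambar(), whose contract can be
## inferred from the call sites: it returns a list of concrete resolutions
## of an ambiguous cut site, or a falsy value when the site contains no
## ambiguity code. A minimal sketch under that assumption, handling a
## single ambiguity position:

AMBIGS = {"R": "AG", "Y": "CT", "S": "GC", "W": "AT",
          "K": "GT", "M": "AC"}

def unambar(cut):
    " expand the first IUPAC ambiguity code in a cut site (assumed contract) "
    hits = [i for i, c in enumerate(cut) if c in AMBIGS]
    if not hits:
        return False
    i = hits[0]
    return [cut[:i]+b+cut[i+1:] for b in AMBIGS[cut[i]]]

## e.g. unambar("GCWGC") -> ["GCAGC", "GCTGC"], while unambar("CTGCAG")
## -> False, matching the "seq[base] = unambar(CUT1)[0][base]" call above.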
## second rawedit() variant, presumably from a separate module since it
## redefines the name: it takes no datatype argument and adds detection of
## adapter read-through (overshoot) on merged reads.
def rawedit(WORK, infile, CUT, pN, trimkeep, strict, Q):
    """ two functions:
    (1) replaces low quality base calls with Ns, and
    (2) checks for adapter sequence if strict is set to 1 or 2 """

    if "," in CUT:
        CUT1, CUT2 = CUT.split(',')
    else:
        CUT1 = CUT2 = CUT

    if ".gz" in infile:
        f = gzip.open(infile, 'r')
    else:
        f = open(infile, 'r')

    " remove name suffix "
    n = str(infile.split('/')[-1]).replace("_R1.", ".")
    while n.split(".")[-1] in ["fastq", "fastQ", "gz", "fq", "FastQ"]:
        n = n.replace('.'+n.split(".")[-1], "")

    " read infile 4 lines at a time, set up counters and lists "
    k = itertools.izip(*[iter(f)]*4)
    writing_r = []
    orig = keep = keepcut = 0
    handle = WORK+'edits/'+str(n)+".edit"

    " test the first 1000 reads for extra bases on the right end of reads "
    rightend = []
    while len(rightend) < 1000:
        try:
            d = k.next()
        except StopIteration:
            break
        s = d[1].strip()

        " cutters "
        find1 = CUT1
        find2 = comp(CUT2)[::-1]

        " are cutters found on both ends? A correct merge "
        a = s[:len(find1)]
        b = s[-len(find2)-2:]                  ## w/ wiggle room
        if (find1 in a) and (find2 in b):
            xtra = s.rindex(find2)+len(find2)
            rightend.append(len(s)-xtra)

    " find most common right-end offset; large values are treated as noise "
    if rightend:
        a = most_common(rightend)
        if a > 3:
            Roffset = 0
        else:
            Roffset = a
    else:
        Roffset = 0

    " reset iterator to the start of the file "
    if ".gz" in infile:
        f = gzip.open(infile, 'r')
    else:
        f = open(infile, 'r')
    k = itertools.izip(*[iter(f)]*4)

    " iterate over each read "
    while 1:
        try:
            d = k.next()
        except StopIteration:
            break
        orig += 1
        SS = d[1].strip()

        " apply Phred Q filter "
        ph = map(ord, d[3].strip('\n'))
        offset = int(Q)
        phred = map(lambda x: x-offset, ph)
        seq = ["N"]*len(phred)
        for base in range(len(phred)):
            " don't quality check cut sites "
            if (base >= len(CUT1)) and (base < len(phred)-len(CUT2)):
                if phred[base] >= 20:
                    try:
                        seq[base] = SS[base]
                    except IndexError:
                        pass
                else:
                    seq[base] = "N"
            else:
                try:
                    seq[base] = SS[base]
                except IndexError:
                    pass

        " flush to file every 5000 reads "
        if not orig % 5000:
            with open(handle, 'a') as outfile:
                outfile.write("".join(writing_r))
            writing_r = []

        s = "".join(seq)
        wheretocut = [None, None, None]

        " filter for Ns "
        if s.count("N") <= pN:
            " apply filter for adapters "
            find1 = CUT1
            find2 = comp(CUT2)[::-1]
            if "trim" in d[0]:
                " filters for non-merged, trimmed reads from s2 "
                if not ((find1 in s[:len(find1)]) or (find2 in s[len(find2)-2:])):
                    " CUT1 rarely missing, CUT2 sometimes missing "
                    s = s[:-len(CUT2)-Roffset]
            else:
                " merged reads: are cutters found on both ends? A correct merge "
                a = s[:len(find1)]
                b = s[-len(find2)-2:]          ## w/ wiggle room
                if (find1 in a) and (find2 in b):
                    " end of read2 found; trim only the right-end offset "
                    wheretocut = [None, len(s)-Roffset, 'complete']
                else:
                    " look for CUT2 from the right side "
                    if find2 in s[len(s)/2:]:  ## check that this is a good general number...
                        a = s.rindex(find2)+len(find2)
                        wheretocut = [None, a, 'find2 in s']
                    else:
                        " couldn't find cut2, maybe it has an error; look for adapter "
                        if 'AGATCG' in s:
                            a = s.rindex('AGATCG')-len(CUT2)
                            wheretocut = [None, a, 'AGATCG in s']
                        elif "TCGGAAG" in s:
                            a = s.rindex('TCGGAAG')-len(CUT2)-3
                            wheretocut = [None, a, 'TCGGAAG in s']
                        else:
                            " no sign of overshoot to the right ---> "
                            wheretocut = [None, len(s)-Roffset, "None"]

                " look for CUT1 from the left side "
                if CUT1 in s:
                    wheretocut[0] = s.index(CUT1)
                else:
                    " exclude read "
                    wheretocut[0] = wheretocut[1]

            w1, w2, reason = wheretocut
            if len(s[w1:w2]) > trimkeep:
                s = s[w1:w2]
            else:
                s = ""

            if len(s) >= max(36, trimkeep):    ## if trimmed, must be minlen long
                writing_r.append(">"+n+"_"+str(keep)+"_r1"+"\n"+s+"\n")
                keep += 1

    " flush any remaining reads "
    with open(handle, 'a') as outfile:
        outfile.write("".join(writing_r))
    writing_r = []

    f.close()
    sys.stderr.write(".")
    if not trimkeep:
        keepcut = 0
    return [handle.split("/")[-1].replace(".edit", ""),
            str(orig), str(keep), str(keepcut)]
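## The merged-data rawedit() variant above calls most_common() to pick the
## modal right-end offset from its 1000-read test pass. A minimal sketch of
## the assumed helper, using collections.Counter:

from collections import Counter

def most_common(lst):
    " return the most frequent element of a list (assumed helper) "
    return Counter(lst).most_common(1)[0][0]

## e.g. most_common([2, 2, 0, 2, 3]) -> 2, so Roffset would be set to 2;
## offsets greater than 3 are treated as noise and reset to 0 above.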