def __generate_test_data__(self, sam1, sam2, contig_positions, min_mapq):
    run_results = {
        "total_queries": 0,
        "passed_filter": 0,
        "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
        "min_mapq": min_mapq,
        "query_sam": sam1,
        "searched_sam": sam2,
        "contig_positions": contig_positions,
        "pickle": None,
    }

    contig_2_contig_dict = defaultdict(list)

    # CREATE SAM INDEX IF IT DOESN'T EXIST
    if not os.path.exists(sam2 + FileIndex.ext):
        FileIndex.create(sam2, lambda fh: SamLine(fh).qname, allow_multiple=True)
    fi = FileIndex(sam2, SamLine, allow_multiple=True)

    ma = MergeAssemblies()
    contig_starts = [int(l.strip()) for l in open(contig_positions, 'rU')]

    fq1_out = open("test_fq1.10k.fq", 'w')
    fq2_out = open("test_fq2.10k.fq", 'w')

    with open(sam1, 'rU') as qs:
        for count, q in enumerate(qs):
            if q.startswith("@"):  # skip header lines
                continue

            # SPLIT LINE AND IDENTIFY QUERY POSITION.
            q = q.strip().split("\t")
            query_pos = int(q[3])
            query_pos = ma.round_bp_pos(query_pos)  # this could be more sophisticated.
            q_id = q[0][:-1] + "2"   # mate id: swap the trailing read number for "2"
            q_mapq = int(q[4])       # and mapping quality.

            if count > 10000:
                break

            # SEARCH FOR QUERY AND PARSE RESULTS
            for s in fi[q_id]:
                fq1 = (q[0], q[9], q[10])
                fq2 = (s.qname, s.seq, s.qual)
                fq1_line = "@{}\n{}\n+\n{}\n".format(*fq1)
                fq2_line = "@{}\n{}\n+\n{}\n".format(*fq2)
                fq1_out.write(fq1_line)
                fq2_out.write(fq2_line)
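# The method above (and associate_contigs below) indexes the R2 SAM with
# FileIndex, keyed by read name, and expects a SamLine record class that
# exposes qname, seq, qual and (for the mapq filter) mapq.  That class is
# not defined in this section; the sketch below is an assumed, minimal
# version whose field layout simply follows the SAM column order -- the
# class name and the exact attribute set are assumptions, not part of this
# module.

class SamLine(object):
    __slots__ = ('qname', 'flag', 'rname', 'pos', 'mapq', 'seq', 'qual')

    def __init__(self, fh):
        fields = fh.readline().rstrip("\r\n").split("\t")
        self.qname = fields[0]       # read name (used as the index key)
        self.flag = int(fields[1])   # bitwise FLAG
        self.rname = fields[2]       # reference/contig name
        self.pos = int(fields[3])    # 1-based leftmost mapping position
        self.mapq = int(fields[4])   # mapping quality
        self.seq = fields[9]         # read sequence
        self.qual = fields[10]       # base qualities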
import sys
import os.path as op
sys.path.insert(0, op.join(op.dirname(__file__), ".."))
from fileindex import FileIndex


class FastQEntry(object):
    __slots__ = ("name", "seq", "l3", "qual", "fpos")

    def __init__(self, fh):
        self.name = fh.readline().rstrip("\r\n")
        self.seq = fh.readline().rstrip("\r\n")
        self.l3 = fh.readline().rstrip("\r\n")
        self.qual = fh.readline().rstrip("\r\n")


if __name__ == "__main__":
    f = "/usr/local/src/bowtie/bowtie-0.12.1/work/reads/s_1_sequence.txt"
    N = 100

    # if not op.exists(f + FileIndex.ext):
    FileIndex.create(f, lambda fh: FastQEntry(fh).name)
    fi = FileIndex(f, FastQEntry)

    print "getting %i keys..." % N
    for i, k in enumerate(fi.db.iterkeys(str)):
        print fi[k].seq
        if i == N:
            break
import sys
import os.path as op
sys.path.insert(0, op.join(op.dirname(__file__), ".."))
from fileindex import FileIndex


class SamLine(object):
    __slots__ = ('name', 'ref_loc', 'ref_seqid')

    def __init__(self, fh):
        line = fh.readline().split("\t") or [None]
        self.name = line[0]
        self.ref_seqid = line[2]
        self.ref_loc = int(line[3])
        # ... other sam format stuff omitted.


if __name__ == "__main__":
    f = '/usr/local/src/methylcode/emen/en-data/out/methylcoded.sam'
    if not op.exists(f + FileIndex.ext):
        FileIndex.create(f, lambda fh: SamLine(fh).name, allow_multiple=True)

    fi = FileIndex(f, SamLine, allow_multiple=True)
    print [(s.name, s.ref_seqid, s.ref_loc) for s in fi['23351265']]
def associate_contigs(self, bam1, sam2, min_mapq, run_ID, R1_starts=None):
    run_results = {
        "total_queries": 0,
        "passed_filter": 0,
        "failed_filter": 0,
        "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
        "min_mapq": min_mapq,
        "query_sam": bam1,
        "searched_sam": sam2,
        "pickle": None,
    }

    contig_2_contig_dict = defaultdict(list)

    # CREATE SAM INDEX IF IT DOESN'T EXIST
    if not os.path.exists(sam2 + FileIndex.ext):
        FileIndex.create(sam2, lambda fh: SamLine(fh).qname, allow_multiple=True)
    fi = FileIndex(sam2, SamLine, allow_multiple=True)

    ma = MergeAssemblies()
    R1_sam = pysam.Samfile(bam1, 'rb')

    path = os.path.split(bam1)[0]
    if R1_starts is None:
        R1_starts = os.path.join(path, "{}.R1.contig_start_pos.txt".format(run_ID))

    low_depth_R1_starts = os.path.join(path, "{}.R1.contig_start_pos.no_pass.txt".format(run_ID))
    low_depth_R1_starts = open(low_depth_R1_starts, 'w')

    all_R1s = set()

    # ITERATE OVER START POSITIONS
    for count, start in enumerate(open(R1_starts, 'rU')):
        query_pos = int(start.strip())
        all_R1s.add(query_pos)
        if query_pos == 0:
            continue

        # equivalent to: samtools view alignments/Trachs_Merged.1.fq.sorted.bam Trachs_Merged_stacks.R1:590-600
        reads = R1_sam.fetch("{}.R1".format(run_ID), query_pos - 5, query_pos + 5)
        reads = [r for r in reads]  # unpack iterator
        depth = len(reads)
        # if depth < min_depth:
        #     unassociated_R1s.update([query_pos])

        for q in reads:
            q_id = q.qname[:-1] + "2"   # mate id: swap the trailing read number for "2"
            q_mapq = q.mapq             # and mapping quality.

            # SEARCH FOR QUERY AND PARSE RESULTS
            for s in fi[q_id]:
                if (q_mapq > min_mapq) and (s.mapq > min_mapq):
                    hit_pos = ma.get_hit_pos(s)
                    hit_pos = ma.round_bp_pos(hit_pos)
                    contig_2_contig_dict[query_pos].append(hit_pos)

    run_results["passed_filter"] = len(contig_2_contig_dict)

    unassociated_R1s = all_R1s.difference(set(contig_2_contig_dict.keys()))
    run_results["failed_filter"] = len(unassociated_R1s)
    for p in unassociated_R1s:
        low_depth_R1_starts.write("{}\n".format(p))
    low_depth_R1_starts.close()

    # CREATE PICKLE OUTPUT FILE
    pkl_output_file_name = os.path.join(path, '{}.R1_to_R2_contig_associations.pkl'.format(run_ID))
    pkl_out = open(pkl_output_file_name, 'wb')
    pickle.dump(contig_2_contig_dict, pkl_out)
    pkl_out.close()

    # UPDATE RUN RESULTS AND GENERATE LOGFILE
    run_results["total_queries"] = count
    run_results['pickle'] = pkl_output_file_name
    self.__associate_contigs_log__(run_results, path)  # format logging results

    return 1
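# A hypothetical driver for associate_contigs().  The host class name
# (ContigAssociator), the file paths, and the run_ID are placeholders only;
# what the method actually requires is a coordinate-sorted, indexed BAM of
# R1 reads (so pysam.Samfile(...).fetch() can be used), an R2 SAM that
# FileIndex will index by read name, and optionally a text file of R1
# contig start positions (one integer per line).

if __name__ == "__main__":
    assembler = ContigAssociator()                   # assumed host class
    assembler.associate_contigs(
        bam1="alignments/run1.R1.sorted.bam",        # placeholder path
        sam2="alignments/run1.R2.sam",               # placeholder path
        min_mapq=20,
        run_ID="run1",
    )
    # Expected outputs next to the BAM: <run_ID>.R1_to_R2_contig_associations.pkl,
    # a logfile from __associate_contigs_log__(), and
    # <run_ID>.R1.contig_start_pos.no_pass.txt listing unassociated R1 starts.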