def calc_ontarget_rate(tree, gene_info, input_fasta, sam_filename, output_filename=None):
    """Write a tab-separated per-read probe-hit report for a GMAP SAM file.

    For every SAM record that is mapped (``sID`` is not ``'*'``) one row is
    written with the columns ``read_id``, ``read_len``, ``is_fl``,
    ``num_probe``, ``num_base_overlap``, ``loci`` and ``genes``.  The
    ``is_fl`` column is always ``Y``.

    Args:
        tree: Probe lookup structure, passed through unchanged to
            ``get_probe_hit``.
        gene_info: Gene annotation data, passed through unchanged to
            ``get_probe_hit``.
        input_fasta: Path of the FASTA file holding the query sequences; it
            is only used to obtain each query's length.
        sam_filename: Path of the SAM file to read with
            ``BioReaders.GMAPSAMReader``.
        output_filename: Path of the report to write.  When ``None`` the
            report goes to ``sys.stdout``.
    """
    with open(input_fasta) as fasta_handle:
        query_len_dict = dict(
            (rec.id, len(rec.seq)) for rec in SeqIO.parse(fasta_handle, 'fasta'))

    to_stdout = output_filename is None
    out = sys.stdout if to_stdout else open(output_filename, 'w')
    try:
        out.write(
            "read_id\tread_len\tis_fl\tnum_probe\tnum_base_overlap\tloci\tgenes\n")
        reader = BioReaders.GMAPSAMReader(sam_filename, True,
                                          query_len_dict=query_len_dict)
        for r in reader:
            if r.sID == '*':
                # unmapped record: nothing to report
                continue
            num_probe, base_hit, genes_seen = get_probe_hit(tree, gene_info, r)
            out.write("{0}\t{1}\tY\t{2}\t{3}\t{4}:{5}-{6}\t{7}\n".format(
                r.qID, r.qLen, num_probe, base_hit, r.sID, r.sStart, r.sEnd,
                ",".join(genes_seen)))
    finally:
        # Only close a file we opened ourselves; closing sys.stdout would
        # break every later print in the calling process.
        if to_stdout:
            out.flush()
        else:
            out.close()
def calc_ontarget_rate(tree, gene_info, input_fasta, is_gtf, sam_or_gtf, output_filename=None):
    """Write a tab-separated per-read probe-hit report from a SAM or GFF file.

    One row is written per record with the columns ``read_id``,
    ``read_len``, ``num_probe``, ``num_base_overlap``, ``loci`` and
    ``genes``.

    Args:
        tree: Probe lookup structure, passed through unchanged to
            ``get_probe_hit``.
        gene_info: Gene annotation data, passed through unchanged to
            ``get_probe_hit``.
        input_fasta: Path of the query sequence file.  It is parsed as FASTA
            when the name ends in ``.fa``/``.fasta`` (case-insensitive) and
            as FASTQ otherwise; it is only used to obtain query lengths.
        is_gtf: When true, ``sam_or_gtf`` is read with ``collapseGFFReader``
            and ``read_len`` is reported as ``NA``; otherwise it is read
            with ``BioReaders.GMAPSAMReader`` and unmapped records
            (``sID == '*'``) are skipped.
        sam_or_gtf: Path of the alignment (SAM) or annotation (GFF) file.
        output_filename: Path of the report to write.  When ``None`` the
            report goes to ``sys.stdout``.
    """
    # Named seq_format rather than `type` so the builtin is not shadowed.
    if input_fasta.upper().endswith(('.FA', '.FASTA')):
        seq_format = 'fasta'
    else:
        seq_format = 'fastq'
    with open(input_fasta) as seq_handle:
        query_len_dict = dict(
            (rec.id, len(rec.seq)) for rec in SeqIO.parse(seq_handle, seq_format))

    to_stdout = output_filename is None
    # newline='' lets the csv module control line endings itself.
    out = sys.stdout if to_stdout else open(output_filename, 'w', newline='')
    try:
        fields = [
            'read_id', 'read_len', 'num_probe', 'num_base_overlap', 'loci',
            'genes'
        ]
        writer = DictWriter(out, fields, delimiter='\t')
        writer.writeheader()

        if is_gtf:
            for r in collapseGFFReader(sam_or_gtf):
                num_probe, base_hit, genes_seen = get_probe_hit(
                    tree, gene_info, r, is_gtf)
                writer.writerow({
                    'read_id': r.seqid,
                    'read_len': 'NA',
                    'num_probe': num_probe,
                    'num_base_overlap': base_hit,
                    'loci': "{0}:{1}-{2}".format(r.chr, r.start, r.end),
                    'genes': ",".join(genes_seen),
                })
        else:
            reader = BioReaders.GMAPSAMReader(sam_or_gtf, True,
                                              query_len_dict=query_len_dict)
            for r in reader:
                if r.sID == '*':
                    # unmapped record: nothing to report
                    continue
                num_probe, base_hit, genes_seen = get_probe_hit(
                    tree, gene_info, r, is_gtf)
                writer.writerow({
                    'read_id': r.qID,
                    'read_len': r.qLen,
                    'num_probe': num_probe,
                    'num_base_overlap': base_hit,
                    'loci': "{0}:{1}-{2}".format(r.sID, r.sStart, r.sEnd),
                    'genes': ",".join(genes_seen),
                })
    finally:
        # Only close a file we opened ourselves; closing sys.stdout would
        # break every later print in the calling process.
        if to_stdout:
            out.flush()
        else:
            out.close()
def summarize_GMAP_sam(input_fa_or_fq, input_sam):
    """Write a per-alignment summary table next to a GMAP SAM file.

    The table is written to ``<input_sam>.summary.txt`` with the columns
    ``id``, ``qLength``, ``qCoverage``, ``identity`` and ``unique``.  A row
    is written for every mapped record (``sID`` is not ``'*'``); ``unique``
    is ``N`` when the query ID occurs in more than one SAM record and ``Y``
    otherwise.

    Args:
        input_fa_or_fq: Path of the query sequence file; its format is
            decided by ``type_fa_or_fq`` and it is only used to obtain
            query lengths.
        input_sam: Path of the SAM file to summarize.  It is read twice:
            once to count records per query, once to write the rows.
    """
    with open(input_fa_or_fq) as seq_handle:
        d = dict((rec.id, len(rec.seq)) for rec in SeqIO.parse(
            seq_handle, type_fa_or_fq(input_fa_or_fq)))

    # First pass: how many SAM records does each query have?
    map_count = defaultdict(int)
    for r in BioReaders.GMAPSAMReader(input_sam, True):
        map_count[r.qID] += 1
    # A set keeps the per-record membership test O(1) in the second pass.
    multi = {qid for qid, count in map_count.items() if count > 1}

    summary_path = input_sam + '.summary.txt'
    with open(summary_path, 'w') as f:
        f.write("id\tqLength\tqCoverage\tidentity\tunique\n")
        for r in BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d):
            if r.sID == '*':
                # unmapped record: nothing to summarize
                continue
            uni = 'N' if r.qID in multi else 'Y'
            f.write("{0}\t{1}\t{2:.4f}\t{3:.4f}\t{4}\n".format(
                r.qID, d[r.qID], r.qCoverage, r.identity, uni))

    print("Output written to: {0}".format(summary_path), file=sys.stderr)
def err_correct(genome_file, sam_file, output_err_corrected_fasta, genome_dict=None):
    """Write genome-derived sequences for every mapped record of a SAM file.

    For each mapped record (``sID`` is not ``'*'``) a FASTA entry named
    after the query ID is written, whose sequence is whatever
    ``sp.consistute_genome_seq_from_exons`` returns for the record's
    reference, segments and strand.

    Args:
        genome_file: Path of the genome FASTA file.  Only read when
            ``genome_dict`` is ``None``.
        sam_file: Path of the SAM file to read with
            ``BioReaders.GMAPSAMReader``.
        output_err_corrected_fasta: Path of the FASTA file to write.
        genome_dict: Optional pre-loaded mapping of sequence name to record.
            When omitted it is built from ``genome_file``.
    """
    if genome_dict is None:
        genome_dict = {}
        with open(genome_file) as genome_handle:
            for rec in SeqIO.parse(genome_handle, 'fasta'):
                genome_dict[rec.name] = rec
        # print() with file= replaces the Python 2 `print >>` statement,
        # which raises TypeError under Python 3.
        print("done reading", genome_file, file=sys.stderr)

    with open(output_err_corrected_fasta, 'w') as f:
        for r in BioReaders.GMAPSAMReader(sam_file, True):
            if r.sID == '*':
                # unmapped record: no genome sequence to build
                continue
            seq = sp.consistute_genome_seq_from_exons(
                genome_dict, r.sID, r.segments, r.flag.strand)
            f.write(">{0}\n{1}\n".format(r.qID, seq))

    print("output written to", output_err_corrected_fasta, file=sys.stderr)
def categorize_aln_by_annotation(gene_annotation_file, input_fasta, input_sam, output_prefix, min_overlap_bp=200, min_query_overlap=.5, min_gene_overlap=.8):
    """Tag each alignment with the annotated gene it matches and report it.

    Every record of ``input_sam`` is classified by ``match_w_annotation``
    and grouped by the returned name.  Two files are written:

    * ``<output_prefix>.sam`` - the SAM header followed by each original
      record line with ``SN:Z`` (serial number within its group), ``RG:Z``
      (read group) and ``gn:Z`` (gene name) tags appended;
    * ``<output_prefix>.report.txt`` - a tab-separated table with the
      columns ``id``, ``read_group``, ``gene_name``, ``serial_number``,
      ``strand``, ``start`` and ``end``.

    The read group is derived from the prefix of the match name:
    ``novel-unannotated``, ``novel-antisense``, ``novel-partial``, ``poly``
    (for ``poly-``) or ``single`` for anything else.  Matches named
    ``novel-unannotated...`` are clustered per reference and strand with
    ``ClusterTree`` and every resulting region is written as its own group
    labelled ``novel-1``, ``novel-2``, ...

    Args:
        gene_annotation_file: Tab-separated annotation table with the
            columns ``#Replicon Name``, ``Replicon Accession``, ``Strand``,
            ``Start``, ``Stop`` and ``Locus tag``.  Rows whose
            ``#Replicon Name`` is not ``chr`` are reported on stderr and
            ignored.
        input_fasta: Path of the query FASTA file; only used to obtain
            query lengths.
        input_sam: Path of the SAM file to categorize.
        output_prefix: Prefix of the two output files.
        min_overlap_bp: Passed through to ``match_w_annotation``.
        min_query_overlap: Passed through to ``match_w_annotation``.
        min_gene_overlap: Passed through to ``match_w_annotation``.
    """
    # chr -> strand -> IntervalTree of annotated loci
    t = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})
    info = {}  # locus tag -> (start, stop, locus tag)
    with open(gene_annotation_file) as annotation_handle:
        for row in DictReader(annotation_handle, delimiter='\t'):
            if row['#Replicon Name'] != 'chr':
                print("Ignore", row, file=sys.stderr)
                continue
            start, stop = int(row['Start']), int(row['Stop'])
            locus_tag = row['Locus tag']
            info[locus_tag] = (start, stop, locus_tag)
            t[row['Replicon Accession']][row['Strand']].add(
                start, stop, locus_tag)

    with open(input_fasta) as fasta_handle:
        d = dict((rec.id, len(rec.seq))
                 for rec in SeqIO.parse(fasta_handle, 'fasta'))

    result = defaultdict(list)  # match name -> list of AMatch
    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        # ans is AMatch(name, strand, start, end, record)
        ans = match_w_annotation(t, r, info, min_overlap_bp,
                                 min_query_overlap, min_gene_overlap)
        result[ans.name].append(ans)

    # chr -> strand -> ClusterTree of novel-unannotated matches; each match
    # is inserted under its index in novel_list.
    novel_ct = defaultdict(lambda: {
        '+': ClusterTree(0, 0),
        '-': ClusterTree(0, 0)
    })
    novel_list = []

    sam_path = output_prefix + '.sam'
    report_path = output_prefix + '.report.txt'
    with open(sam_path, 'w') as f, open(report_path, 'w') as f1:
        f.write(reader.header)
        f1.write("id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")

        for k, v in result.items():
            if k.startswith('novel-unannotated'):
                # Written later, once they have been grouped by locus.
                for x in v:
                    novel_ct[x.record.sID][x.strand].insert(
                        x.start, x.end, len(novel_list))
                    novel_list.append(x)
                continue
            elif k.startswith('novel-antisense'):
                tagRG = 'novel-antisense'
            elif k.startswith('novel-partial'):
                tagRG = 'novel-partial'
            elif k.startswith('poly-'):
                tagRG = 'poly'
            else:
                tagRG = 'single'
            _write_match_group(f, f1, v, tagRG, k)

        # Now write the novel matches, one group per clustered region.
        novel_region_index = 1
        for strand_trees in novel_ct.values():
            for ct in strand_trees.values():
                for _start, _end, _indices in ct.getregions():
                    # Each region gets its own sequential label so that
                    # (gene name, serial number) identifies one record.
                    gn = 'novel-' + str(novel_region_index)
                    novel_region_index += 1
                    members = [novel_list[ind] for ind in _indices]
                    _write_match_group(f, f1, members, "novel-unannotated", gn)

    print("Output written to:", sam_path, file=sys.stderr)
    print("Output written to:", report_path, file=sys.stderr)


def _write_match_group(sam_out, report_out, matches, read_group, gene_name):
    """Sort one group of matches in place and write its SAM and report rows.

    ``matches`` must be non-empty.  Matches are ordered by (start, end),
    descending when the first match is on the '-' strand, and numbered from
    1 in that order.
    """
    matches.sort(key=lambda m: (m.start, m.end),
                 reverse=matches[0].strand == '-')
    for serial, m in enumerate(matches, 1):
        sam_out.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
            m.record.record_line, serial, read_group, gene_name))
        if m.strand == '+':
            first, second = m.start + 1, m.end
        else:
            # '-' strand: the reported start is the end and vice versa
            first, second = m.end, m.start + 1
        report_out.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
            m.record.qID, read_group, gene_name, serial, m.strand,
            first, second))