def test_process_bam_mismatches():
    tbam = os.path.join(DATA, "tmp.bam")
    bam = os.path.join(DATA, "ordered_umi.bam")
    if os.path.exists(tbam):
        os.remove(tbam)
    with captured_output() as (out, err):
        process_bam(bam, tbam, mismatches=1)
    assert os.path.exists(tbam)
    it = iter(out.getvalue().split("\n"))
    assert it.next().strip() == "1\t9\t10\t4\t2"
    assert it.next().strip() == "1\t11\t12\t2\t1"
    assert it.next().strip() == "1\t29\t30\t2\t1"
    bam_reader = Samfile(tbam)
    it = iter(bam_reader)
    r = it.next()
    assert r.pos == 4
    assert r.qname == "read8:UMI_ATTCAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read1:UMI_AAAAAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read4:UMI_AAAGGGGG"
    r = it.next()
    assert r.pos == 11
    assert r.qname == "read5:UMI_ATTTAGGG"
    bam_reader.close()
    os.remove(tbam)
def get_sorted_aligned_reads(args, header, sequence):
    if args.reference_hash and os.path.exists(args.reference_hash):
        print("Loading index...")
        ref_index = load_hash(args.reference_hash)
    else:
        print("Computing reference index...")
        ref_index = build_hashtable(sequence, args.kmer, args.stride)
        save_hash(*ref_index, file=args.reference_hash)
    print("Verifying hash...")
    for hash_, offset_ in islice(ref_index[0].iteritems(), 20):
        if not verify_hash(sequence, offset_, args.kmer, hash_):
            raise ValueError(
                'Index failed to verify: offset {} has mismatching hashes'.format(offset_))
    print("Aligning reads...")
    pair_iterator = read_paired_fasta(args.reads_file)
    sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw2_rg')
    sam_iterator = iter(sorted(sam_iterator, cmp=compsam))
    if args.out_bam:
        outfile = Samfile(args.out_bam, 'wb', header=SAM_HEADER(header, sequence))
        for read in sam_iterator:
            outfile.write(read)
        outfile.close()
        infile = Samfile(args.out_bam, 'rb')
        sam_iterator = infile
    return sam_iterator
def single_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The end-to-end mapping of SE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of SE data produced an error."
    else:
        to_process.append(sam_list[1])
    for single_sam in to_process:
        sam = Samfile(single_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                        if align_len != 0 and nm >= 0:
                            paired_perc_id = ((align_len - nm) / align_len) * 100
                            if paired_perc_id >= identity_threshold:
                                match.setdefault(query_name, set())
                                match[query_name].add(ref_name)
        sam.close()
    return match
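# Hedged sketch, not the original helper: single_end_sam_parsing above relies on a
# cigar_parsing(cigar) function that is not shown in this snippet. Assuming align_len
# counts reference-consuming bases (M/D/N/=/X) and query_aligned_len counts aligned
# query bases (M/I/=/X), a minimal version working on pysam CIGAR tuples could be:
def cigar_parsing(cigar_tuples):
    """Return (align_len, query_aligned_len) from pysam (operation, length) tuples."""
    align_len = 0.0
    query_aligned_len = 0.0
    for op, length in cigar_tuples:
        if op in (0, 7, 8):        # M, =, X consume both query and reference
            align_len += length
            query_aligned_len += length
        elif op == 1:              # I consumes query only
            query_aligned_len += length
        elif op in (2, 3):         # D, N consume reference only
            align_len += length
    return align_len, query_aligned_len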
def callbase(bamfile, snpsites, out):
    BF = Samfile(bamfile, 'rb')  # open your bam file
    SF = open(snpsites, 'r')     # the file containing SNP site info
    RF = open(out, 'w')          # result file
    RF.write('ref_name\tpos\tRbase\tAbase\tA\tT\tC\tG\tN\tothers\n')
    for i in SF:
        if i.startswith('#'):
            continue
        else:
            line = ParseSNPsitesLine(i)
            vcf_pos = line.pos - 1  # change 1-based to 0-based
            vcf_refname = line.chrom
            print 'processing: %s %s...' % (vcf_refname, str(vcf_pos))
            At, Tt, Ct, Gt, Nt, othert = 0, 0, 0, 0, 0, 0
            for i in BF.pileup(vcf_refname, vcf_pos, vcf_pos + 1):
                if i.pos == vcf_pos:
                    vcf_Rbase = line.Rbase
                    vcf_Abase = line.Abase
                    for j in i.pileups:
                        yourbase = j.alignment.seq[j.qpos]
                        if yourbase == 'A':
                            At += 1
                        elif yourbase == 'T':
                            Tt += 1
                        elif yourbase == 'C':
                            Ct += 1
                        elif yourbase == 'G':
                            Gt += 1
                        elif yourbase == 'N':
                            Nt += 1
                        else:
                            othert += 1
            RF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (vcf_refname,
                     str(vcf_pos + 1), vcf_Rbase, vcf_Abase, str(At), str(Tt), str(Ct),
                     str(Gt), str(Nt), str(othert)))
    BF.close()
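# Hypothetical usage of callbase above; the file names are placeholders, not from the
# original code. ParseSNPsitesLine is an external helper that is assumed to expose
# .pos, .chrom, .Rbase and .Abase for each VCF-like input line.
if __name__ == '__main__':
    callbase('sample.sorted.bam', 'snp_sites.vcf', 'base_counts.tsv')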
def create_table(half_ext, feature_summit_file_name, bam_names, bam_counts, bam_list, output_file_name):

    # Initialization
    outLoc = "/".join(output_file_name.split("/")[:-1]) + "/"
    command = "mkdir -p " + outLoc
    os.system(command)

    # Allowed chromosomes
    chrList = ["chr" + str(e) for e in range(1, 23) + ["X"]]

    # Fetching regions
    featureSummitFile = open(feature_summit_file_name, "r")
    regionList = []
    for line in featureSummitFile:
        ll = line.strip().split("\t")
        if(ll[0] not in chrList):
            continue
        region = [ll[0], int(ll[1]) - half_ext, int(ll[2]) + half_ext]
        if(int(region[1]) < 0):
            continue
        regionList.append(region)
    featureSummitFile.close()

    # Creating table
    matrix = []
    for i in range(0, len(bam_list)):
        inputBamFileName = bam_list[i]
        correctFactor = int(bam_counts[i]) / 1000000
        extension = inputBamFileName.split(".")[-1]
        if(extension == "bam"):
            bamFile = Samfile(inputBamFileName, "rb")
            vec = []
            for region in regionList:
                try:
                    bamSignal = fetchSignal(bamFile, region) / correctFactor
                except Exception:
                    bamSignal = 0
                vec.append(bamSignal)
        elif(extension == "bw" or extension == "bigwig"):
            bamFile = pyBigWig.open(inputBamFileName)
            vec = []
            for region in regionList:
                try:
                    bamSignal = fetchSignalBw(bamFile, region) / correctFactor
                except Exception:
                    bamSignal = 0
                vec.append(bamSignal)
        else:
            print("The tool supports only BAM or BIGWIG files.")
        matrix.append(vec)
        bamFile.close()

    outputFile = open(output_file_name, "w")
    outputFile.write("\t".join(bam_names) + "\n")
    for j in range(0, len(matrix[0])):
        vec = []
        for i in range(0, len(matrix)):
            try:
                vec.append(str(matrix[i][j]))
            except Exception:
                vec.append("NA")
        outputFile.write("\t".join(vec) + "\n")
    outputFile.close()
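# Hedged sketch, not the original helpers: create_table above calls
# fetchSignal(bamFile, region) and fetchSignalBw(bwFile, region), which are not shown.
# Assuming the "signal" is the read count in the region for BAM input and the mean
# track value for bigWig input, minimal versions could look like:
def fetchSignal(bam_file, region):
    # count reads overlapping [start, end) on the given chromosome
    return bam_file.count(region[0], int(region[1]), int(region[2]))

def fetchSignalBw(bw_file, region):
    # mean bigWig signal over the region (pyBigWig returns [None] for empty regions)
    value = bw_file.stats(region[0], int(region[1]), int(region[2]))[0]
    return value if value is not None else 0.0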
def removeEdgeMismatches(self, bamFile, minDistance, minBaseQual):
    startTime = Helper.getTime()
    minDistance = int(minDistance)
    counter = 0
    j = 0
    num_lines = len(self.variantDict)
    Helper.info(" [%s] remove mismatches within the first %s bp of read edges" %
                (startTime.strftime("%c"), str(minDistance)),
                self.logFile, self.textField)

    bamFile = Samfile(bamFile, "rb")

    for varKey in self.variantDict.keys():
        variant = self.variantDict[varKey]
        counter += 1
        if counter % 10000 == 0:
            Helper.status('%s mm parsed ' % counter, self.logFile, self.textField, "grey")

        keepSNP = False
        varPos = variant.position - 1
        iter = bamFile.pileup(variant.chromosome, variant.position - 1, variant.position)
        # walk over the pileup columns which overlap this position
        for x in iter:
            if x.pos == varPos:
                for pileupread in x.pileups:  # walk through the single reads
                    if not pileupread.is_del and not pileupread.is_refskip:
                        distance = abs(pileupread.alignment.alen - pileupread.query_position) \
                            if pileupread.alignment.is_reverse else pileupread.query_position
                        if distance >= minDistance:
                            # check read base and base quality
                            if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt \
                                    and pileupread.alignment.query_qualities[pileupread.query_position] >= minBaseQual:
                                keepSNP = True
        if keepSNP == False:
            j += 1
            del self.variantDict[varKey]

    Helper.status('%s of %s variants were deleted' % (j, num_lines), self.logFile, self.textField, "black")
    Helper.printTimeDiff(startTime, self.logFile, self.textField)
    bamFile.close()
def main(args=None):
    if args is None:
        args = sys.argv[1:]
    f = Samfile(args[0])
    header = f.header
    f.close()
    reflen = header['SQ'][0]['LN']
    BamIO.write(clip(BamIO.parse(args[0]), reflen), args[1], header=header)
    return 0
def paired_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The end-to-end mapping of PE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of PE data produced an error."
    else:
        to_process.append(sam_list[1])
    for paired_sam in to_process:
        r1_match = {}
        r2_match = {}
        sam = Samfile(paired_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    # print query_name, align_len, query_aligned_len
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                        if align_len != 0 and nm >= 0:
                            paired_perc_id = ((align_len - nm) / align_len) * 100
                            if paired_perc_id >= 90:
                                if align.is_read1:
                                    r1_match.setdefault(query_name, {})
                                    r1_match[query_name].setdefault(ref_name, [])
                                    r1_match[query_name][ref_name].append(paired_perc_id)
                                if align.is_read2:
                                    r2_match.setdefault(query_name, {})
                                    r2_match[query_name].setdefault(ref_name, [])
                                    r2_match[query_name][ref_name].append(paired_perc_id)
        sam.close()
        for query in set(r1_match.keys()).intersection(set(r2_match.keys())):
            for ref in set(r1_match[query].keys()).intersection(r2_match[query].keys()):
                average_perc_id = calcola_media([max(r1_match[query][ref]), max(r2_match[query][ref])])
                if average_perc_id >= identity_threshold:
                    match.setdefault(query, set())
                    match[query].add(ref)
    return match
def subsample(fn, ns=None): if ns is None: fn, ns = fn sample = [] count = 0 outdir_base = path.join(path.dirname(fn), 'subset') sf = Samfile(fn) try: i_weight = float(sf.mapped) / max(ns) print "Read out ", i_weight except ValueError: i_weight = 0.0 for read in sf: i_weight += 1 print "Counted ", i_weight i_weight /= float(max(ns)) sf = Samfile(fn) print fn, count, i_weight for i, read in enumerate(sf): key = random()**i_weight if len(sample) < max(ns): heappush(sample, (key, read, i + count)) else: heappushpop(sample, (key, read, i + count)) count += i for n in ns: if n == min(ns): outdir = outdir_base + '_min' else: outdir = outdir_base + '{:04.1f}M'.format(n / 1e6) try: makedirs(outdir) except OSError: pass sampN = sorted(sample, reverse=True)[:int(n)] print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count) print fn, '->', outdir stdout.flush() of = Samfile(path.join(outdir, 'accepted_hits.bam'), mode='wb', template=sf) sample.sort(key=lambda (key, read, pos): (read.tid, read.pos)) for key, read, pos in sampN: of.write(read) of.close() sf.close() return [count for key, read, count in sample]
def expression_dict_from_bam(alias_dict, gene_dict, exp_file_name):

    # Fetching expression
    exp_dict = dict()
    exp_file = Samfile(exp_file_name, "rb")
    for k in gene_dict.keys():
        geneVec = gene_dict[k]
        gene = geneVec[3]
        region = [geneVec[0], int(geneVec[1]), int(geneVec[2])]
        exp = fetch_counts(exp_file, region)
        exp_dict[gene] = float(exp) / (region[2] - region[1])
    exp_file.close()

    # Returning objects
    return exp_dict
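# Hedged sketch, not the original helper: expression_dict_from_bam above calls
# fetch_counts(exp_file, region), which is not defined in this snippet. Assuming it
# returns the number of reads overlapping the region, a minimal version could be:
def fetch_counts(bam_file, region):
    chrom, start, end = region[0], int(region[1]), int(region[2])
    try:
        return bam_file.count(chrom, start, end)
    except ValueError:
        # chromosome missing from the BAM header, or region outside its bounds
        return 0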
def subsample(fn, ns=None): if ns is None: fn, ns = fn sample = [] count = 0 outdir_base = path.join(path.dirname(fn), "subset") sf = Samfile(fn) try: i_weight = float(sf.mapped) / max(ns) print "Read out ", i_weight except ValueError: i_weight = 0.0 for read in sf: i_weight += 1 print "Counted ", i_weight i_weight /= float(max(ns)) sf = Samfile(fn) print fn, count, i_weight for i, read in enumerate(sf): key = random() ** i_weight if len(sample) < max(ns): heappush(sample, (key, read, i + count)) else: heappushpop(sample, (key, read, i + count)) count += i for n in ns: if n == min(ns): outdir = outdir_base + "_min" else: outdir = outdir_base + "{:04.1f}M".format(n / 1e6) try: makedirs(outdir) except OSError: pass sampN = sorted(sample, reverse=True)[: int(n)] print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count) print fn, "->", outdir stdout.flush() of = Samfile(path.join(outdir, "accepted_hits.bam"), mode="wb", template=sf) sample.sort(key=lambda (key, read, pos): (read.tid, read.pos)) for key, read, pos in sampN: of.write(read) of.close() sf.close() return [count for key, read, count in sample]
def create_hic_file(chrom_sizes_file_name, ctcf_peaks_file_name, ctcf_motifs_file_name, loops_file_name, loops_hiccups_output_file_name): # Parameters outLoc = "/".join(loops_hiccups_output_file_name.split("/")[:-1]) + "/" command = "mkdir -p " + outLoc os.system(command) # Chrom sizes chrom_list, chrom_dict = read_chromosome_sizes(chrom_sizes_file_name) # Reading loop list loop_list = read_loop_list(chrom_list, loops_file_name) # Hiccups Header hic_header = [ "chr1", "x1", "x2", "chr2", "y1", "y2", "color", "o", "e_bl", "e_donute_h", "e_v", "fdr_bl", "fdr_donut", "fdr_h", "fdr_v", "num_collapsed", "centroid1", "centroid2", "radius", "motif_x1", "motif_x2", "sequence_1", "orientation_1", "uniqueness_1", "motif_y1", "motif_y2", "sequence2", "orientation_2", "uniqueness_2" ] # Opening CTCF files if (os.path.isfile(ctcf_peaks_file_name) and os.path.isfile(ctcf_motifs_file_name)): ctcf_peaks_file = Samfile(ctcf_peaks_file_name, "rb") ctcf_motifs_file = Samfile(ctcf_motifs_file_name, "rb") else: ctcf_peaks_file = None ctcf_motifs_file = None # Writing hiccups file write_hiccups_file(hic_header, loop_list, ctcf_peaks_file, ctcf_motifs_file, loops_hiccups_output_file_name) # Closing bam files if (os.path.isfile(ctcf_peaks_file_name) and os.path.isfile(ctcf_motifs_file_name)): ctcf_peaks_file.close() ctcf_motifs_file.close()
def get_sorted_aligned_reads(args, header, sequence): if args.ref_idx and os.path.exists(args.ref_idx): print("Loading index...") ref_index = load_hash(args.ref_idx) else: print("Computing reference index...") ref_index = build_hashtable(sequence, args.kmer, args.stride) print("Verifying hash...") for hash_, offset_ in islice(ref_index[0].iteritems(), 20): if not verify_hash(sequence, offset_, args.kmer, hash_): raise ValueError( 'Index failed to verify: offset {} has mismatching hashes'. format(offset_)) print("Aligning reads...") pair_iterator = read_paired_fasta(args.reads_file) sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw1_rg') print('Sorting SAMRecords in memory...') sam_iterator = iter(sorted(sam_iterator, cmp=compsam)) if args.out_bam: header = { 'HD': { 'VN': '1.0' }, 'SQ': [{ 'SN': header[1:], 'LN': len(sequence) }], 'RG': [{ 'ID': 'hw1_rg', 'SM': SAMPLE_NAME, 'PU': 'Unknown', 'PL': 'Unknown', 'LB': 'Unknown' }] } outfile = Samfile(args.out_bam, 'wb', header=header) for read in sam_iterator: outfile.write(read) outfile.close() infile = Samfile(args.out_bam, 'rb') sam_iterator = infile return sam_iterator
def create_table(downstream_extension, upstream_extension, number_of_bins, number_of_counts, alias_file_name,
                 gene_list_file_name, regions_file_name, signal_file_type, signal_file_name, output_gene_file_name):

    # Fetch alias dictionary
    aliasDict = create_alias_dictionary(alias_file_name)

    # Fetch gene list dict
    if(gene_list_file_name == "."):
        gene_list_file_name = None
    geneListDict = create_gene_list_dictionary(aliasDict, gene_list_file_name)

    # Fetch genes and enhancers
    geneDict = genes_dictionary(aliasDict, geneListDict, regions_file_name)

    # Opening signal file
    if(signal_file_type == "bam"):
        signal_file = Samfile(signal_file_name, "rb")
    elif(signal_file_type == "bw"):
        signal_file = pyBigWig.open(signal_file_name)

    # Writing meta signals
    write_meta_signals(downstream_extension, upstream_extension, number_of_bins, number_of_counts, geneDict,
                       signal_file_type, signal_file, output_gene_file_name)
    signal_file.close()
def removeEdgeMismatches(self,bamFile,minDistance, minBaseQual): startTime=Helper.getTime() minDistance=int(minDistance) counter=0;j=0 num_lines = len(self.variantDict) Helper.info(" [%s] remove Missmatches from the first %s bp from read edges" % (startTime.strftime("%c"),str(minDistance)),self.logFile,self.textField) bamFile = Samfile(bamFile, "rb") for varKey in self.variantDict.keys(): variant = self.variantDict[varKey] counter+=1 if counter%10000==0: Helper.status('%s mm parsed ' % counter ,self.logFile, self.textField,"grey") keepSNP=False varPos=variant.position-1 iter = bamFile.pileup(variant.chromosome, variant.position-1, variant.position) #walks up the region wich overlap this position for x in iter: if x.pos == varPos: for pileupread in x.pileups: #walk through the single reads if not pileupread.is_del and not pileupread.is_refskip: distance=abs(pileupread.alignment.alen-pileupread.query_position) if pileupread.alignment.is_reverse else pileupread.query_position if distance >= minDistance: #check readBase and Base Quality if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt and pileupread.alignment.query_qualities[pileupread.query_position]>=minBaseQual: #if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt: keepSNP=True if keepSNP==False: j+=1 del self.variantDict[varKey] Helper.status('%s of %svariants were deleted' % (j,num_lines), self.logFile, self.textField,"black") Helper.printTimeDiff(startTime, self.logFile, self.textField) bamFile.close()
def get_promoters(promoter_ext, alias_dict, genomic_regions_file_name, stag_regions_file_name): # Initialization regionFile = open(genomic_regions_file_name, "rU") stagRegionsFile = Samfile(stag_regions_file_name, "rb") allList = [] stagDict = dict() # Iterating on region file for line in regionFile: # Fetching data ll = line.strip().split("\t") nn = ll[3].split(":") chrom = ll[0]; start = ll[1]; end = ll[2]; strand = ll[5]; region = nn[0]; name = nn[1]; activity = nn[2] if(region != "PROMOTER" or activity == "INACTIVE"): continue # Gene name try: gene = alias_dict[name.upper()] except Exception: gene = name.upper() promoterWriteVec = [chrom, start, end, gene, "0", strand] # Appending gene to allList allList.append(promoterWriteVec) # Check whether promoter intersect both stag regions promoterRegion = [chrom, max(int(start)-promoter_ext, 0), int(end)+promoter_ext] check = check_bam_at_least_one_read(stagRegionsFile, promoterRegion) if(not check): continue stagDict[gene] = promoterWriteVec # Termination regionFile.close() stagRegionsFile.close() # Returning objects return allList, stagDict
def merge_interregional_bams(input_paths, output_path, regions_dir): """Merges regional sorted BAMs Should be used for regional BAMs that are sorted and have defined intervals. Written to work with XGAP pipeline only, specifically using config to find interval files. Args: input_paths: List of regional BAMs, in proper order output_path: Path to output merged BAM regions_dir: Path to directory with interval files config: Config dict for XGAP """ n_regions = len(input_paths) sample = Samfile(input_paths[0], 'rb') sample_header = dict(sample.header) sample.close() seq_dict = {} for i, seq in enumerate(sample_header["SQ"]): seq_dict[seq["SN"]] = i #regions_dir = "{}/regions/{}/".format(config['output-dir'], n_regions) with Samfile(output_path, 'wb', header=sample_header) as out_file: for index, input_path in enumerate(input_paths): # Object Info: region['ref_id'] = [lower_bound, upper_bound] region = _load_region(index, regions_dir, n_regions, seq_dict) started = False with Samfile(input_path, 'rb') as in_file: for alignment in in_file: pos = alignment.reference_start ref_id = alignment.reference_id if region[ref_id][0] <= pos <= region[ref_id][1]: if not started: started = True out_file.write(alignment) else: # Reached end of interval, move to next file if started: break
def split_samfile(sam_file, splits, prefix='', path=''):
    """Take a sam file and split it splits number of times.

    :path:    Where to put the split files.
    :prefix:  A prefix for the outfile names.
    :returns: A tuple of job files.
    """
    # Determine how many reads will be in each split sam file.
    num_lines = count_reads(sam_file)
    num_reads = int(int(num_lines) / splits) + 1

    # Get rid of starting path
    sam_name = os.path.basename(sam_file)

    # Subset the SAM file into X number of jobs
    cnt = 0
    currjob = 1
    suffix = '.split_sam_' + str(currjob).zfill(4)
    run_file = os.path.join(path, prefix + sam_name + suffix)
    rmode = 'rb' if sam_name.split('.')[-1] == 'bam' else 'r'
    wmode = 'wb'

    # Actually split the file
    outfiles = [run_file]
    with Samfile(sam_file, rmode) as in_sam:
        sam_split = Samfile(run_file, wmode, template=in_sam)
        for line in in_sam:
            cnt += 1
            if cnt < num_reads:
                sam_split.write(line)
            elif cnt == num_reads:
                # Check if next line is mate-pair. If so, don't split.
                line2 = next(in_sam)
                currjob += 1
                suffix = '.split_sam_' + str(currjob).zfill(4)
                run_file = os.path.join(path, prefix + sam_name + suffix)
                new_sam = Samfile(run_file, wmode, template=in_sam)
                outfiles.append(run_file)

                if line.qname == line2.qname:
                    sam_split.write(line)
                    sam_split.write(line2)
                    sam_split.close()
                    cnt = 0
                else:
                    sam_split.write(line)
                    sam_split.close()
                    new_sam.write(line2)
                    cnt = 0
                sam_split = new_sam
        sam_split.close()
    return tuple(outfiles)
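# Hypothetical usage of split_samfile above (paths are placeholders): split a BAM into
# four roughly equal chunks, keeping mate pairs together at chunk boundaries. The
# returned tuple lists the chunk file names in order; count_reads is assumed to be
# available from the surrounding module.
chunks = split_samfile('alignments.bam', 4, prefix='job_', path='tmp_splits')
for chunk in chunks:
    print(chunk)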
def parse_barcode(bamfile):
    """parses a sorted and indexed bam file, removes all cases where rna hits more
    than one spot in genome and writes to a file, creates files for mutant and
    wildtype based on barcodes"""
    samfile = Samfile(bamfile, "rb")
    multi_hit_file = Samfile("MultiHit.bam", "wb", template=samfile)
    mutant = Samfile("Mutant.bam", "wb", template=samfile)
    wildtype = Samfile("Wildtype.bam", "wb", template=samfile)
    for line in samfile.fetch():
        # if line.is_secondary:  # does this hit more than one spot in genome
        #     multi_hit_file.write(line)
        if "#GAGT" in line.qname or "#TTAG" in line.qname:
            # write to mutant file
            mutant.write(line)
        elif "#ACCC" in line.qname or "#CGTA" in line.qname:
            # write to wildtype file
            wildtype.write(line)
    multi_hit_file.close()
    mutant.close()
    wildtype.close()
    samfile.close()
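# Hypothetical usage of parse_barcode above (the file name is a placeholder): the input
# BAM must be coordinate-sorted and indexed, because the function iterates with
# samfile.fetch(). Mutant.bam, Wildtype.bam and MultiHit.bam are written to the current
# working directory.
parse_barcode('barcoded_reads.sorted.bam')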
value = float(ll[0]) if(value < 0): value -= pseudocount if(value > 0): value += pseudocount eigenVec = [key, value] if(value > 0): posReadCount += float(fetchTotalReadsBam(dnaseFile, [chromosome, p1, p2])) posCount += 1 if(value > maxPos): maxPos = value elif(value < 0): negReadCount += float(fetchTotalReadsBam(dnaseFile, [chromosome, p1, p2])) negCount += 1 if(value < minNeg): minNeg = value position += resolution eigenList.append(eigenVec) eigenFile.close() dnaseFile.close() posReadCount = posReadCount/posCount negReadCount = negReadCount/negCount maxPos = round(maxPos * 100,4) minNeg = round(minNeg * 100,4) # Checking if signal change is needed if(negReadCount > posReadCount): for i in range(0,len(eigenList)): if(eigenList[i][1] == 0): eigenList[i][1] = -maxPos continue eigenList[i][1] = round(-eigenList[i][1] * 100,4) else: for i in range(0,len(eigenList)): if(eigenList[i][1] == 0):
def _sort_alignments_by_region(bwa_cmd, regions, output_dir, basename, log_output=stdout): """Sort bwa alignments into specified regions Sorts alignments from bwa command into separate files for each specified region. Sorts unmapped and qc fail reads into separate files. Paired reads that fall into different regions are added to chrI file. Args: bwa_cmd: String for full bwa command (Ex. "bwa mem -M -t 1 ref.fa test.fq") regions: A dict storing region information. regions['seq'] = [list of regions that encompass a part of 'seq'] output_dir: Directory storing region subdirectories that store alignments basename: File prefix for output BAM files. log_output: Handle for log output Returns: output_paths: List of all output files. """ regional_reads, header, seq_dict = {}, {}, {} curr_reads = [] read = None unmapped_path = "{}/unmapped/{}.bam".format(output_dir, basename) qcfail_path = "{}/qcfail/{}.bam".format(output_dir, basename) log_output.write("executing BWA\n") start = time() bwa_process = Popen(bwa_cmd, stdout=PIPE, stderr=log_output, shell=True) for line in bwa_process.stdout: line = line.decode("utf-8") if line[0] == '@': _add_line_to_header(line, header) continue if not seq_dict: for index, seq in enumerate(header["SQ"]): header["SQ"][index]["LN"] = int(header["SQ"][index]["LN"]) seq_dict[seq["SN"]] = index seq_dict[index] = seq["SN"] unmapped_file = Samfile(unmapped_path, 'wb', header=header) qcfail_file = Samfile(qcfail_path, 'wb', header=header) read = _string_to_aligned_segment(line, seq_dict, log_output) #conditionals to group bwa output by read names and sort one group at a time if not curr_reads: curr_reads.append(read) continue if curr_reads[0].query_name == read.query_name: curr_reads.append(read) continue else: _read_sorter(curr_reads, unmapped_file, qcfail_file, regions, regional_reads, seq_dict, log_output) curr_reads.clear() curr_reads.append(read) if curr_reads: _read_sorter(curr_reads, unmapped_file, qcfail_file, regions, regional_reads, seq_dict, log_output) curr_reads.clear() _add_line_to_header("@HD\tSO:coordinate", header) output_paths = _sort_regional_reads(regional_reads, regions, output_dir, basename, header, log_output) end = time() log_output.write("Sorted {} regions in {} " "seconds\nDone\n".format(len(output_paths), (end - start))) log_output.flush() fsync(log_output.fileno()) output_paths.append(unmapped_path) output_paths.append(qcfail_path) unmapped_file.close() qcfail_file.close() return output_paths
def _sort_regional_reads(regional_reads, regions, output_dir, basename, header, log_output): """Sorts reads in dict struct by coordinate and writes to BAM files Uses counting sort with the intuition that most positions in a range will map to a non-empty bucket, where a bucket holds reads mapped to a certain position. Args: regional_reads: Complicated dict struct ex. regional_reads[region][chromosome][position] returns list of reads at this position in this chromosome in this region regions: Regions dict used to sort reads during alignment. Produced by _load_regions() output_dir: self-explanatory basename: Prefix for output BAM files header: Dict representation of SAM header (described in pysam docs) log_output: File object for log. Returns: output_paths: List of output file paths """ output_paths = [] region_intervals = {} sequence_order = [chrom["SN"] for chrom in header["SQ"]] region_names = set(["chri"]) # Get region intervals for seq_id in sequence_order: for interval in regions[seq_id]: region_names.add(interval.name) if interval.name in region_intervals: region_intervals[interval.name].append( [seq_id, interval.lower_bound, interval.upper_bound]) else: region_intervals[interval.name] = [[ seq_id, interval.lower_bound, interval.upper_bound ]] #for region in regional_reads.keys(): for region in region_names: output_path = "{}/{}/{}.bam".format(output_dir, region, basename) output_file = Samfile(output_path, 'wb', header=header) if region == "chri": for chromosome in sequence_order: if chromosome in regional_reads[region]: for position in sorted( regional_reads[region][chromosome].keys()): for read in regional_reads[region][chromosome].pop( position): output_file.write(read) elif region in regional_reads: for entries in region_intervals[region]: chromosome = entries[0] low, high = entries[1], entries[2] if chromosome in regional_reads[region]: for position in range(low, high + 1): if position in regional_reads[region][chromosome]: for read in regional_reads[region][chromosome].pop( position): output_file.write(read) else: log_output.write("No reads mapping to " "{} {}\n".format(region, chromosome)) log_output.flush() fsync(log_output.fileno()) output_paths.append(output_path) output_file.close() return output_paths
def estimate_bias_kmer(args): # Parameters maxDuplicates = 100 pseudocount = 1.0 # Initializing bam and fasta bamFile = Samfile(args.reads_file, "rb") genome_data = GenomeData(args.organism) fastaFile = Fastafile(genome_data.get_genome()) regions = GenomicRegionSet("regions") regions.read(args.regions_file) # Initializing dictionaries obsDictF = dict() obsDictR = dict() expDictF = dict() expDictR = dict() ct_reads_r = 0 ct_reads_f = 0 ct_kmers = 0 # Iterating on HS regions for region in regions: # Initialization prevPos = -1 trueCounter = 0 # Evaluating observed frequencies #################################### # Fetching reads for r in bamFile.fetch(region.chrom, region.initial, region.final): # Calculating positions if not r.is_reverse: cut_site = r.pos + args.forward_shift - 1 p1 = cut_site - int(floor(args.k_nb / 2)) else: cut_site = r.aend + args.reverse_shift + 1 p1 = cut_site - int(floor(args.k_nb / 2)) p2 = p1 + args.k_nb # Verifying PCR artifacts if p1 == prevPos: trueCounter += 1 else: prevPos = p1 trueCounter = 0 if trueCounter > maxDuplicates: continue # Fetching k-mer try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper() except Exception: continue if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr) # Counting k-mer in dictionary if not r.is_reverse: ct_reads_f += 1 try: obsDictF[currStr] += 1 except Exception: obsDictF[currStr] = 1 else: ct_reads_r += 1 try: obsDictR[currStr] += 1 except Exception: obsDictR[currStr] = 1 # Evaluating expected frequencies #################################### # Fetching whole sequence try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper() except Exception: continue currRevComp = AuxiliaryFunctions.revcomp(currStr) # Iterating on each sequence position for i in range(0, len(currStr) - args.k_nb): ct_kmers += 1 # Counting k-mer in dictionary s = currStr[i:i + args.k_nb] try: expDictF[s] += 1 except Exception: expDictF[s] = 1 # Counting k-mer in dictionary for reverse complement s = currRevComp[i:i + args.k_nb] try: expDictR[s] += 1 except Exception: expDictR[s] = 1 # Closing files bamFile.close() fastaFile.close() # Creating bias dictionary alphabet = ["A", "C", "G", "T"] kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)] bias_table_F = dict([(e, 0.0) for e in kmerComb]) bias_table_R = dict([(e, 0.0) for e in kmerComb]) for kmer in kmerComb: try: obsF = obsDictF[kmer] + pseudocount except Exception: obsF = pseudocount try: expF = expDictF[kmer] + pseudocount except Exception: expF = pseudocount if ct_reads_f == 0: bias_table_F[kmer] = 1 else: bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6) try: obsR = obsDictR[kmer] + pseudocount except Exception: obsR = pseudocount try: expR = expDictR[kmer] + pseudocount except Exception: expR = pseudocount if ct_reads_r == 0: bias_table_R[kmer] = 1 else: bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6) write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def main(): parser = OptionParser(usage=usage) #parser.add_option("-s", action="store_true", dest="sam_input", default=False, #help="Input is in SAM format instead of BAM format") (options, args) = parser.parse_args() if len(args) != 4: parser.print_help() sys.exit(1) psl_filename = args[0] ref_filename = args[1] contigs_filename = args[2] bam_filename = args[3] liftover_dir = args[1] references, ref_chromosomes = read_fasta(ref_filename) refname_to_id = dict([(name,i) for i,name in enumerate(ref_chromosomes)]) print('Read', len(ref_chromosomes), 'reference chromosomes:', ','.join(ref_chromosomes), file=sys.stderr) contigs, contig_names = read_fasta(contigs_filename) print('Read', len(contig_names), 'contigs.', file=sys.stderr) bam_header = {'HD': {'VN': '1.0'}, 'SQ': [dict([('LN', len(references[chromosome])), ('SN', chromosome)]) for chromosome in ref_chromosomes] } outfile = Samfile(bam_filename, 'wb', header=bam_header) line_nr = 0 header_read = False for line in (s.strip() for s in open(psl_filename)): line_nr += 1 if line.startswith('------'): header_read = True continue if not header_read: continue fields = line.split() assert len(fields) == 21, 'Error reading PSL file, offending line: %d'%line_nr sizes = [int(x) for x in fields[18].strip(',').split(',')] contig_starts = [int(x) for x in fields[19].strip(',').split(',')] ref_starts = [int(x) for x in fields[20].strip(',').split(',')] assert 0 < len(sizes) == len(contig_starts) == len(ref_starts) strand = fields[8] contig_name = fields[9] ref_name = fields[13] assert strand in ['-','+'] assert contig_name in contigs assert ref_name in references a = AlignedRead() a.qname = contig_name if strand == '+': a.seq = str(contigs[contig_name]) else: a.seq = str(contigs[contig_name].reverse_complement()) a.flag = (16 if strand == '+' else 0) a.rname = refname_to_id[ref_name] a.pos = ref_starts[0] a.mapq = 255 qpos = contig_starts[0] refpos = ref_starts[0] cigar = [] # soft-clipping at the start? if contig_starts[0] > 0: cigar.append((4,contig_starts[0])) longest_insertion = 0 longest_deletion = 0 total_matches = 0 total_insertion = 0 total_deletion = 0 for length, contig_start, ref_start in zip(sizes, contig_starts, ref_starts): assert contig_start >= qpos assert ref_start >= refpos # insertion? if contig_start > qpos: insertion_length = contig_start - qpos longest_insertion = max(longest_insertion, insertion_length) total_insertion += insertion_length append_to_cigar(cigar, 1, insertion_length) qpos = contig_start # deletion? if ref_start > refpos: deletion_length = ref_start - refpos longest_deletion = max(longest_deletion, deletion_length) total_deletion += deletion_length append_to_cigar(cigar, 2, deletion_length) refpos = ref_start # strech of matches/mismatches append_to_cigar(cigar, 0, length) refpos += length qpos += length total_matches += length # soft-clipping at the end? if len(a.seq) > qpos: cigar.append((4,len(a.seq) - qpos)) a.cigar = tuple(cigar) # only use contigs where longest deletion is <= 10000 bp if longest_deletion > 10000: continue # require at least 200 matching positions if total_matches < 200: continue # require the matching positions to make up at least 75 percent of the contig # (without counting parts of the contig that are insertions). if total_matches / (len(a.seq) - total_insertion) < 0.75: continue outfile.write(a) outfile.close()
import os
import sys

from pysam import Samfile


def filter_reads(sam, mapq, osam):
    for aread in sam:
        if aread.is_unmapped:
            continue
        if aread.mapq >= mapq:
            osam.write(aread)


if __name__ == '__main__':
    mapq = int(sys.argv[1])
    for fn in sys.argv[2:]:
        isam = Samfile(fn)
        if fn.endswith('bam'):
            ofn = fn.replace('bam', 'map%s.bam' % mapq)
        elif fn.endswith('sam'):
            ofn = fn.replace('sam', 'map%s.bam' % mapq)
        else:
            ofn = fn + ".mapq%s.bam" % mapq
        if os.path.exists(ofn):
            print("Error:", ofn, "already exists!")
            continue
        osam = Samfile(ofn, 'wb', template=isam)
        filter_reads(isam, mapq, osam)
        osam.close()
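# Hypothetical invocation of the MAPQ filter above (the script name and file names are
# placeholders):
#   python filter_by_mapq.py 30 reads1.bam reads2.sam
# This writes reads1.map30.bam and reads2.map30.bam, dropping unmapped reads and keeping
# only alignments with MAPQ >= 30.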
""" ret = [] for i in re.findall("\d+|\^?[ATCGN]+", md): if i.startswith('^'): ret.extend(list(i[1:])) elif i[0] in ["A", "T", "C", "G", "N"]: ret.extend(list(i)) else: ret.extend(['-'] * int(i)) return ret if __name__ == '__main__': f = Samfile(sys.argv[1]) out = Samfile(sys.argv[1][:-4] + "_realign.bam", 'wb', template=f) count = 0.0 n = 0.05 for read in f: q, t = expandAlign(read) query, target = realign(read) replace(read, query, target) out.write(read) count += 1 if (count / f.mapped) > n: n += 0.05 print "[%s] -- parsed %d of %d reads (%.2f)" % ( time.asctime(), int(count), f.mapped, count / f.mapped) out.close()
""" Turns abbreviated MD into a full array """ ret = [] for i in re.findall("\d+|\^?[ATCGN]+", md): if i.startswith('^'): ret.extend(list(i[1:])) elif i[0] in ["A","T","C","G","N"]: ret.extend(list(i)) else: ret.extend(['-']*int(i)) return ret if __name__ == '__main__': f = Samfile(sys.argv[1]) out = Samfile(sys.argv[1][:-4]+"_realign.bam",'wb', template=f) count = 0.0 n = 0.05 for read in f: q,t = expandAlign(read) query, target = realign(read) replace(read, query, target) out.write(read) count += 1 if (count / f.mapped) > n: n += 0.05 print "[%s] -- parsed %d of %d reads (%.2f)" % (time.asctime(), int(count), f.mapped, count/f.mapped ) out.close()
outputFile.write("fixedStep chrom="+chrName+" start="+str(p1+1)+" step=1\n") fSum = sum(af[:window]); rSum = sum(ar[:window]); fLast = af[0]; rLast = ar[0] for i in range((window/2),len(af)-(window/2)): nhatf = Nf[i-(window/2)]*(af[i]/fSum) nhatr = Nr[i-(window/2)]*(ar[i]/rSum) zf = log(nf[i]+1)-log(nhatf+1) zr = log(nr[i]+1)-log(nhatr+1) outputFile.write(str(round(zf+zr,4))+"\n") #print i+p1+1-(window/2), af[i], ar[i], fSum, rSum, Nf[i-(window/2)], Nr[i-(window/2)] fSum -= fLast; fSum += af[i+(window/2)]; fLast = af[i-(window/2)+1] rSum -= rLast; rSum += ar[i+(window/2)]; rLast = ar[i-(window/2)+1] #for i in range(p1, p2): # print i+1, z[i-p1] except Exception: continue # Closing files bamFile.close() fastaFile.close() coordFile.close() outputFile.close() # Converting to bigwig os.system(" ".join(["wigToBigWig",outputFileName,csFileName,outputFileName[:-3]+"bw"])) os.system(" ".join(["wigToBigWig",outputFileNameRaw,csFileName,outputFileNameRaw[:-3]+"bw"])) #os.system(" ".join(["rm",outputFileName]))
r21p = 0.27 r22p = 0.27 r23p = 0.27 else: r11p = 0.6 r12p = 0.6 r13p = 0.6 r21p = 0.6 r22p = 0.6 r23p = 0.6 signal1 = [ e * r11p for e in fetchSignalBam(signalFile, region11, bamExt) ] + [e * r12p for e in fetchSignalBam(signalFile, region12, bamExt) ] + [e * r13p for e in fetchSignalBam(signalFile, region13, bamExt)] signal2 = [ e * r21p for e in fetchSignalBam(signalFile, region21, bamExt) ] + [e * r22p for e in fetchSignalBam(signalFile, region22, bamExt) ] + [e * r23p for e in fetchSignalBam(signalFile, region23, bamExt)] signal = signal1 + signal2 # Updating vector vector = vector + signal # Writing vector outputFile.write("\t".join([str(e) for e in vector]) + "\n") # Closing all files ctcfFile.close() outputFile.close() signalFile.close()
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift): """ Estimates bias based on HS regions, DNase-seq signal and genomic sequences. Keyword arguments: regions -- DNase-seq HS regions. dnase_file_name -- DNase-seq file name. genome_file_name -- Genome to fetch genomic sequences from. Return: bias_table_F, bias_table_R -- Bias tables. """ # Parameters maxDuplicates = 100 pseudocount = 1.0 # Initializing bam and fasta if(dnase_file_name.split(".")[-1].upper() != "BAM"): return None # TODO ERROR bamFile = Samfile(dnase_file_name, "rb") fastaFile = Fastafile(genome_file_name) # Initializing dictionaries obsDictF = dict(); obsDictR = dict() expDictF = dict(); expDictR = dict() ct_reads_r=0 ct_reads_f=0 ct_kmers=0 # Iterating on HS regions for region in regions: # Initialization prevPos = -1 trueCounter = 0 # Evaluating observed frequencies #################################### # Fetching reads for r in bamFile.fetch(region.chrom, region.initial, region.final): # Calculating positions if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift else: p1 = r.aend - (k_nb/2) + 1 - shift p2 = p1 + k_nb # Verifying PCR artifacts if(p1 == prevPos): trueCounter += 1 else: prevPos = p1 trueCounter = 0 if(trueCounter > maxDuplicates): continue # Fetching k-mer try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper() except Exception: continue if(r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr) # Counting k-mer in dictionary if(not r.is_reverse): ct_reads_r+=1 try: obsDictF[currStr] += 1 except Exception: obsDictF[currStr] = 1 else: ct_reads_f+=1 try: obsDictR[currStr] += 1 except Exception: obsDictR[currStr] = 1 # Evaluating expected frequencies #################################### # Fetching whole sequence try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper() except Exception: continue currRevComp = AuxiliaryFunctions.revcomp(currStr) # Iterating on each sequence position for i in range(0,len(currStr)-k_nb): ct_kmers+=1 # Counting k-mer in dictionary s = currStr[i:i+k_nb] try: expDictF[s] += 1 except Exception: expDictF[s] = 1 # Counting k-mer in dictionary for reverse complement s = currRevComp[i:i+k_nb] try: expDictR[s] += 1 except Exception: expDictR[s] = 1 # Closing files bamFile.close() fastaFile.close() # Creating bias dictionary alphabet = ["A","C","G","T"] kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)] bias_table_F = dict([(e,0.0) for e in kmerComb]) bias_table_R = dict([(e,0.0) for e in kmerComb]) for kmer in kmerComb: try: obsF = obsDictF[kmer] + pseudocount except Exception: obsF = pseudocount try: expF = expDictF[kmer] + pseudocount except Exception: expF = pseudocount bias_table_F[kmer] = round(float(obsF/ct_reads_f)/float(expF/ct_kmers),6) try: obsR = obsDictR[kmer] + pseudocount except Exception: obsR = pseudocount try: expR = expDictR[kmer] + pseudocount except Exception: expR = pseudocount bias_table_R[kmer] = round(float(obsR/ct_reads_r)/float(expR/ct_kmers),6) # Return return [bias_table_F, bias_table_R]
align.rlen), sam.getrname(align.tid) if align.cigar is not None: align_len, query_aligned_len = cigar_parsing( align.cigar) nm = -1 if (query_aligned_len / query_len) * 100 >= coverage: for coppia in align.tags: if coppia[0] == "NM": nm = float(coppia[1]) if align_len != 0 and nm >= 0: paired_perc_id = ( (align_len - nm) / align_len) * 100 if paired_perc_id >= identity_threshold: match.setdefault(query_name, set()) match[query_name].add(ref_name) sam.close() else: print "no mapping data" sys.exit() if paired_sam is not None: if os.path.exists(paired_sam): r1_match = {} r2_match = {} sam = Samfile(paired_sam) for align in sam: if align.tid != -1: query_name, query_len, ref_name = align.qname, float( align.rlen), sam.getrname(align.tid) if align.cigar is not None: align_len, query_aligned_len = cigar_parsing(
def fix_bigwig(chromosome, chromSizesFileName, chromSizesFileEnhName, mainBamFileName, toAddBamFileNameList, toRemoveBamFileNameList, outWigFileName): # Fixed parameters GENOME_WINDOW_SIZE = 1000000 WINDOW_SIZE = 10 TOTAL_BINS = GENOME_WINDOW_SIZE / WINDOW_SIZE # Get chrom sizes chrom_list, genome_sizes_dict = get_chrom_sizes(chromSizesFileName) # Open bam and wig files outWigFile = open(outWigFileName, "w") mainBamFile = Samfile(mainBamFileName, "rb") toAddBamFileList = [Samfile(e, "rb") for e in toAddBamFileNameList] toRemoveBamFileList = [Samfile(e, "rb") for e in toRemoveBamFileNameList] # Wig header wig_header = "fixedStep chrom=" + chromosome + " start=1 step=" + str( WINDOW_SIZE) outWigFile.write(wig_header + "\n") # Iterating on genomic regions for memory purposes for i in range(0, genome_sizes_dict[chromosome], GENOME_WINDOW_SIZE): # Region to fetch the signal region = [ chromosome, i, min(i + GENOME_WINDOW_SIZE, genome_sizes_dict[chromosome]) ] # Fetch signals vector_list_add = [] vector_list_rm = [] mainSignal = fetchSignalBam(TOTAL_BINS, region, mainBamFile) for j in range(0, len(toAddBamFileList)): vector_list_add.append( fetchSignalBam(TOTAL_BINS, region, toAddBamFileList[j])) for j in range(0, len(toRemoveBamFileList)): vector_list_rm.append( fetchSignalBam(TOTAL_BINS, region, toRemoveBamFileList[j])) # Writing signals for j in range(0, TOTAL_BINS): vMain = mainSignal[j] vToAdd = sum([0.2 * e[j] for e in vector_list_add]) vToRemove = sum([0.3 * e[j] for e in vector_list_rm]) outWigFile.write( str(max((vMain + vToAdd) - vToRemove, 0.0)) + "\n") # Termination mainBamFile.close() outWigFile.close() for e in toAddBamFileList: e.close() for e in toRemoveBamFileList: e.close() convert_to_bigwig(outWigFileName, chromSizesFileEnhName, ".".join(outWigFileName.split(".")[:-1] + ["bw"]), remove_original=False)
stagPeakFile = Samfile(stagPeakFileName, "rb") outputActiveFile = open(outputActiveFileName, "w") outputInactiveFile = open(outputInactiveFileName, "w") # Iterating on region file for line in regionFile: ll = line.strip().split("\t") chromosome = ll[0]; start = ll[1]; end = ll[2]; regionList = ll[3].split(":"); score = ll[4]; strand = ll[5] if(regionList[0] != "PROMOTER"): continue check = check_bam_at_least_one_read(stagPeakFile, [chromosome, int(start)-peakExt, int(end)+peakExt]) if(not check): continue try: gene = aliasDict[regionList[1].upper()] except Exception: gene = regionList[1].upper() try: exp = expDict[gene] except Exception: continue if(regionList[2] == "ACTIVE"): outputActiveFile.write("\t".join([gene, exp])+"\n") elif(regionList[2] == "INACTIVE"): outputInactiveFile.write("\t".join([gene, exp])+"\n") # Termination regionFile.close() stagPeakFile.close() outputActiveFile.close() outputInactiveFile.close()
def subsample(fn, ns=None, paired=False): if ns is None: fn, ns = fn sample = [] count = 0 outdir_base = path.join(path.dirname(fn), 'subset') sf = Samfile(fn) try: i_weight = float(sf.mapped)/max(ns) print("Read out ", i_weight) except ValueError: i_weight = 0.0 for read in sf: i_weight += 1 print("Counted ", i_weight) i_weight /= float(max(ns)) sf = Samfile(fn) if paired: read_2s = {} print(fn, count, i_weight) for i, read in enumerate(sf): key = random()**i_weight if not paired or read.is_read1: if len(sample) < max(ns): heappush(sample, (key, i+count, read)) else: dropped = heappushpop(sample, (key, i+count, read)) if paired: read_2s.pop(dropped[-1].qname, None) elif paired: read_2s[read.qname] = read else: assert ValueError("I don't know how we got here") count += i for n in ns: outdir = outdir_base + '{:04.1f}M'.format(n/1e6) try: makedirs(outdir) except OSError: pass sampN = sorted(sample, reverse=True)[:int(n)] print("Kept {: >12,} of {: >12,} reads".format(len(sampN), count)) print(fn, '->', outdir) stdout.flush() of = Samfile(path.join(outdir, 'accepted_hits.bam'), mode='wb', template=sf) sample.sort(key=lambda heap_item: (heap_item[-1].tid, heap_item[-1].pos)) missing_mates = 0 for key, pos, read in sampN: of.write(read) if paired and read.is_proper_pair: if read.qname not in read_2s: missing_mates += 1 continue of.write(read_2s[read.qname]) of.close() sf.close() print(missing_mates) return [count for key, read, count in sample]
try: bbInter += ((treatInterCount / (len(ttadList) - 1)) / (contrInterCount / (len(ctadList) - 1))) except Exception: pass abIntraDict["AA"].append(aaIntra) abIntraDict["AB"].append(abIntra) abIntraDict["BA"].append(baIntra) abIntraDict["BB"].append(bbIntra) abInterDict["AA"].append(aaInter) abInterDict["AB"].append(abInter) abInterDict["BA"].append(baInter) abInterDict["BB"].append(bbInter) treatTadFile.close() controlTadFile.close() # Writing output maxV = max([max(len(abIntraDict[e]), len(abInterDict[e])) for e in abList]) outputIntraFile = open(outputIntraFileName, "w") outputInterFile = open(outputInterFileName, "w") outputIntraFile.write("\t".join(abList) + "\n") outputInterFile.write("\t".join(abList) + "\n") for i in range(0, maxV): vecIntra = [] vecInter = [] for ab in abList: try: vecIntra.append(str(abIntraDict[ab][i])) except Exception:
def print_reads(reads_to_print, ref_name, header):
    output_name = "{0}_{1}.bam".format(args.output_base, ref_name)
    output_samfile = Samfile(output_name, "wb", header=header)
    for aln in reads_to_print:
        output_samfile.write(aln)
    output_samfile.close()
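# Hypothetical usage of print_reads above (names are placeholders): args.output_base is
# expected to come from the surrounding script's argument parser, and reads_by_ref is
# assumed to map each reference name to a list of pysam aligned reads that share one
# header dict (template_header).
for ref_name, alignments in reads_by_ref.items():
    print_reads(alignments, ref_name, template_header)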
def main(): """ what to do if we execute the module as a script (not intended for user by user) """ parser = ArgumentParser(description=__doc__) parser.add_argument('infile', default=stdin, help='BAM/SAM input file (default: stdin)') parser.add_argument('--gzip', action='store_true', default=False, help='Compress paired-end output files') parser.add_argument('outfile1', help='Output file for first mate / single reads (default: stdout)') parser.add_argument('outfile2', help='(required for paired-end files) output filename for second mate') args = parser.parse_args() context = vars(args) outfile1 = context['outfile1'] outfile2 = context['outfile2'] f = Samfile(context['infile']) incomplete_pairs = [] if context['gzip']: if PATH_TO_GZIP is not None: open_func = gzip_class_factory(PATH_TO_GZIP) fh1 = open_func(outfile1, 'w') fh2 = open_func(outfile2, 'w') else: fh1 = GzipFile(outfile1, 'wb') fh2 = GzipFile(outfile2, 'wb') is_paired = False gzwrite = gzwriter(fh1, fh2) for aread in f: is_paired = False qname = aread.qname for i in xrange(len(incomplete_pairs)): if incomplete_pairs[i].qname == qname: mate_read = incomplete_pairs.pop(i) # figure out order if aread.flag & 0x4 == 0x4: gzwrite(aread, mate_read) else: gzwrite(mate_read, aread) is_paired = True break if not is_paired: incomplete_pairs.append(aread) unpaired = len(incomplete_pairs) out1.close() out2.close() f.close() else: if not exists(outfile1): os.mknod(outfile1) if outfile2 is not None: if not exists(outfile2): os.mknod(outfile2) out1 = os.open(outfile1, os.O_WRONLY|os.O_NONBLOCK) out2 = os.open(outfile2, os.O_WRONLY|os.O_NONBLOCK) is_paired = False write = pair_writer(out1, out2) for aread in f: is_paired = False qname = aread.qname for i in xrange(len(incomplete_pairs)): if incomplete_pairs[i].qname == qname: mate_read = incomplete_pairs.pop(i) # figure out order if aread.flag & 0x4 == 0x4: write(aread, mate_read) else: write(mate_read, aread) is_paired = True break if not is_paired: incomplete_pairs.append(aread) unpaired = len(incomplete_pairs) os.close(out1) os.close(out2) f.close() if not unpaired == 0: raise RuntimeError('%d unpaired reads remaining' % unpaired)
p2 = ll[2] name = ll[3] score = ll[4] strand = ll[5] if (chrom not in chrList): continue bestMotifList = getBestMotifList(ctcfBamFile, [chrom, int(p1), int(p2)]) for motif in bestMotifList: allCtcfMinusPromFile.write("\t".join( [motif[0], motif[1], motif[2], name, motif[4], motif[5]]) + "\n") # Closing files featureFile.close() allCtcfAllPromFile.close() allCtcfPlusPromFile.close() allCtcfMinusPromFile.close() ctcfBamFile.close() # Cat tempCatFileName = tempLoc + "tempCatFileName.bed" command = "cat " + allCtcfPlusPromFileName + " " + allCtcfMinusPromFileName + " > " + tempCatFileName os.system(command) # Sorting tempSortFileName = tempLoc + "tempSortFileName.bed" command = "sort -k1,1 -k2,2n " + tempCatFileName + " > " + tempSortFileName os.system(command) # Merge command = "mergeBed -c 4,5,6 -o first,mean,first -i " + tempSortFileName + " > " + allCtcfAllPromFileName os.system(command)
def estimate_bias_pwm(args): # Parameters max_duplicates = 100 # Initializing bam and fasta bamFile = Samfile(args.reads_file, "rb") genome_data = GenomeData(args.organism) fastaFile = Fastafile(genome_data.get_genome()) regions = GenomicRegionSet("regions") regions.read(args.regions_file) obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb), ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)]) exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb), ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)]) obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb), ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)]) exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb), ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)]) # Iterating on HS regions for region in regions: # Initialization prev_pos = -1 true_counter = 0 # Evaluating observed frequencies # Fetching reads for r in bamFile.fetch(region.chrom, region.initial, region.final): # Calculating positions if not r.is_reverse: cut_site = r.pos + args.forward_shift - 1 p1 = cut_site - int(floor(args.k_nb / 2)) else: cut_site = r.aend + args.reverse_shift + 1 p1 = cut_site - int(floor(args.k_nb / 2)) p2 = p1 + args.k_nb # Verifying PCR artifacts if p1 == prev_pos: true_counter += 1 else: prev_pos = p1 true_counter = 0 if true_counter > max_duplicates: continue # Fetching k-mer try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper() except Exception: continue if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr) # Counting k-mer in dictionary if not r.is_reverse: for i in range(0, len(currStr)): obs_f_pwm_dict[currStr[i]][i] += 1 else: for i in range(0, len(currStr)): obs_r_pwm_dict[currStr[i]][i] += 1 # Evaluating expected frequencies # Fetching whole sequence try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper() except Exception: continue # Iterating on each sequence position s = None for i in range(0, len(currStr) - args.k_nb): # Counting k-mer in dictionary s = currStr[i:i + args.k_nb] for i in range(0, len(s)): exp_f_pwm_dict[s[i]][i] += 1 # Counting k-mer in dictionary for reverse complement s = AuxiliaryFunctions.revcomp(s) for i in range(0, len(s)): exp_r_pwm_dict[s[i]][i] += 1 # Closing files bamFile.close() fastaFile.close() # Output pwms os.system("mkdir -p " + os.path.join(args.output_location, "pfm")) pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict] pwm_file_list = [] pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb))) pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb))) pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb))) pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb))) pwm_file_list.append(pwm_obs_f) pwm_file_list.append(pwm_obs_r) pwm_file_list.append(pwm_exp_f) pwm_file_list.append(pwm_exp_r) for i in range(len(pwm_dict_list)): with open(pwm_file_list[i], "w") as pwm_file: for e in ["A", "C", "G", "T"]: pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n") motif_obs_f = motifs.read(open(pwm_obs_f), "pfm") motif_obs_r = motifs.read(open(pwm_obs_r), "pfm") motif_exp_f = motifs.read(open(pwm_exp_f), "pfm") motif_exp_r = motifs.read(open(pwm_exp_r), "pfm") # Output logos os.system("mkdir -p 
" + os.path.join(args.output_location, "logo")) logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb))) logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb))) logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb))) logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb))) motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic", yaxis_scale=0.2, yaxis_tic_interval=0.1) motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic", yaxis_scale=0.2, yaxis_tic_interval=0.1) motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic", yaxis_scale=0.02, yaxis_tic_interval=0.01) motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic", yaxis_scale=0.02, yaxis_tic_interval=0.01) # Creating bias dictionary alphabet = ["A", "C", "G", "T"] k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)] bias_table_F = dict([(e, 0.0) for e in k_mer_comb]) bias_table_R = dict([(e, 0.0) for e in k_mer_comb]) for k_mer in k_mer_comb: obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb) exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb) bias_table_F[k_mer] = round(obs_f / exp_f, 6) obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb) exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb) bias_table_R[k_mer] = round(obs_r / exp_r, 6) write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def main(argv=None):
    """Main script."""
    ##########################
    # COMMAND-LINE ARGUMENTS #
    ##########################
    # Get myself
    program_name = sys.argv[0]
    if not argv:
        argv = sys.argv[1:]
    # Get the cluster type used to control arguments
    cluster_type = cluster.get_cluster_environment()
    parser = argparse.ArgumentParser(
        description=__doc__, add_help=False, epilog=EPILOG,
        formatter_class=run.CustomFormatter)
    req = parser.add_argument_group('Required arguments')
    req.add_argument('-m', '--mode', help='Operation mode',
                     choices=['single', 'multi'], required=True, metavar='mode')
    req.add_argument('-s', '--snps', help='SNP BED file', required=True,
                     metavar='<BED>')
    req.add_argument('-r', '--reads', help='Mapped reads file [sam or bam]',
                     required=True, metavar='<[S/B]AM>')
    uni = parser.add_argument_group('Universal optional arguments')
    uni.add_argument('-p', '--prefix', help='Prefix for temp files and output',
                     default='TEST', metavar='')
    uni.add_argument('-b', '--bam', action='store_true', dest='bam',
                     help='Mapped read file type is bam (auto-detected if *.bam)')
    uni.add_argument('-n', '--noclean', action='store_true',
                     help='Do not delete intermediate files (for debugging)')
    uni.add_argument('-R', '--random-seed', default=None, type=int,
                     help='Set the state of the randomizer (for testing)')
    uni.add_argument('-h', '--help', action='help',
                     help='show this help message and exit')
    mult = parser.add_argument_group('Multi(plex) mode arguments')
    mult.add_argument('-j', '--jobs', type=int, help='Divide into # of jobs',
                      default=100, metavar='')
    if cluster_type == 'slurm' or cluster_type == 'torque':
        mult.add_argument('-w', '--walltime', help='Walltime for each job',
                          default='3:00:00', metavar='')
        mult.add_argument('-k', '--mem', dest='memory', metavar='',
                          help='Memory for each job', default='5000MB')
        mult.add_argument('--queue', help='Queue to submit jobs to',
                          default='batch', metavar='')
    mult.add_argument('--cluster', choices=['torque', 'slurm', 'normal'],
                      help='Which cluster to use, normal uses threads on this machine',
                      default=cluster_type)
    mult.add_argument('--threads', type=int, metavar='', default=cpu_count(),
                      help='Max number of threads to run at a time (normal mode only).')
    single = parser.add_argument_group('Single mode arguments')
    single.add_argument('-f', '--suffix', default='', metavar='',
                        help='Suffix for multiplexing [set automatically]')
    logging = parser.add_argument_group('Logging options')
    logging.add_argument('-q', '--quiet', action='store_true',
                         help="Quiet mode, only prints warnings.")
    logging.add_argument('-v', '--verbose', action='store_true',
                         help="Verbose mode, prints debug info too.")
    logging.add_argument('--logfile',
                         help='Logfile to write messages to, default is STDERR')
    args = parser.parse_args()

    if args.random_seed is not None:
        random.seed(args.random_seed)
        print("Seed: ", args.random_seed, random.getstate()[1][:10])

    ###########################################################################
    #                            File Preparations                            #
    ###########################################################################
    # Take care of logging
    if args.logfile:
        logme.LOGFILE = args.logfile
    if args.quiet:
        logme.MIN_LEVEL = 'warn'
    elif args.verbose:
        logme.MIN_LEVEL = 'debug'
    # Initialize variables
    prefix = args.prefix + '_'
    # Make sure we can run ourselves
    if not run.is_exe(program_name):
        program_name = run.which(parser.prog)
    # Set the cluster type if we are in multi mode
    if args.mode == 'multi' and (cluster_type == 'slurm' or cluster_type == 'torque'):
        cluster.QUEUE = args.cluster
    # Check if the read file is sam or bam
    file_check = args.reads.split('.')
    file_check[-1] = file_check[-1].lower()
    sam_path, sam_file = os.path.split(args.reads)
    if args.reads.endswith('bam') or args.bam:
        mode = 'rb'
    else:
        mode = 'r'

    ##################
    # MULTIPLEX MODE #
    ##################
    # If we're running in multiplex mode
    if args.mode == 'multi':
        logme.log('Splitting sam file {} into {} files.'.format(sam_file, args.jobs))
        reads_files = split_samfile(os.path.join(sam_path, sam_file), args.jobs, prefix)
        logme.log('Splitting complete.')
        # Create PBS scripts and submit jobs to the cluster
        subnoclean = ' --noclean' if args.noclean else ''
        logme.log('Submitting split files to cluster')
        jobs = []  # Hold job info for later checking
        for reads_file in reads_files:
            suffix = reads_file[-4:]
            command = ("python2 " + program_name + " --mode single --snps " +
                       args.snps + " --reads " + reads_file + " --suffix " +
                       suffix + " --prefix " + args.prefix + subnoclean + ' --bam')
            if cluster_type == 'normal':
                jobs.append(cluster.submit(command, name=prefix + suffix,
                                           threads=args.threads))
            else:
                jobs.append(cluster.submit(command, name=prefix + suffix,
                                           time=args.walltime, cores=1,
                                           mem=args.memory, partition=args.queue))
            sleep(2)  # Pause for two seconds to make sure the job is submitted
        # Now wait and check for all jobs to complete every so long
        logme.log('Submission done, waiting for jobs to complete.')
        # First wait for jobs in the queue to complete
        cluster.wait(jobs)
        sleep(1)
        # Next, check if any jobs failed
        failed = []
        for i in range(1, args.jobs + 1):
            suffix = str(i).zfill(4)
            if not os.path.isfile(prefix + suffix + '_done'):
                failed.append(prefix + suffix)
        # If any jobs failed, terminate
        if failed:
            logme.log('Some jobs failed!', 'critical')
            return -1
        logme.log('Jobs completed.')
        # Remove 'done' files in case we want to run again.
        os.system('rm {prefix}*_done'.format(prefix=prefix))
        # Once the jobs are done, concatenate all of the counts into one file.
        # Initialize dictionaries
        tot_pos_counts = {}
        tot_neg_counts = {}
        tot_tot_counts = {}
        tot_sum_pos = {}
        tot_sum_neg = {}
        for i in range(1, args.jobs + 1):
            suffix = str(i).zfill(4)
            in_counts = prefix + 'SNP_COUNTS_' + suffix
            # Parse each line and add it to the totals
            with run.open_zipped(in_counts, 'r') as in_counts:
                for line in in_counts:
                    line = line.rstrip('\n')
                    line_t = line.split('\t')
                    if 'CHR' in line:
                        continue
                    pos = line_t[0] + '|' + line_t[1]
                    pos_split = line_t[2].split('|')
                    neg_split = line_t[3].split('|')
                    # Initialize this position on first sight, then accumulate
                    if pos not in tot_pos_counts:
                        tot_pos_counts[pos] = [0, 0, 0, 0]
                        tot_neg_counts[pos] = [0, 0, 0, 0]
                        tot_tot_counts[pos] = 0
                        tot_sum_pos[pos] = 0
                        tot_sum_neg[pos] = 0
                    for j in range(len(pos_split)):
                        tot_pos_counts[pos][j] += int(pos_split[j])
                        tot_neg_counts[pos][j] += int(neg_split[j])
                    tot_sum_pos[pos] += int(line_t[4])
                    tot_sum_neg[pos] += int(line_t[5])
                    tot_tot_counts[pos] += int(line_t[6])
        # Write out the final concatenated file
        with run.open_zipped(prefix + 'SNP_COUNTS.txt', 'w') as final_counts:
            final_counts.write('CHR\tPOSITION\tPOS_A|C|G|T\tNEG_A|C|G|T\t' +
                               'SUM_POS_READS\tSUM_NEG_READS\tSUM_READS\n')
            keys = sorted(tot_pos_counts.keys())
            for key in keys:
                pos = key.split('|')
                pos_fix = [str(x) for x in tot_pos_counts[key]]
                neg_fix = [str(x) for x in tot_neg_counts[key]]
                pos_out = '|'.join(pos_fix)
                neg_out = '|'.join(neg_fix)
                final_counts.write(str(pos[0]) + '\t' + str(pos[1]) + '\t' +
                                   pos_out + '\t' + neg_out + '\t' +
                                   str(tot_sum_pos[key]) + '\t' +
                                   str(tot_sum_neg[key]) + '\t' +
                                   str(tot_tot_counts[key]) + '\n')
        # Sort the file numerically
        os.system('sort -k1,2 -n ' + prefix + 'SNP_COUNTS.txt ' +
                  ' -o ' + prefix + 'SNP_COUNTS.txt')
        # Clean up intermediate files.
        if args.noclean is False:
            cluster.clean()
            os.system('rm {prefix}*COUNTS_* {prefix}*split_sam_*'.format(prefix=prefix))

    ###############
    # SINGLE MODE #
    ###############
    # If we're running in single mode (each job submitted by multiplex mode
    # will be running in single mode)
    elif args.mode == 'single':
        # First read in the information on the SNPs that we're interested in.
        snps = {}  # Initialize a dictionary of SNP positions
        with run.open_zipped(args.snps) as snp_file:
            for line in snp_file:
                line = line.rstrip('\n')
                line_t = line.split('\t')
                pos = chrom_to_num(line_t[0]) + '|' + str(line_t[2])
                snps[pos] = line_t[3]
        # This is the dictionary of potential SNPs for each read.
        potsnp_dict = {}
        # Now parse the SAM file to extract only reads overlapping SNPs.
        in_sam = Samfile(args.reads, mode)
        references = in_sam.references  # Faster to make a copy of references.
        # Trackers to count how many reads are lost at each step
        indel_skip = 0
        nosnp_skip = 0
        count = 0
        snp_count = 0
        ryo_filter = 0
        for line in in_sam:
            count += 1
            # Skip lines that overlap indels OR don't match Ns
            cigarstring = line.cigarstring
            if 'D' in cigarstring or 'I' in cigarstring:
                indel_skip += 1
                continue
            # Split the tags to find the MD tag:
            tags = line.tags
            for tagname, tagval in tags:
                if tagname == 'MD' and 'N' in tagval:
                    # Remember that, for now, we're not allowing reads that
                    # overlap insertions/deletions.
                    chrom = references[line.rname]
                    pos = line.pos
                    read = line.seq
                    # We're assuming correct mapping such that FIRST MATES on
                    # the NEGATIVE STRAND are NEGATIVE, while SECOND MATES on
                    # the NEGATIVE STRAND are POSITIVE.
                    if line.is_reverse:
                        orientation = '-'
                    else:
                        orientation = '+'
                    # Parse the CIGAR string
                    cigar_types, cigar_vals = split_CIGAR(cigarstring)
                    if cigar_types[0] == 'S':
                        MD_start = int(cigar_vals[0])
                    else:
                        MD_start = 0
                    # Get the genomic positions corresponding to each base-pair
                    # of the read
                    read_genomic_positions = CIGAR_to_Genomic_Positions(
                        cigar_types, cigar_vals, line.pos + 1)
                    # Get the tag data
                    MD_split = re.findall('\d+|\D+', tagval)
                    genome_start = 0
                    # The snp_pos dictionary will store the 1-based position => allele
                    snp_pos = {}
                    for i in MD_split:
                        if re.match('\^', i):
                            # Deletion token; indel reads were skipped above
                            pass
                        elif i.isalpha():
                            if i == 'N':
                                snp_pos[read_genomic_positions[genome_start]] = read[MD_start]
                                MD_start += 1
                                genome_start += 1
                            else:
                                MD_start += 1
                                genome_start += 1
                        else:
                            MD_start += int(i)
                            genome_start += int(i)
                    for i in snp_pos:
                        snp_count += 1
                        # RYO: START EDIT - Implemented Filter
                        posVal = line.reference_name + '|' + str(i)
                        if posVal not in snps:
                            nosnp_skip += 1
                            continue
                        # RYO: END EDIT - Implemented Filter
                        snp = '{chr}|{i}\t{snp_pos}\t{orientation}'.format(
                            chr=chrom, i=i, snp_pos=snp_pos[i],
                            orientation=orientation)
                        if line.qname in potsnp_dict:
                            if snp not in potsnp_dict[line.qname]:
                                # RYO EDIT HERE - added conditional so that
                                # pairs of reads are not considered twice if
                                # they both overlap the same SNP.
                                potsnp_dict[line.qname].append(snp)
                            else:
                                ryo_filter += 1
                        else:
                            potsnp_dict[line.qname] = []
                            potsnp_dict[line.qname].append(snp)
        in_sam.close()

        # Log all of the skipped reads
        logme.log('Total reads: {}'.format(count), 'debug')
        logme.log('Reads skipped for indels: {}'.format(indel_skip), 'debug')
        logme.log('Total SNPs checked: {}'.format(snp_count), 'debug')
        logme.log('SNPs not in SNP list: {}'.format(nosnp_skip), 'debug')
        logme.log('Ryo filter: {}'.format(ryo_filter), 'debug')

        # Initialize the counting dictionaries
        pos_counts = {}
        neg_counts = {}
        # Go through the potential SNP dictionary and choose one SNP at random
        # for those overlapping multiple SNPs
        if args.random_seed is not None:
            # Dictionaries are unordered, so we must sort for consistent output
            # when a random seed is set.
            keys = sorted(list(potsnp_dict.keys()))
        else:
            # Because sorting is slow, only do it if a random seed is set; the
            # slowdown is about 0.1 s per 1 million reads.
            keys = list(potsnp_dict.keys())
        base_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        for key in keys:
            snp = random.choice(potsnp_dict[key]).split('\t')
            if snp[0] in snps:
                if snp[0] not in pos_counts:
                    pos_counts[snp[0]] = [0, 0, 0, 0]
                    neg_counts[snp[0]] = [0, 0, 0, 0]
                if snp[1] in base_index:
                    if snp[2] == '+':
                        pos_counts[snp[0]][base_index[snp[1]]] += 1
                    elif snp[2] == '-':
                        neg_counts[snp[0]][base_index[snp[1]]] += 1

        # Open the output file and write the SNP counts to it
        out_counts = prefix + 'SNP_COUNTS_' + args.suffix if args.suffix \
            else prefix + 'SNP_COUNTS.txt'
        with open(out_counts, 'w') as out_counts:
            # Write header
            out_counts.write('CHR\tPOSITION\tPOS_A|C|G|T\tNEG_A|C|G|T\t' +
                             'SUM_POS_READS\tSUM_NEG_READS\tSUM_READS\n')
            # Sort SNP positions and write them
            keys = sorted(pos_counts.keys())
            for key in keys:
                pos = key.split('|')
                sum_pos = sum(pos_counts[key])
                sum_neg = sum(neg_counts[key])
                tot_sum = sum(pos_counts[key]) + sum(neg_counts[key])
                pos_fix = [str(x) for x in pos_counts[key]]
                neg_fix = [str(x) for x in neg_counts[key]]
                positive = '|'.join(pos_fix)
                negative = '|'.join(neg_fix)
                out_counts.write(pos[0] + '\t' + pos[1] + '\t' + positive + '\t' +
                                 negative + '\t' + str(sum_pos) + '\t' +
                                 str(sum_neg) + '\t' + str(tot_sum) + '\n')
        if args.suffix:
            os.system('touch ' + prefix + args.suffix + '_done')
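# The single-mode loop above walks the MD tag with re.findall('\d+|\D+', ...)
# to find read positions whose (SNP-masked) reference base is 'N'. A minimal
# standalone sketch of that idea, separate from the script's own helpers; the
# function name, arguments, and example MD string are illustrative only, and
# reads containing indels are assumed to have been filtered out beforehand:
import re

def n_positions_from_md(md, read_seq, read_start_offset=0):
    """Return {read offset: read base} for MD positions whose reference is 'N'."""
    read_pos = read_start_offset  # offset into the read (after soft-clips)
    hits = {}
    for token in re.findall(r'\d+|\D+', md):
        if token.startswith('^'):      # deletion token; indel reads assumed filtered
            continue
        elif token.isalpha():          # a single mismatched reference base
            if token == 'N':
                hits[read_pos] = read_seq[read_pos]
            read_pos += 1
        else:                          # a run of matching bases
            read_pos += int(token)
    return hits

# Example: MD '5N24' marks a masked reference base after 5 matches.
print(n_positions_from_md('5N24', 'ACGTTAACGT' * 3))  # {5: 'A'}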
def main():
    parser = OptionParser(usage=usage)
    # parser.add_option("-s", action="store_true", dest="sam_input", default=False,
    #                   help="Input is in SAM format instead of BAM format")
    (options, args) = parser.parse_args()
    if len(args) != 4:
        parser.print_help()
        sys.exit(1)
    psl_filename = args[0]
    ref_filename = args[1]
    contigs_filename = args[2]
    bam_filename = args[3]
    liftover_dir = args[1]
    references, ref_chromosomes = read_fasta(ref_filename)
    refname_to_id = dict([(name, i) for i, name in enumerate(ref_chromosomes)])
    print('Read', len(ref_chromosomes), 'reference chromosomes:',
          ','.join(ref_chromosomes), file=sys.stderr)
    contigs, contig_names = read_fasta(contigs_filename)
    print('Read', len(contig_names), 'contigs.', file=sys.stderr)
    bam_header = {
        'HD': {'VN': '1.0'},
        'SQ': [dict([('LN', len(references[chromosome])), ('SN', chromosome)])
               for chromosome in ref_chromosomes]
    }
    outfile = Samfile(bam_filename, 'wb', header=bam_header)
    line_nr = 0
    header_read = False
    for line in (s.strip() for s in open(psl_filename)):
        line_nr += 1
        if line.startswith('------'):
            header_read = True
            continue
        if not header_read:
            continue
        fields = line.split()
        assert len(fields) == 21, 'Error reading PSL file, offending line: %d' % line_nr
        sizes = [int(x) for x in fields[18].strip(',').split(',')]
        contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
        ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
        assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
        strand = fields[8]
        contig_name = fields[9]
        ref_name = fields[13]
        assert strand in ['-', '+']
        assert contig_name in contigs
        assert ref_name in references
        a = AlignedRead()
        a.qname = contig_name
        if strand == '+':
            a.seq = str(contigs[contig_name])
        else:
            a.seq = str(contigs[contig_name].reverse_complement())
        a.flag = (16 if strand == '+' else 0)
        a.rname = refname_to_id[ref_name]
        a.pos = ref_starts[0]
        a.mapq = 255
        qpos = contig_starts[0]
        refpos = ref_starts[0]
        cigar = []
        # soft-clipping at the start?
        if contig_starts[0] > 0:
            cigar.append((4, contig_starts[0]))
        longest_insertion = 0
        longest_deletion = 0
        total_matches = 0
        total_insertion = 0
        total_deletion = 0
        for length, contig_start, ref_start in zip(sizes, contig_starts, ref_starts):
            assert contig_start >= qpos
            assert ref_start >= refpos
            # insertion?
            if contig_start > qpos:
                insertion_length = contig_start - qpos
                longest_insertion = max(longest_insertion, insertion_length)
                total_insertion += insertion_length
                append_to_cigar(cigar, 1, insertion_length)
                qpos = contig_start
            # deletion?
            if ref_start > refpos:
                deletion_length = ref_start - refpos
                longest_deletion = max(longest_deletion, deletion_length)
                total_deletion += deletion_length
                append_to_cigar(cigar, 2, deletion_length)
                refpos = ref_start
            # stretch of matches/mismatches
            append_to_cigar(cigar, 0, length)
            refpos += length
            qpos += length
            total_matches += length
        # soft-clipping at the end?
        if len(a.seq) > qpos:
            cigar.append((4, len(a.seq) - qpos))
        a.cigar = tuple(cigar)
        # only use contigs where the longest deletion is <= 10000 bp
        if longest_deletion > 10000:
            continue
        # require at least 200 matching positions
        if total_matches < 200:
            continue
        # require the matching positions to make up at least 75 percent of the
        # contig (without counting parts of the contig that are insertions)
        if float(total_matches) / (len(a.seq) - total_insertion) < 0.75:
            continue
        outfile.write(a)
    outfile.close()
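# The converter above relies on an append_to_cigar() helper that is not shown
# in this snippet. A minimal sketch, assuming its job is to extend the last
# CIGAR entry when the new operation has the same code rather than emitting
# adjacent duplicate operations; the implementation details are a guess:
def append_to_cigar(cigar, op, length):
    """Append (op, length) to a CIGAR list, merging with the last entry
    when the operation code matches."""
    if cigar and cigar[-1][0] == op:
        cigar[-1] = (op, cigar[-1][1] + length)
    else:
        cigar.append((op, length))

# Example: consecutive match stretches collapse into a single 0 (M) operation.
c = [(4, 5)]
append_to_cigar(c, 0, 30)
append_to_cigar(c, 0, 20)
print(c)  # [(4, 5), (0, 50)]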
        if(counter == len(resVec)):
            break

    # Overlap and writing output
    if(maxValue != globalMin):
        # t1 = MPBS
        t1 = [0, 0]
        t2Write = [0, 0]
        if(maxPos >= 0):
            t1 = [p1 + maxPos, p1 + maxPos + maxMotifLen]
        else:
            t1 = [p1 - maxPos, p1 - maxPos + maxMotifLen]
        maxOverlap = 0
        for f in footprints:
            # t2 = footprint
            t2 = [f.pos, f.aend]
            overlapN = overlap(t1, t2)
            if(overlapN > maxOverlap):
                maxOverlap = overlapN
                t2Write[0] = t2[0]
                t2Write[1] = t2[1]
        resVec = [maxValue, t1[0], t1[1], maxOverlap, t2Write[0], t2Write[1]]
        writeOutput(ll, regionTagCount, resVec, outFile)
    else:
        writeOutput(ll, regionTagCount, resVec, outFile)

# Termination
bedFile.close()
outFile.close()
genomeFile.close()
dnaseBam.close()
fpBam.close()
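# The fragment above calls an overlap() helper that is not shown here. A
# minimal sketch, assuming t1 and t2 are [start, end) intervals and the
# return value is the number of overlapping bases (zero or negative meaning
# no overlap), which is consistent with the "overlapN > maxOverlap" test:
def overlap(t1, t2):
    """Length of the intersection of two [start, end) intervals."""
    return min(t1[1], t2[1]) - max(t1[0], t2[0])

print(overlap([10, 50], [40, 90]))  # 10
print(overlap([10, 20], [30, 40]))  # -10 (disjoint)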
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None,
                 via='local', logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow"""
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom, {}) for chrom in ref_genome.keys())  # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile(headerfile, "wh", header=header)
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex, runs)
            index_bam(ex, _b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom, ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf, pileup.nonblocking(ex, bams[gid], ref,
                                                        header=headerfile,
                                                        via=via, stdout=vcf))
        logfile.write("  ...Group %s running.\n" % job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom, ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" % job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom, v in vcfs.iteritems():
        for gid, vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'], chrom))
    tarfh.close()
    ex.add(tarname, description=set_file_descr("vcfs_files.tar.gz", step="pileup",
                                               type="tar", view='admin'))

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall, "w") as fout:
        fout.write('#' + '\t'.join(['chromosome', 'position', 'reference'] + sample_names +
                                   ['gene', 'location_type', 'distance']) + '\n')
    with open(outexons, "w") as fout:
        fout.write('#' + '\t'.join(['chromosome', 'position', 'reference'] + sample_names +
                                   ['exon', 'strand', 'ref_aa'] +
                                   ['new_aa_' + s for s in sample_names]) + '\n')
    msa_table = dict((s, '') for s in [assembly.name] + sample_names)
    for chrom, v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex, chrom, vcfs[chrom], bams, outall, assembly,
                           sample_names, mincov, float(minsnp), logfile, debugfile)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom, outexons, allsnps, assembly, sample_names, ref_genome,
                  logfile, debugfile)
        for snprow in allsnps:
            for n, k in enumerate([assembly.name] + sample_names):
                msa_table[k] += snprow[3 + n][0]
    description = set_file_descr("allSNP.txt", step="SNPs", type="txt")
    ex.add(outall, description=description)
    description = set_file_descr("exonsSNP.txt", step="SNPs", type="txt")
    ex.add(outexons, description=description)
    msafile = unique_filename_in()
    with open(msafile, "w") as msa:
        msa.write(" %i %i\n" % (len(msa_table), len(msa_table.values()[0])))
        for name, seq in msa_table.iteritems():
            msa.write("%s\t%s\n" % (name, seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt", step="SNPs", type="txt")
    ex.add(msafile, description=description)

    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex, outall, sample_names, assembly)

    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([], [], [])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos:
                continue
            if position >= endpos:
                break
            coverage = pileupcolumn.n
            ref_symbol = seq[position - startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0, 0, 0, 0, 0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                quality += ord(pileupread.alignment.qual[pileupread.qpos]) - 33
            quality = float(quality) / coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0:
                vectors[0].append((position, position + 1, coverage))
            if info > 0:
                vectors[1].append((position, position + 1, info))
            if quality > 0:
                vectors[2].append((position, position + 1, quality))
            # yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs', False):
        _descr = {'groupId': 0, 'step': "tracks", 'type': "bigWig", 'ucsc': '1'}
        for gid, bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile, format="bam")
            covname = unique_filename_in() + ".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in() + ".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in() + ".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                # process fasta and bam by 10 Mb chunks
                for chunk in range(0, cinfo["length"], 10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk + 10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk + 10**7),
                                           fastaseq, chunk, chunk + 10**7)
                    out_cov.write(vecs[0], fields=['start', 'end', 'score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start', 'end', 'score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start', 'end', 'score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name'] + "_coverage.bw", **_descr)
            ex.add(covname, description=description)
            description = set_file_descr(job.groups[gid]['name'] + "_heterozygosity.bw", **_descr)
            ex.add(hetname, description=description)
            description = set_file_descr(job.groups[gid]['name'] + "_quality.bw", **_descr)
            ex.add(qualname, description=description)
    return 0
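# _process_pileup() above calls heterozygosity(ref, counts), which is not
# defined in this snippet. A minimal sketch, assuming it returns the expected
# heterozygosity (1 minus the sum of squared allele frequencies) of the
# observed A/C/G/T counts; the real workflow may use the reference base
# differently, so the `ref` argument is accepted but unused here:
def heterozygosity(ref, counts):
    """Expected heterozygosity of a pileup column given [A, C, G, T] counts."""
    total = float(sum(counts))
    if total == 0:
        return 0.0
    freqs = [c / total for c in counts]
    return 1.0 - sum(f * f for f in freqs)

print(heterozygosity(0, [5, 5, 0, 0]))  # 0.5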
    outputFile4.write("\t".join(header4) + "\n")
    for i in range(0, maxV):
        vec = []
        for j in range(0, len(vectorTable4)):
            try:
                vec.append(vectorTable4[j][i])
            except Exception:
                vec.append("NA")
            try:
                vec.append(vectorTable5[j][i])
            except Exception:
                vec.append("NA")
        outputFile4.write("\t".join(vec) + "\n")

    stagFile.close()
    outputFile1.close()
    outputFile2.close()
    outputFile3.close()
    outputFile4.close()
    genomeFile.close()
    regionsFile.close()
    # chrommHmmFile.close()
    enhancersFile.close()
    [e.close() for e in signalFileList]
    [e.close() for e in controlFileList]
    [e.close() for e in motifFileList]

    # Removing all temporary files
    command = "rm -rf " + tempLocation
    os.system(command)
samfile = Samfile(args.path)
for segment in samfile.fetch(until_eof=True):
    num = segment.query_name.split("|")[0]
    for etype, eset in errors.iteritems():
        if num in eset:
            errors2segments[etype][num].append(segment)
            break

additional = defaultdict(list)
for fname in args.additional:
    tsamfile = Samfile(fname)
    for segment in tsamfile.fetch(until_eof=True):
        num = segment.query_name.split("|")[0]
        additional[num].append(ArWrapper(segment, tsamfile.getrname(segment.tid)))
    tsamfile.close()

for etype, d in errors2segments.iteritems():
    with open(os.path.join(args.outdir, "%s_%s_error.txt" % etype), 'w') as f:
        for num, segments in d.iteritems():
            if segments[0].is_reverse:
                seq = reverse_complement(segments[0].seq)
            else:
                seq = segments[0].seq
            f.write("%s\nnumber of read:\t%s\n\nSequence:\t%s\n\nSegments:\n\n"
                    % ("_" * 140, num, seq))
            for segment in segments:
                f.write("%s\t%s\t%d\t%s\t%d\t%s\n\n"
                        % (segment.query_name.split("|")[2],
                           samfile.getrname(segment.tid),
                           segment.reference_start,
                           segment.cigarstring,
                           segment.get_tag("AS"),
                           segment.query_name))
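# ArWrapper is used above but not defined in this snippet. A minimal sketch,
# assuming it simply pairs an aligned segment with the name of the reference
# it mapped to, so downstream code does not need the originating Samfile to
# resolve segment.tid; the attribute names are illustrative:
class ArWrapper(object):
    """Aligned-read wrapper holding the segment and its reference name."""

    def __init__(self, segment, rname):
        self.segment = segment
        self.rname = rname
        self.qname = segment.query_name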