def write_fasta(loc, tmpdir=TMPDIR):
    """Write the sequence described by `loc` to a FASTA file; return its path.

    The file is named ``<tmpdir>/<seqid>_<start>_<end>.fasta``. When a file
    with that name already exists it is reused untouched and its path is
    returned immediately.

    Parameters
    ----------
    loc : object exposing ``org``, ``seqid``, ``start``, ``end`` and ``rc``.
        ``org`` and ``seqid`` are used as keys into the module-level
        ``fastas`` mapping. ``start``/``end`` may be given in either order.
        A truthy ``rc`` requests the reverse complement.
    tmpdir : directory the file is written to.

    Returns
    -------
    The path of the FASTA file (also used as the record header).
    """
    fasta = fastas[loc.org]
    seq = fasta[loc.seqid]

    fname = "%s/%s_%i_%i.fasta" % (tmpdir, loc.seqid, loc.start, loc.end)
    if op.exists(fname):
        return fname

    # The slice below treats the coordinates as 1-based and inclusive.
    start, end = sorted((loc.start, loc.end))
    seq = seq[start - 1:end]
    if loc.rc:
        # complement() followed by a reversal yields the reverse complement.
        seq = complement(seq)[::-1]

    # Use a context manager so the data is flushed and the handle closed
    # before the path is handed back to the caller.
    with open(fname, "w") as fh:
        fh.write(">%s\n%s\n" % (fname, seq))
    return fname
def main(reads, tags, fmt):
    """Strip a leading tag from each read and print the records to stdout.

    Records come from ``gen_record_from_raw`` when `fmt` is ``"raw"`` and
    from ``gen_record_from_fastq`` otherwise. ``record[1]`` is the sequence;
    for records with more than two fields ``record[3]`` is treated as the
    per-base string that must stay aligned with the sequence
    (presumably the FASTQ quality line -- confirm against the generators).

    Parameters
    ----------
    reads : passed straight to the record generator.
    tags : pair of prefixes, or a falsy value to print records unchanged.
        A read starting with ``tags[0]`` only has the tag removed. A read
        starting with ``tags[1]`` has the tag removed and is then
        reverse-complemented. Any other read is left as-is and a warning is
        written to stderr.
    fmt : ``"raw"`` selects the raw reader; anything else selects fastq.
    """
    gen_record = gen_record_from_raw if fmt == "raw" else gen_record_from_fastq
    for record in gen_record(reads):
        # TODO: support case where there are no tags and just want to add
        # both normal and rc read to new file.
        if tags:
            seq = record[1]
            flipped = False
            if seq.startswith(tags[0]):
                seq = seq[len(tags[0]):]
            elif seq.startswith(tags[1]):
                seq = complement(seq[len(tags[1]):])[::-1]
                flipped = True
            else:
                # Interpolate the read so the warning names the offender
                # (the placeholder used to be printed literally).
                sys.stderr.write(
                    "warning:%s does not start with specified tags!\n" % seq)
            record[1] = seq

            if len(record) > 2:
                # Keep only the positions that survive tag stripping. The
                # explicit empty case avoids [-0:], which would keep the
                # entire string.
                qual = record[3][-len(seq):] if seq else ""
                # Reverse only when the sequence itself was reversed, so the
                # two lines stay position-aligned.
                record[3] = qual[::-1] if flipped else qual

        sys.stdout.write("\n".join(record) + "\n")
def calc_nuc_counts(pos_signal_bedtool, neg_signal_bedtool, fasta_filename,
                    revcomp_strand, min_counts, offset_min, offset_max,
                    region_size, ignore_chroms, only_chroms, verbose):
    ''' tally the sequences found at each offset around every signal site.

    Rows of the positive-strand bedtool are processed first, then rows of
    the negative-strand bedtool.  A row is skipped when its chrom is in
    `ignore_chroms`, when `only_chroms` is non-empty and lacks its chrom,
    or when its count is below `min_counts`.

    Returns a tuple (total_sites, nuc_counts): the number of rows that
    passed the filters, and a mapping offset -> Counter of
    sequence -> summed row counts.
    '''
    if verbose:
        report = "".join([
            ">> analyzing sequences ...\n",
            ">> ignore:%s only:%s\n" % (str(ignore_chroms),
                                        str(only_chroms)),
            ">> offset range: %d to %d\n" % (offset_min, offset_max),
            ">> region size: %d\n" % (region_size),
            ">> revcomp strand: %s\n" % str(revcomp_strand),
        ])
        print >>sys.stderr, report

    seq_fasta = Fasta(fasta_filename)
    nuc_counts = defaultdict(Counter)

    # number of rows that survive every filter below
    total_sites = 0

    stranded_signals = ((pos_signal_bedtool, '+'),
                        (neg_signal_bedtool, '-'))

    for bedtool, strand in stranded_signals:
        on_plus = strand == '+'

        # 1. libs where the captured strand is sequenced are the correct
        #    polarity as-is (i.e. Excision-seq libs)
        # 2. libs where the *copy* of the captured strand is sequenced
        #    should be revcomplemented (i.e. circularization-based libs)
        if on_plus:
            flip = bool(revcomp_strand)
        else:
            flip = not revcomp_strand

        for row in bedtool:
            # chromosome filters
            if row.chrom in ignore_chroms:
                continue
            if only_chroms and row.chrom not in only_chroms:
                continue

            # coverage filter
            if row.count < min_counts:
                continue

            # counted only once all of the checks above have passed
            total_sites += 1

            for offset in range(offset_min, offset_max + 1):
                # upstream offsets are negative values
                if on_plus:
                    start = row.start + offset
                else:
                    start = row.start - offset

                if region_size != 1 and on_plus:
                    # the 3' most position of the region must be the
                    # base of interest, so the window ends just past it
                    end = start + 1
                    start = end - region_size
                else:
                    # single base (half open), or negative strand
                    end = start + region_size

                # windows that run off the start of the sequence
                if start < 0:
                    continue

                nucs = seq_fasta[row.chrom][start:end]
                if flip:
                    nucs = complement(nucs[::-1])

                # drop windows that came back short
                if len(nucs.strip()) != region_size:
                    continue

                nuc_counts[offset][nucs] += row.count

    return total_sites, nuc_counts
def _window_for_offset(position, offset, strand, region_size):
    ''' return the (start, end) slice bounds for one site and offset '''
    # upstream offsets are negative values
    if strand == '+':
        anchor = position + offset
    elif strand == '-':
        anchor = position - offset

    if region_size == 1:
        # half open at the position of interest
        return anchor, anchor + region_size

    if strand == '+':
        # the 3' most position in the region is the base of interest;
        # + 1 includes that position
        stop = anchor + 1
        return stop - region_size, stop

    # negative strand
    return anchor, anchor + region_size


def calc_nuc_counts(pos_signal_bedtool, neg_signal_bedtool, fasta_filename,
                    revcomp_strand, min_counts, offset_min, offset_max,
                    region_size, ignore_chroms, only_chroms, verbose):
    ''' count the sequences observed at every offset from each signal site.

    Both bedtools are walked ('+' first, then '-').  Rows on ignored
    chroms, rows outside a non-empty `only_chroms`, and rows with a count
    below `min_counts` are not examined.

    Returns (total_sites, nuc_counts) where total_sites is the number of
    rows examined and nuc_counts maps each offset to a Counter keyed by
    sequence and weighted by the row counts.
    '''
    if verbose:
        print >> sys.stderr, (
            ">> analyzing sequences ...\n"
            ">> ignore:%s only:%s\n"
            ">> offset range: %d to %d\n"
            ">> region size: %d\n"
            ">> revcomp strand: %s\n"
        ) % (str(ignore_chroms), str(only_chroms),
             offset_min, offset_max,
             region_size,
             str(revcomp_strand))

    seq_fasta = Fasta(fasta_filename)
    nuc_counts = defaultdict(Counter)

    # total number of sites examined
    total_sites = 0

    for strand in ('+', '-'):
        if strand == '+':
            bedtool = pos_signal_bedtool
        else:
            bedtool = neg_signal_bedtool

        # 1. libs where the captured strand is sequenced are the correct
        #    polarity as-is (i.e. Excision-seq libs)
        # 2. libs where the *copy* of the captured strand is sequenced
        #    should be revcomplemented (i.e. circularization-based libs)
        needs_revcomp = (strand == '+') == bool(revcomp_strand)

        for row in bedtool:
            # skip data based on chromosomes, or if counts are too low
            unwanted = (row.chrom in ignore_chroms
                        or (only_chroms and row.chrom not in only_chroms)
                        or row.count < min_counts)
            if unwanted:
                continue

            # must come after all of the checks above
            total_sites += 1

            for offset in range(offset_min, offset_max + 1):
                start, end = _window_for_offset(row.start, offset,
                                                strand, region_size)

                # region would begin before the start of the sequence
                if start < 0:
                    continue

                nucs = seq_fasta[row.chrom][start:end]
                if needs_revcomp:
                    nucs = complement(nucs[::-1])

                if len(nucs.strip()) == region_size:
                    nuc_counts[offset][nucs] += row.count

    return total_sites, nuc_counts