예제 #1
0
파일: index.py 프로젝트: brentp/find_cns
def write_fasta(loc, tmpdir=TMPDIR):
    """Write the sequence described by `loc` to a FASTA file; return its path.

    The file is named ``<tmpdir>/<seqid>_<start>_<end>.fasta``.  If a file
    with that name already exists it is reused and nothing is written.

    loc    -- object providing `org`, `seqid`, `start`, `end` and `rc`.
    tmpdir -- directory in which the file is created.

    Returns the path of the (new or pre-existing) FASTA file.
    """
    fasta = fastas[loc.org]
    seq = fasta[loc.seqid]
    fname = "%s/%s_%i_%i.fasta" % (tmpdir, loc.seqid, loc.start, loc.end)
    # NOTE(review): the name encodes neither loc.org nor loc.rc, so a file
    # written earlier for the same seqid/start/end under a different organism
    # or orientation is returned here unchanged -- confirm this is intended.
    if op.exists(fname):
        return fname

    # start/end may arrive in either order; the slice below treats the
    # smaller value as a 1-based start and the larger as an inclusive end.
    start, end = sorted((loc.start, loc.end))
    seq = seq[start - 1:end]
    if loc.rc:
        # reverse complement
        seq = complement(seq)[::-1]

    # `with` guarantees the data is flushed and the handle closed before the
    # path is handed back to a caller that will read the file.
    with open(fname, "w") as fh:
        fh.write(">%s\n%s\n" % (fname, seq))
    return fname
def main(reads, tags, fmt):
    """Strip a leading tag from each read and print the records to stdout.

    reads -- handed unchanged to the record generator.
    tags  -- pair of tag sequences, or a falsy value to print records
             untouched.  A read starting with tags[0] has that tag removed;
             a read starting with tags[1] has it removed and the remainder
             reverse-complemented.  Any other read is reported on stderr and
             printed unmodified.
    fmt   -- "raw" selects gen_record_from_raw, anything else selects
             gen_record_from_fastq.

    Each record is expected to be a mutable sequence whose item 1 is the read
    sequence; when a fourth item is present it is treated as the per-base
    string that must stay aligned with the sequence.
    """
    gen_record = gen_record_from_raw if fmt == "raw" else gen_record_from_fastq

    for record in gen_record(reads):
        # TODO: support case where there are no tags and just want to add both
        # normal and rc read to new file.
        if tags:
            seq = record[1]
            n_stripped = 0      # number of leading tag bases removed
            reversed_read = False
            if seq.startswith(tags[0]):
                n_stripped = len(tags[0])
                seq = seq[n_stripped:]
            elif seq.startswith(tags[1]):
                n_stripped = len(tags[1])
                seq = complement(seq[n_stripped:])[::-1]
                reversed_read = True
            else:
                # The original never filled in the %s placeholder.
                sys.stderr.write(
                    "warning:%s does not start with specified tags!\n" % seq)
            record[1] = seq

            # Guard on the index that is actually used (record[3]).
            if len(record) > 3:
                # Drop the positions that belonged to the tag.  Slicing from
                # the front (rather than [-len(seq):]) stays correct when the
                # read is empty after stripping.
                qual = record[3][n_stripped:]
                # Only flip the string when the sequence itself was flipped,
                # otherwise the per-base values no longer line up.
                if reversed_read:
                    qual = qual[::-1]
                record[3] = qual
        sys.stdout.write("\n".join(record) + "\n")
        """
예제 #3
0
def calc_nuc_counts(pos_signal_bedtool, neg_signal_bedtool, fasta_filename,
                    revcomp_strand, min_counts,
                    offset_min, offset_max, region_size,
                    ignore_chroms, only_chroms, verbose):
    ''' Tally the sequences found at each offset around every signal row.

    pos_signal_bedtool / neg_signal_bedtool -- iterables of rows exposing
        `chrom`, `start` and `count`; the first is processed as the '+'
        strand, the second as the '-' strand.
    fasta_filename -- passed to Fasta(); the result is indexed by chromosome
        name and then sliced.
    revcomp_strand -- when truthy, '+' regions are reverse-complemented;
        otherwise '-' regions are.
    min_counts -- rows whose count is below this value are skipped.
    offset_min, offset_max -- inclusive range of offsets to examine.
    region_size -- number of bases extracted per region.
    ignore_chroms -- rows on these chromosomes are skipped.
    only_chroms -- when non-empty, rows on other chromosomes are skipped.
    verbose -- when truthy, a parameter summary is printed to stderr.

    Returns (total_sites, nuc_counts): the number of rows that passed the
    filters, and a defaultdict mapping offset -> Counter of sequence ->
    summed row counts.
    '''

    if verbose:
        msg = "".join([
            ">> analyzing sequences ...\n",
            ">> ignore:%s only:%s\n" % (str(ignore_chroms), str(only_chroms)),
            ">> offset range: %d to %d\n" % (offset_min, offset_max),
            ">> region size: %d\n" % (region_size),
            ">> revcomp strand: %s\n" % str(revcomp_strand),
        ])
        print >>sys.stderr, msg

    genome = Fasta(fasta_filename)
    counts_by_offset = defaultdict(Counter)

    # number of rows that survive every filter below
    n_sites = 0

    signals = (('+', pos_signal_bedtool), ('-', neg_signal_bedtool))

    for strand, signal in signals:

        on_plus = (strand == '+')

        #  1. libs where the captured strand is sequenced are the correct
        #     polarity as-is (i.e. Excision-seq libs)
        #  2. libs where the *copy* of the captured strand is sequenced
        #     should be revcomplemented (i.e. circularization-based libs)
        # => flip '+' regions when revcomp_strand is set, '-' regions when
        #    it is not.
        flip = (on_plus == bool(revcomp_strand))

        for row in signal:

            chrom = row.chrom

            # chromosome filters first, then the minimum-count filter
            if chrom in ignore_chroms \
                    or (only_chroms and chrom not in only_chroms) \
                    or row.count < min_counts:
                continue

            # must come after all of the checks above
            n_sites += 1

            for offset in range(offset_min, offset_max + 1):

                # upstream offsets are negative values
                if on_plus:
                    # the base of interest is the 3' most position, so the
                    # window ends just past it (for region_size == 1 this
                    # is simply [pos, pos + 1))
                    end = row.start + offset + 1
                    start = end - region_size
                else:
                    start = row.start - offset
                    end = start + region_size

                # XXX: does this ever happen?
                if start < 0:
                    continue

                nucs = genome[chrom][start:end]

                if flip:
                    nucs = complement(nucs[::-1])

                # skip windows that did not yield a full-length sequence
                if len(nucs.strip()) != region_size:
                    continue

                counts_by_offset[offset][nucs] += row.count

    return n_sites, counts_by_offset
예제 #4
0
def calc_nuc_counts(pos_signal_bedtool, neg_signal_bedtool, fasta_filename,
                    revcomp_strand, min_counts, offset_min, offset_max,
                    region_size, ignore_chroms, only_chroms, verbose):
    ''' Count the sequences seen at each offset from each signal position.

    The rows of `pos_signal_bedtool` are handled as '+' strand data and the
    rows of `neg_signal_bedtool` as '-' strand data.  Every row must expose
    `chrom`, `start` and `count`.  Rows are dropped when their chromosome is
    in `ignore_chroms`, when `only_chroms` is non-empty and does not contain
    it, or when their count is below `min_counts`.

    For each remaining row and each offset in [offset_min, offset_max], a
    window of `region_size` bases is sliced from the sequence that
    Fasta(fasta_filename) returns for the row's chromosome.  The window is
    reverse-complemented for '+' rows when `revcomp_strand` is truthy and
    for '-' rows when it is not.  Windows with a negative start, or whose
    stripped length is not `region_size`, are ignored.

    If `verbose` is truthy the parameters are echoed to stderr first.

    Returns a tuple (total_sites, nuc_counts): the number of rows kept, and
    a defaultdict of Counters keyed by offset, then by sequence, holding the
    summed row counts.
    '''

    if verbose:
        summary = (">> analyzing sequences ...\n"
                   ">> ignore:%s only:%s\n"
                   ">> offset range: %d to %d\n"
                   ">> region size: %d\n"
                   ">> revcomp strand: %s\n")
        msg = summary % (str(ignore_chroms), str(only_chroms),
                         offset_min, offset_max,
                         region_size,
                         str(revcomp_strand))
        print >> sys.stderr, msg

    seq_fasta = Fasta(fasta_filename)

    nuc_counts = defaultdict(Counter)

    bedtools = (pos_signal_bedtool, neg_signal_bedtool)
    strands = ('+', '-')

    # total number of sites examined
    total_sites = 0

    for bedtool, strand in izip(bedtools, strands):

        plus = (strand == '+')

        # upstream offsets are negative values, so offsets are added on the
        # '+' strand and subtracted on the '-' strand
        direction = 1 if plus else -1

        # captured-strand libs (e.g. Excision-seq) are the right polarity
        # as-is; libs that sequence the *copy* of the captured strand (e.g.
        # circularization-based) need the reverse complement
        revcomp = revcomp_strand if plus else not revcomp_strand

        for row in bedtool:

            # skip data based on specified chromosomes
            if row.chrom in ignore_chroms:
                continue
            if only_chroms and row.chrom not in only_chroms:
                continue

            # skip data if counts are too low
            if row.count < min_counts:
                continue

            # counted only once every check above has passed
            total_sites += 1

            for offset in range(offset_min, offset_max + 1):

                anchor = row.start + direction * offset

                if plus and region_size != 1:
                    # keep the base of interest as the 3' most position of
                    # the region (+ 1 includes that position)
                    start, end = anchor + 1 - region_size, anchor + 1
                else:
                    # single-base regions and the negative strand: half
                    # open window beginning at the position of interest
                    start, end = anchor, anchor + region_size

                # XXX: does this ever happen?
                if start < 0:
                    continue

                nucs = seq_fasta[row.chrom][start:end]

                if revcomp:
                    nucs = complement(nucs[::-1])

                if len(nucs.strip()) == region_size:
                    nuc_counts[offset][nucs] += row.count

    return total_sites, nuc_counts