def getProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):
    
    # First, determine the length of the restriction enzyme cut site,
    # which is needed to derive the extended start and end coordinates
    # from the pregenerated digest file.
    # The first fragment has no predecessor to compare against.
    first_iteration = True
    
    length_RE_cut = 0
    
    last_bed = None
    
    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):

        if first_iteration:
            first_iteration = False
        else:
            # adjacent fragments can only be compared within the same contig
            if bed_digest.contig == last_bed.contig:
                length_RE_cut = bed_digest.start - last_bed.end
                break

        last_bed = bed_digest
    
    
    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if len(frag) != 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            
            # The restriction enzyme cut on the left side of the fragment
            # is the end site of the previous restriction fragment + 1
            # (+1 because, according to the manual, coordinates are
            # specified with a 1-origin for the bed start).
            
            bed.start = frag.start-length_RE_cut+1
            bed.end = frag.end+length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
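A minimal usage sketch (the file names are hypothetical; digest_bed must be bgzip-compressed and tabix-indexed so that pysam.TabixFile can fetch from it):

# Hypothetical call with made-up paths.
getProbeFragments(probe_bed="probes.bed.gz",
                  digest_bed="digest.bed.gz",
                  outfile="probe_fragments.bed.gz",
                  lookup_out="probe2fragment.tsv.gz")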
Example #2
def exportSequencesFromBedFile( infile, outfile, masker = None, mode = "intervals" ):
    '''export sequences for intervals in :term:`bed`-formatted *infile* 
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip( infile, ".bed.gz" )

    fasta = IndexedFasta.IndexedFasta( os.path.join( PARAMS["genome_dir"], PARAMS["genome"] ) )
    outs = IOTools.openFile( outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator( IOTools.openFile(infile) )):
        lcontig = fasta.getLength( bed.contig )

        if mode == "intervals":
            seqs.append( fasta.getSequence( bed.contig, "+", bed.start, bed.end) )
            ids.append( "%s_%s %s:%i..%i" % (track, bed.name, bed.contig, bed.start, bed.end) )

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0,bed.start-l), bed.end-l
            ids.append( "%s_%s_l %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
            start, end = bed.start+l, min(lcontig,bed.end+l)
            ids.append( "%s_%s_r %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
    masked = maskSequences( seqs, masker )
    outs.write("\n".join( [ ">%s\n%s" % (x,y) for x,y in zip(ids, masked) ] ) )

    outs.close()
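A hedged usage sketch; it assumes the pipeline PARAMS dictionary already points at an indexed genome and that maskSequences accepts masker=None to leave sequences unmasked:

# Hypothetical call with made-up paths.
exportSequencesFromBedFile("peaks.bed.gz", "peaks.fasta",
                           masker=None, mode="intervals")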
Example #3
def fetchProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if len(frag) != 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Example #4
def annotateCpGIslands(infiles, outfile):
    '''annotate transcripts by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex( IOTools.openFile( cpgfile ) )
    
    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile( outfile, "w" )
    outf.write("transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n" )

    for tss in Bed.iterator(IOTools.openFile( tssfile ) ):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - extension_upstream, tss.start + extension_downstream
        else:
            start, end = tss.end - extension_downstream, tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find( start, end ))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match

            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start 
            else:
                relative_start = tss.end - genome_end
            
            relative_end = relative_start + l

            outf.write( "\t".join( map(str, (
                            tss.name, tss.strand,
                            genome_start, genome_end,
                            relative_start, relative_end ))) + "\n" )
            c.matches_output += 1

    outf.close()
            
    with IOTools.openFile( outfile + ".summary", "w" ) as outf:
        outf.write ("category\tcounts\n" )
        outf.write( c.asTable() + "\n" )
    
    E.info( c )
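A hedged usage sketch; PARAMS must define the cpg_search_upstream and cpg_search_downstream extension keys, and both inputs are BED files:

# Hypothetical call with made-up paths (CpG islands, then TSS intervals).
annotateCpGIslands(("cpg_islands.bed.gz", "tss.bed.gz"),
                   "transcripts_with_cpg.tsv.gz")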
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, only the ``name``s. [default=%default].")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    if args[0] == "-":
        infile1 = options.stdin
    else:
        infile1 = IOTools.openFile(args[0], "r")

    infile2 = IOTools.openFile(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = options.output
    outfile = options.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.Stop()
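Since E.Start parses the supplied argv, the script can also be driven from Python; a sketch with hypothetical file names:

# Hypothetical invocation: report overlapping entries from two BED files,
# emitting only the interval names.
main(argv=["bed2graph.py", "--output-section=name",
           "first.bed.gz", "second.bed.gz"])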
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-o", "--output", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, only the ``name``s. [default=%default].")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    if args[0] == "-":
        infile1 = options.stdin
    else:
        infile1 = IOTools.openFile(args[0], "r")

    infile2 = IOTools.openFile(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = options.output
    outfile = options.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.Stop()
Example #7
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2table.py 2888 2010-04-07 08:48:36Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default]."  )

    parser.add_option("-n", "--per-name", dest="per_name", action="store_true",
                      help="compute counts per name [default=%default]."  )

    parser.add_option("-c", "--per-contig", dest="per_contig", action="store_true",
                      help="compute counts per contig [default=%default]."  )

    parser.add_option("-t", "--per-track", dest="per_track", action="store_true",
                      help="compute counts per track [default=%default]."  )

    parser.set_defaults(
        genome_file=None,
        per_name=False,
        per_contig=False,
        per_track=False,
    )

    (options, args) = E.Start( parser, argv )

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None

    counts = collections.defaultdict( Counter )

    if options.per_track:
        keyf = lambda x: x.track
    elif options.per_name:
        keyf = lambda x: x.name
    elif options.per_contig:
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add( bed )

    outf = options.stdout

    key = "track"
    outf.write( "%s\t%s\n" % (key, "\t".join( Counter.headers) ))

    for key, count in counts.items():
        outf.write("%s\t%s\n" % (key, str(count)))
        
    E.Stop()
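A hedged invocation sketch (the script name is hypothetical); BED records are read from stdin and counted per contig:

# Hypothetical invocation.
main(argv=["bed2counts.py", "--per-contig"])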
Example #8
def quantifyAnomolies(bam, probes, outfile):

    bamfile = pysam.AlignmentFile(bam)
    results = dict()
    mapped = bamfile.mapped
    total = 0
    seeks = 0
    for probe in Bed.iterator(IOTools.openFile(probes)):
        
        c = collections.Counter()
 
        for read in bamfile.fetch(probe.contig, probe.start, probe.end,
                                  multiple_iterators=True):

            if read.is_unmapped:
                continue

            c["total"] += 1
    
            if not (read.is_secondary or read.is_supplementary):
                c["primary"] += 1
            else:
                continue

            if read.pos < (probe.start - 4) or read.aend > (probe.end + 4):
                c["undigested"] += 1

            if read.is_read1:
                c["read1"] += 1

            if not read.mate_is_unmapped:
                c["paired"] += 1

                if read.is_read1 and (
                        read.mpos >= probe.start and read.mpos <= probe.end):
                    c["self_lig"] += 1

            # log progress every 10000 reads
            if (total + c["total"]) % 10000 == 0:
                E.debug("%s/%s done" % (total + c["total"], mapped))

        E.debug("%s processed, %i found" % (probe.name, c["total"]))
        results[probe.name] = c
        total += c["total"]

    headers = ["Probe", "total", "primary", "undigested",
               "read1", "paired", "self_lig"]

    with IOTools.openFile(outfile, "w") as outf:

        outf.write("\t".join(headers) + "\n")
        
        for probe in results:
            outf.write("\t".join(
                [probe]+[str(results[probe][col])
                         for col in headers[1:]]) + "\n")
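A hedged usage sketch; the BAM file must be coordinate-sorted and indexed so that bamfile.fetch() can retrieve reads per probe interval:

# Hypothetical call with made-up paths.
quantifyAnomolies("capture.bam", "probes.bed.gz", "probe_stats.tsv")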
Example #9
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--regex-filename", dest="re_name", type="string",
                      help="regex for filename component to be used "
                      "as the annotation label")

    parser.set_defaults(
        re_name="(.+)",
    )

    (options, args) = E.Start(parser, argv=argv)

    infiles = argv[-1]
    bed_files = infiles.split(",")

    if len(bed_files) == 1:
        raise IOError("Only one file detected, cannot merge "
                      "a single bed file")

    # get the regex for annotation names
    rx = re.compile(options.re_name)
    annot_names = [[y for y in rx.search(x).groups()] for x in bed_files]
    annot_names = list(itertools.chain(*annot_names))

    # output as BED4 format: chr, start, end, name
    for fx in range(len(bed_files)):
        bfile = bed_files[fx]
        with IOTools.openFile(bfile, "r") as ofile:
            intervals = Bed.iterator(ofile)
            track_name = [bx for bx in annot_names if re.search(bx, bfile)][0]
            for entry in intervals:
                entry["name"] = track_name
                options.stdout.write("%s\t%s\t%s\t%s\n" % (entry.contig,
                                                           entry.start,
                                                           entry.end,
                                                           entry.name))

    # write footer and output benchmark information.
    E.Stop()
Example #10
def countTagsInClusters(bedfile, bamfile, outfile):

    bam = pysam.AlignmentFile(bamfile)

    outlines = []

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        interval = (bed.start, bed.end)
        counts = iCLIP.count_intervals(bam, [interval], bed.contig).sum()
        outlines.append(["%s:%i-%i" % (bed.contig, bed.start, bed.end), str(counts)])

    IOTools.writeLines(outfile, outlines, header=["position","count"])
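A hedged usage sketch; the BAM file must be indexed for region access, and iCLIP.count_intervals is assumed to be importable as in the example:

# Hypothetical call with made-up paths.
countTagsInClusters("clusters.bed.gz", "tags.bam", "cluster_counts.tsv")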
Example #12
def buildQuicksectMask(bed_file):
    '''return Quicksect object containing the regions specified
       takes a bed file listing the regions to mask 
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    for bed in Bed.iterator(IOTools.openFile(bed_file)):
        # it is necessary to extend the region to make an accurate mask
        mask.add(bed.contig, (bed.start - 1), (bed.end + 1), 1)
        n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return(mask)
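A minimal usage sketch; the returned IndexedGenome.Quicksect index can then be queried for overlap before counting reads:

# Hypothetical call with a made-up path.
mask = buildQuicksectMask("masked_regions.bed.gz")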
Example #13
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes
    '''
    outf = open(outfile, "w")
    outf.write("genome\tsize\n")
    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))
    outf.close()
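A hedged usage sketch over several per-genome BED files; each file contributes one "genome<TAB>size" row to the output table:

# Hypothetical call with made-up paths.
buildAlignmentSizes(["hg38.bed.gz", "mm10.bed.gz"], "alignment_sizes.tsv")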
Example #16
def sites2fragments(infile, genomefile, outfile):
    '''Convert a bedfile of digestion sites into a bedfile of fragments'''

    # str.split() already strips the trailing newline, so the length
    # field can be converted directly
    contig_lengths = {
        line.split()[0]: int(line.split()[1])
        for line in IOTools.openFile(genomefile)
    }

    last_end = 0
    last_contig = None
    name = 0
    new_bed = Bed.Bed()
    new_bed["strand"] = "+"
    new_bed["score"] = "."
    with IOTools.openFile(outfile, "w") as outf:
        for bed in Bed.iterator(IOTools.openFile(infile)):

            if last_contig is not None and bed.contig != last_contig:
                # close the previous contig: its final fragment runs from
                # the last site to the end of that contig
                name += 1
                new_bed.start = last_end
                new_bed.contig = last_contig
                new_bed.end = contig_lengths[last_contig]
                new_bed["name"] = str(name)

                outf.write(str(new_bed) + "\n")

                last_end = 0

            last_contig = bed.contig
            new_bed.contig = last_contig
            new_bed.start = last_end
            new_bed.end = bed.start
            name += 1
            new_bed["name"] = str(name)
            outf.write(str(new_bed) + "\n")
            last_end = bed.end

        # emit the final fragment of the last contig
        name += 1
        new_bed.start = last_end
        new_bed.contig = last_contig
        new_bed.end = contig_lengths[last_contig]
        new_bed["name"] = str(name)

        outf.write(str(new_bed) + "\n")

    pysam.tabix_index(outfile, force=True, preset="bed")
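A hedged usage sketch; the genome file is assumed to hold one "contig<TAB>length" line per contig, and the function compresses and tabix-indexes the output itself:

# Hypothetical call with made-up paths.
sites2fragments("sites.bed.gz", "genome.tsv", "fragments.bed")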
Example #18
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets
    '''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed.gz")
        tablename = "%s_intervals" % P.tablequote(track)
        cc = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals(
        )
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)
    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    for bed in Bed.iterator(infile=IOTools.openFile(reference, "r")):

        row = []
        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                row.append("")
                continue

            if len(intervals) == 0:
                peakval = ""
            else:
                peakval = str(max(x[2] for x in intervals))
            row.append(peakval)

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()
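A hedged usage sketch; each <track>_intervals table must already be loaded into the database named by PARAMS["database_name"]:

# Hypothetical call: correlate the "peakval" column across two sets
# against a reference BED file.
makeIntervalCorrelation(["setA.bed.gz", "setB.bed.gz"],
                        "correlation.tsv.gz", "peakval", "reference.bed.gz")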
Example #20
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))
    outs = IOTools.open_file(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(IOTools.open_file(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))

    outs.close()
Example #21
def formatProbeFragments(probe_fragments_bed, outfile):

    # strings accumulating any detected collisions for the return value
    probe_collisions = ""
    exclus_collisions = ""
    
    with IOTools.openFile(outfile, "w") as outf:
    
        # per-chromosome dictionaries of probe and exclusion ranges
        chr_probe = {}
        chr_exclus = {}
    
        for bed_digest in Bed.iterator(IOTools.openFile(probe_fragments_bed)):
            
            # the chromosome must be given as the bare number (strip "chr")
            chromosome = re.sub('chr', '', bed_digest.contig)
            
            out_array = []
            
            out_array.append(formatNameCompliance(bed_digest["name"]))
            out_array.append(str(chromosome))
            out_array.append(str(bed_digest.start))
            out_array.append(str(bed_digest.end))
            out_array.append(str(chromosome))
            out_array.append(str(bed_digest.start-1000))
            out_array.append(str(bed_digest.end+1000))
            out_array.append("1")
            out_array.append("A")
            
            outf.write("\t".join(out_array) + "\n")
            
            
            # Record ranges for collision detection; all coordinates are
            # assumed to be in BED convention. This has to be done per
            # chromosome:
            # chr_probe maps contig -> list of [start, end] probe ranges
            # chr_exclus maps contig -> list of exclusion-zone ranges
            probe_ranges = chr_probe.setdefault(bed_digest.contig, [])
            exclus_ranges = chr_exclus.setdefault(bed_digest.contig, [])

            probe_ranges.append([bed_digest.start, bed_digest.end])
            exclus_ranges.append([bed_digest.start - 1000,
                                  bed_digest.end + 1000])
        
        
        for contig in chr_probe:
            probe_ranges = chr_probe[contig]
            exclus_ranges = chr_exclus[contig]

            # a single range cannot collide with anything
            if len(probe_ranges) < 2:
                continue

            # note: this flags only a region common to all ranges on the
            # contig; pairwise overlaps between subsets go unreported
            probe_intersection = set.intersection(
                *(set(range(start, finish))
                  for start, finish in probe_ranges))
            exclus_intersection = set.intersection(
                *(set(range(start, finish))
                  for start, finish in exclus_ranges))

            if len(probe_intersection) != 0:
                probe_collisions += "Probe collision " + str(contig)
                probe_collisions += " " + str(min(probe_intersection))
                probe_collisions += " " + str(max(probe_intersection))
                probe_collisions += "\n"

            if len(exclus_intersection) != 0:
                exclus_collisions += "Exclusion collision " + str(contig)
                exclus_collisions += " " + str(min(exclus_intersection))
                exclus_collisions += " " + str(max(exclus_intersection))
                exclus_collisions += "\n"

    return (probe_collisions, exclus_collisions)
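A hedged usage sketch; the function writes the formatted table and returns two strings describing any probe and exclusion-zone collisions:

# Hypothetical call with made-up paths.
probe_clashes, exclusion_clashes = formatProbeFragments(
    "probe_fragments.bed.gz", "probes_formatted.tsv")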
Example #22
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-m",
                      "--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("-o",
                      "--mode",
                      dest="mode",
                      type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at",
        dest="extend_at",
        type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at no, 3', 5' or both ends. If 3only or 5only are "
        "set, only the added sequence is returned [default=%default]")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand",
        dest="ignore_strand",
        action="store_false",
        help="use strand information and return the reverse complement "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))
    else:
        # fasta is used unconditionally below, so a genome is required
        raise ValueError("a genome file is required (--genome-file)")

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
Example #23
def loadIntervalsFromBed(bedfile, track, outfile,
                         bamfiles, offsets):
    '''load intervals from :term:`bed` formatted files into database.

    Re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned. In particular, the meaning of the
    columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    '''

    tmpfile = P.getTempFile(".")

    headers = ("AvgVal", "DisttoStart", "GeneList", "Length", "PeakCenter", "PeakVal", "Position",
               "interval_id", "nCpGs", "nGenes", "nPeaks", "nProbes", "nPromoters", "contig", "start", "end")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist, length, peakcenter,
     peakval, position, start, interval_id, ncpgs, ngenes, npeaks,
     nprobes, npromoters) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    mlength = int(PARAMS["calling_merge_min_interval_length"])

    c = E.Counter()

    # open all associated BAM files for read counting
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    # count tags
    for bed in Bed.iterator(IOTools.openFile(bedfile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        # remove very short intervals
        if bed.end - bed.start < mlength:
            c.skipped_length += 1
            continue

        if samfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelineChipseq.countPeaks(
                    bed.contig, bed.start, bed.end,
                    samfiles, offsets)

            # nreads can be 0 if the intervals overlap only slightly
            # and due to the binning, no reads are actually in the
            # overlap region.  However, most of these intervals should
            # be small and have already be deleted via the
            # merge_min_interval_length cutoff.  do not output
            # intervals without reads.
            if nprobes == 0:
                c.skipped_reads += 1

        else:
            npeaks, peakcenter, length, avgval, peakval, nprobes = (
                1,
                bed.start + (bed.end - bed.start) // 2,
                bed.end - bed.start,
                1,
                1,
                1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             ncpgs, ngenes, npeaks, nprobes, npromoters,
             bed.contig, bed.start, bed.end))) + "\n")

    if c.output == 0:
        E.warn("%s - no intervals")

    tmpfile.close()

    tmpfilename = tmpfile.name
    tablename = "%s_intervals" % track.asTable()

    statement = '''
    cgat csv2db %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
Example #24
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id: bed2gff.py 2861 2010-02-23 17:36:32Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option("-f", "--id-format", dest="id_format", type="string",
                      help="format for numeric identifier if --as-gtf is set and no name in bed file [%default].")

    parser.set_defaults(as_gtf=False,
                        id_format="%08i",
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    as_gtf = options.as_gtf
    id_format = options.id_format

    # both output modes use a GTF.Entry record here
    gff = GTF.Entry()

    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(options.stdin):

        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end
        # optional BED fields beyond the first three columns:
        # fields[0] is the name, fields[1] the score, fields[2] the strand
        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.fields:
                gff.source = bed.fields[0]

        options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
Example #25
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-q",
                      "--query",
                      dest="query",
                      type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t",
                      "--target",
                      dest="target",
                      type="string",
                      help="sequence to use for target [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(
        genome_file=None,
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    psl = Blat.Match()

    for bed in Bed.iterator(options.stdin):

        ninput += 1

        start, end = bed.start, bed.end

        if "blockSizes" in bed:
            psl.mQueryId = bed["name"]
            blocksizes = [int(x) for x in bed["blockSizes"].split(",")[:-1]]
            sbjctblockstarts = [
                int(x) + start for x in bed["blockStarts"].split(",")[:-1]
            ]
            strand = bed["strand"]
        else:
            psl.mQueryId = "%i" % ninput
            blocksizes = [end - start]
            sbjctblockstarts = [
                start,
            ]

            strand = "+"

        psl.mSbjctId = bed.contig
        psl.mSbjctFrom, psl.mSbjctTo = start, end
        psl.mQueryFrom, psl.mQueryTo = 0, end - start

        psl.mBlockSizes = blocksizes
        psl.mNBlocks = len(blocksizes)
        psl.strand = strand
        q, qp = [], 0
        for x in blocksizes:
            q.append(qp)
            qp += x

        psl.mQueryBlockStarts = q
        psl.mSbjctBlockStarts = sbjctblockstarts
        psl.mQueryLength = sum(psl.mBlockSizes)
        if fasta:
            psl.mSbjctLength = fasta.getLength(bed.contig)

        options.stdout.write("%s\n" % str(psl))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
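A hedged invocation sketch; BED records are read from stdin and converted to PSL, with target lengths taken from an indexed genome:

# Hypothetical invocation with a made-up genome prefix.
main(argv=["bed2psl.py", "--genome-file=genome"])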
Example #26
def loadIntervals(infile, outfile):
    '''load intervals from :term:`bed` formatted files into
    the database.

    If a :term:`bam` file is associated with a :term:`bed`
    file, re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned.

       nprobes: number of reads in interval
       peakcenter: position with maximum number of reads in interval
       avgval: average coverage within interval
    '''

    tmpfile = P.getTempFile(".")

    headers = ("avgval", "disttostart", "genelist", "length", "peakcenter",
               "peakval", "position", "interval_id", "npeaks", "nprobes",
               "contig", "start", "end", "score", "strand")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist,
     length, peakcenter, peakval, position,
     start, interval_id, npeaks, nprobes) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0

    track = Sample(filename=P.snip(infile, ".bed.gz"))

    bamfiles, offsets = getAssociatedBAMFiles(track)

    if bamfiles:
        E.info("%s: associated bamfiles = %s" % (track, bamfiles))
    else:
        E.info("%s: no bamfiles associated" % (track))

    # open all bamfiles
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(IOTools.openFile(infile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        try:
            strand = bed["strand"]
        except IndexError:
            strand = "."

        # The fifth field of a bed file can be used to supply a score.
        # Our iterator returns the optional fields as a "fields" array;
        # the first of these is the interval name and the second the
        # score (which may be interpreted as either higher-is-better
        # or lower-is-better).
        if len(bed.fields) > 1:
            value = bed.fields[1]
            if value != "":
                score = value
            else:
                score = 1
        else:
            score = 1

        if samfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelinePeakcalling.countPeaks(
                    bed.contig,
                    bed.start,
                    bed.end,
                    samfiles,
                    offsets)
            if nprobes == 0:
                c.skipped_reads += 1

        else:
            # deal with bed12
            bed_intervals = bed.toIntervals()
            length = sum([e - s for s, e in bed_intervals])
            mid_point = length / 2
            for s, e in bed_intervals:
                peakcenter = s + mid_point
                if peakcenter >= e:
                    mid_point = peakcenter - e
                else:
                    break

            npeaks, avgval, peakval, nprobes = (1, 1, 1, 1)

        c.output += 1
        tmpfile.write("\t".join(
            map(str, (avgval, disttostart, genelist, length, peakcenter,
                      peakval, position, bed.name, npeaks, nprobes, bed.contig,
                      bed.start, bed.end, score, strand))) + "\n")

    if c.output == 0:
        E.warn("%s - no aggregate intervals")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           tablename=os.path.basename("%s_intervals" % track.asTable()),
           options="--allow-empty-file "
           "--add-index=interval_id")

    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
Example #27
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")

    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")

    parser.add_option(
        "--max-value", dest="max_value", type="float",
        help="maximum value for histogram.")

    parser.add_option(
        "--no-empty-bins", dest="no_empty_bins", action="store_true",
        help="do not display empty bins.")

    parser.add_option(
        "--with-empty-bins", dest="no_empty_bins", action="store_false",
        help="display empty bins.")

    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to "
        "truncating them to the range border).")

    parser.add_option("--missing-value", dest="missing_value", type="string",
                      help="entry for missing values [%default].")

    parser.add_option("--use-dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")

    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")

    parser.add_option("--output-section", dest="output_section", type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    (options, args) = E.Start(parser, add_output_options=True)

    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.openOutputFile("overlaps")
    else:
        outfile_overlaps = None

    last = None
    ninput, noverlaps = 0, 0
    for this in gffs:
        ninput += 1
        values_within.append(this.end - this.start)

        if last and last.contig == this.contig:
            if this.start < last.end:
                noverlaps += 1
                if outfile_overlaps:
                    outfile_overlaps.write("%s\t%s\n" % (str(last), str(this)))
                values_overlaps.append(
                    min(this.end, last.end) - max(last.start, this.start))
                if this.end > last.end:
                    last = this
                continue
            else:
                values_between.append(this.start - last.end)
                # if this.start - last.end < 10:
                #     print str(last)
                #     print str(this)
                #     print "=="
                values_overlaps.append(0)

        last = this

    if "hist" in options.methods:
        outfile = E.openOutputFile("hist")
        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram, nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.openOutputFile("stats")
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        outfile = E.openOutputFile("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.openOutputFile("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        outfile = E.openOutputFile("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput,
            len(values_between),
            len(values_within),
            noverlaps))

    E.Stop()
Example #28
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-a",
                      "--aggregate-by",
                      dest="aggregate",
                      type="choice",
                      choices=("name", "contig", "track", "none"),
                      help="aggregate counts by feature [default=%default].")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentages [default=%default].")

    parser.set_defaults(
        genome_file=None,
        aggregate="none",
        add_percent=False,
    )

    (options, args) = E.Start(parser, argv)

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        if options.add_percent:
            raise ValueError("--add-percent option requires --genome-file")
        fasta = None

    if options.add_percent and not options.aggregate == "contig":
        raise NotImplementedError(
            "--add-percent option requires --aggregate=contig")

    counts = collections.defaultdict(Counter)
    total = Counter()
    output_totals = True

    if options.aggregate == "track":
        keyf = lambda x: x.track
    elif options.aggregate == "name":
        keyf = lambda x: x.name
    elif options.aggregate == "contig":
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"
        output_totals = False

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add(bed)
        total.add(bed)

    outf = options.stdout

    key = "track"
    if options.add_percent:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers_percent)))
    else:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    total_bases = 0
    for key, count in sorted(counts.items()):
        if options.add_percent:
            total_bases += fasta.getLength(key)
            count.setSize(fasta.getLength(key))

        outf.write("%s\t%s\n" % (key, str(count)))

    if output_totals:
        if options.add_percent:
            count.setSize(total_bases)
        outf.write("%s\t%s\n" % ("total", str(total)))
    E.Stop()
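A hedged invocation sketch (the script name is hypothetical); per-contig counts with percentages require the genome file for contig sizes:

# Hypothetical invocation; BED records are read from stdin.
main(argv=["bed2stats.py", "--aggregate-by=contig", "--add-percent",
           "--genome-file=genome"])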
Example #29
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-b",
        "--bam-file",
        dest="bam_files",
        type="string",
        help="filename with read mapping information. Multiple files can be "
        "submitted in a comma-separated list [default=%default].")

    parser.add_option(
        "--control-bam-file",
        dest="control_bam_files",
        type="string",
        help="filename with read mapping information for input/control. "
        "Multiple files can be submitted in a comma-separated list "
        "[default=%default].")

    parser.add_option("--filename-format",
                      dest="filename_format",
                      type="choice",
                      choices=("bed", "gff", "gtf"),
                      help="format of secondary stream [default=%default].")

    parser.add_option("-c",
                      "--counter",
                      dest="counters",
                      type="choice",
                      action="append",
                      choices=("length", "overlap", "peaks", "composition-na",
                               "composition-cpg", "classifier-chipseq",
                               "motif"),
                      help="select counters to apply [default=%default].")

    parser.add_option("--motif-sequence",
                      dest="motif_sequence",
                      type="string",
                      help="specify a sequence to search for"
                      "[default=%default].")

    parser.add_option(
        "-o",
        "--offset",
        dest="offsets",
        type="int",
        action="append",
        help="tag offsets for tag counting - supply as many as there "
        "are bam-files [default=%default].")

    parser.add_option(
        "--control-offset",
        dest="control_offsets",
        type="int",
        action="append",
        help="control tag offsets for tag counting - supply as many as "
        "there are bam-files [default=%default].")

    parser.add_option(
        "-a",
        "--output-all-fields",
        dest="all_fields",
        action="store_true",
        help="output all fields in original bed file, by default only "
        "the first 4 are output [default=%default].")

    parser.add_option(
        "--output-bed-headers",
        dest="bed_headers",
        type="string",
        help="supply ',' separated list of headers for bed component "
        "[default=%default].")

    parser.add_option(
        "-f",
        "--gff-file",
        dest="filename_gff",
        type="string",
        action="append",
        metavar='bed',
        help="filename with extra gff files. The order is important "
        "[default=%default].")

    parser.add_option(
        "--has-header",
        dest="has_header",
        action="store_true",
        help="bed file with headers. Headers and first columns are "
        "preserved [default=%default]")

    parser.set_defaults(genome_file=None,
                        counters=[],
                        bam_files=None,
                        offsets=[],
                        control_bam_files=None,
                        control_offsets=[],
                        all_fields=False,
                        filename_format=None,
                        bed_headers=None,
                        filename_gff=[],
                        has_header=False,
                        motif_sequence=None)

    (options, args) = E.Start(parser)

    if options.bed_headers is not None:
        bed_headers = [x.strip() for x in options.bed_headers.split(",")]
        if len(bed_headers) < 3:
            raise ValueError("a bed file needs at least three columns")
    else:
        bed_headers = None

    if options.has_header:
        while True:
            line = options.stdin.readline()
            if not line:
                E.warn("empty bed file with no header")
                E.Stop()
                return
            if not line.startswith("#"):
                break
        bed_headers = line[:-1].split("\t")

    if "motif" in options.counters and not options.motif_sequence:
        raise ValueError("if using motif must specify a motif-sequence")

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.bam_files:
        bam_files = []
        for bamfile in options.bam_files.split(","):
            bam_files.append(pysam.Samfile(bamfile, "rb"))
    else:
        bam_files = None

    if options.control_bam_files:
        control_bam_files = []
        for bamfile in options.control_bam_files.split(","):
            control_bam_files.append(pysam.Samfile(bamfile, "rb"))
    else:
        control_bam_files = None

    counters = []

    for c in options.counters:
        if c == "length":
            counters.append(CounterLength(fasta=fasta, options=options))

        elif c == "overlap":
            counters.append(
                CounterOverlap(filename=options.filename_gff[0],
                               fasta=fasta,
                               options=options))
            del options.filename_gff[0]
        elif c == "peaks":
            counters.append(
                CounterPeaks(bam_files,
                             options.offsets,
                             control_bam_files,
                             options.control_offsets,
                             options=options))
        elif c == "composition-na":
            counters.append(
                CounterCompositionNucleotides(fasta=fasta, options=options))
        elif c == "composition-cpg":
            counters.append(CounterCompositionCpG(fasta=fasta,
                                                  options=options))
        elif c == "classifier-chipseq":
            counters.append(
                ClassifierChIPSeq(filename_gff=options.filename_gff,
                                  fasta=fasta,
                                  options=options,
                                  prefix=None))
            del options.filename_gff[0]

        elif c == "motif":
            counters.append(
                CounterMotif(fasta=fasta, motif=options.motif_sequence))

    extra_fields = None

    for bed in Bed.iterator(options.stdin):

        if extra_fields is None:

            # output explicitly given headers
            if bed_headers:
                if len(bed_headers) > bed.columns:
                    raise ValueError(
                        "insufficient columns (%i, expected %i) in %s" %
                        (bed.columns, len(bed_headers), str(bed)))

            else:
                bed_headers = Bed.Headers[:bed.columns]

            options.stdout.write("\t".join(bed_headers))
            options.stdout.write("\t" +
                                 "\t".join([x.getHeader()
                                            for x in counters]) + "\n")

            extra_fields = list(range(len(bed_headers) - 3))

        for counter in counters:
            counter.update(bed)

        if options.all_fields:
            options.stdout.write(str(bed))
        else:
            options.stdout.write("\t".join(
                [bed.contig, str(bed.start),
                 str(bed.end)] + [bed.fields[x] for x in extra_fields]))
        for counter in counters:
            options.stdout.write("\t%s" % str(counter))

        options.stdout.write("\n")

    E.Stop()
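The counter classes instantiated above (CounterLength, CounterOverlap, CounterPeaks, ...) are defined elsewhere in the source module. Below is a minimal sketch of the protocol the main loop relies on, assuming only the three calls it actually makes (getHeader(), update(), str()); the real classes compute considerably more than a length.

class CounterLength(object):
    '''sketch of the counter protocol: one column header, one update
    per interval, rendered as a tab-separated string.'''

    header = "length"

    def __init__(self, fasta=None, options=None):
        self.fasta = fasta
        self.length = 0

    def getHeader(self):
        return self.header

    def update(self, bed):
        # recomputed for each interval; the result is printed right after
        self.length = bed.end - bed.start

    def __str__(self):
        return str(self.length)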
Example #30
def annotateCpGIslands(infiles, outfile):
    '''annotate transcript by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex(IOTools.openFile(cpgfile))

    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - \
                extension_upstream, tss.start + extension_downstream
        else:
            start, end = tss.end - \
                extension_downstream, tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find(start, end))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match

            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start
            else:
                relative_start = tss.end - genome_end

            relative_end = relative_start + l

            outf.write("\t".join(
                map(str, (tss.name, tss.strand, genome_start, genome_end,
                          relative_start, relative_end))) + "\n")
            c.matches_output += 1

    outf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
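The strand-aware window and the relative coordinates written above are easy to mis-read; a quick check with made-up numbers (all values hypothetical):

# minus-strand TSS at 1000..1001: upstream extends to higher coordinates
extension_upstream, extension_downstream = 500, 100
tss_start, tss_end = 1000, 1001

start, end = tss_end - extension_downstream, tss_end + extension_upstream
assert (start, end) == (901, 1501)

# a CpG island at 1200..1300 spans 299..199 bases upstream of the TSS,
# so its relative coordinates come out negative
genome_start, genome_end = 1200, 1300
relative_start = tss_end - genome_end
relative_end = relative_start + (genome_end - genome_start)
assert (relative_start, relative_end) == (-299, -199)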
Example #31
def main(argv=sys.argv):

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("igv", ),
                      help="method to create plots with [%default]")

    parser.add_option("-d",
                      "--snapshot-dir",
                      dest="snapshotdir",
                      type="string",
                      help="directory to save snapshots in [%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("png", "eps", "svg"),
                      help="output file format [%default]")

    parser.add_option("-o",
                      "--host",
                      dest="host",
                      type="string",
                      help="host that IGV is running on [%default]")

    parser.add_option("-p",
                      "--port",
                      dest="port",
                      type="int",
                      help="port that IGV listens at [%default]")

    parser.add_option(
        "-e",
        "--extend",
        dest="extend",
        type="int",
        help="extend each interval by a number of bases [%default]")

    parser.add_option("-x",
                      "--expand",
                      dest="expand",
                      type="float",
                      help="expand each region by a certain factor [%default]")

    parser.set_defaults(
        method="igv",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    E.info("connection to session on %s:%s" % (options.host, options.port))

    E.info("saving images in %s" % options.snapshotdir)
    igv = IGV.IGV(host=options.host,
                  port=options.port,
                  snapshot_dir=os.path.abspath(options.snapshotdir))

    c = E.Counter()
    for bed in Bed.iterator(options.stdin):

        c.input += 1

        # IGV can not deal with white-space in filenames
        name = re.sub(r"\s", "_", bed.name)

        E.info("going to %s:%i-%i for %s" %
               (bed.contig, bed.start, bed.end, name))

        start, end = bed.start, bed.end
        extend = options.extend
        if options.expand:
            d = end - start
            extend = max(extend, (options.expand * d - d) // 2)

        start -= extend
        end += extend

        igv.go("%s:%i-%i" % (bed.contig, start, end))

        fn = "%s.%s" % (name, options.format)
        E.info("writing snapshot to '%s'" % fn)
        igv.save(fn)

        c.snapshots += 1

    E.info(c)
    E.Stop()
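A sketch of how this script would typically be driven, assuming it is saved as igv_snapshot.py (hypothetical name) and an IGV instance is already running with its command port enabled; intervals are read from stdin:

# hypothetical driver, equivalent to:
#   cat peaks.bed | python igv_snapshot.py -p 61111 -f png -d snapshots
import subprocess

with open("peaks.bed") as beds:
    subprocess.check_call(
        ["python", "igv_snapshot.py",
         "--port", "61111",
         "--format", "png",
         "--snapshot-dir", "snapshots"],
        stdin=beds)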
Example #32
def main(argv=sys.argv):

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--new-instance",
                      dest="new_instance",
                      action="store_true",
                      help="create a new IGV instance [%default]")

    parser.add_option("-s",
                      "--session",
                      dest="session",
                      type="string",
                      help="load session before creating plots "
                      "[%default]")

    parser.add_option("-d",
                      "--snapshot-dir",
                      dest="snapshotdir",
                      type="string",
                      help="directory to save snapshots in [%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("png", "eps", "svg"),
                      help="output file format [%default]")

    parser.add_option("-o",
                      "--host",
                      dest="host",
                      type="string",
                      help="host that IGV is running on [%default]")

    parser.add_option("-p",
                      "--port",
                      dest="port",
                      type="int",
                      help="port that IGV listens at [%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extend",
                      type="int",
                      help="extend each interval by a number of bases "
                      "[%default]")

    parser.add_option("-x",
                      "--expand",
                      dest="expand",
                      type="float",
                      help="expand each region by a certain factor "
                      "[%default]")

    parser.add_option("--session-only",
                      dest="session_only",
                      action="store_true",
                      help="plot session after opening, "
                      "ignore intervals "
                      "[%default]")

    parser.add_option("--keep",
                      dest="keep_open",
                      action="store_true",
                      help="keep a newly created IGV session open "
                      "[%default]")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        keep_open=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    igv_process = None
    if options.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=options.command, port=options.port)
        E.info("new IGV process started")

    E.info("connection to process on %s:%s" % (options.host, options.port))
    E.info("saving images in %s" % options.snapshotdir)
    igv = IGV.IGV(host=options.host,
                  port=options.port,
                  snapshot_dir=os.path.abspath(options.snapshotdir))

    if options.session:
        E.info('loading session from %s' % options.session)
        igv.load(options.session)
        E.info('loaded session')

    if options.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(options.session), options.format)
        E.info("writing snapshot to '%s'" %
               os.path.join(options.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in Bed.iterator(options.stdin):

            c.input += 1

            # IGV can not deal with white-space in filenames
            name = re.sub(r"\s", "_", bed.name)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = options.extend
            if options.expand:
                d = end - start
                extend = max(extend, (options.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = E.getOutputFile("%s.%s" % (name, options.format))
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not options.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.stop()
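This variant adds session handling on top of the previous example, but the --extend/--expand arithmetic is shared; with numbers, --expand=2.0 on a 200 bp interval grows each side by (2.0 * 200 - 200) // 2 = 100 bp, unless --extend already asks for more:

# sanity check of the expand/extend logic used in both IGV scripts
start, end = 5000, 5200            # a 200 bp interval
extend, expand = 50, 2.0

d = end - start
extend = max(extend, (expand * d - d) // 2)   # max(50, 100.0) -> 100.0
start -= extend
end += extend
assert (start, end) == (4900, 5300)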
Example #33
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: bed2annotator2tsv.py 2885 2010-04-07 08:46:50Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      help="feature to collect [default=None].")

    parser.add_option("-i",
                      "--files",
                      dest="files",
                      action="append",
                      help="use multiple annotations [default=None].")

    parser.add_option(
        "-a",
        "--annotations",
        dest="annotations",
        type="string",
        help="aggregate name for annotations if only a single file is "
        "provided from STDIN [default=None].")

    parser.add_option(
        "--input-filename-map",
        dest="input_filename_map",
        type="string",
        help="filename with a map of gene_ids to categories [default=None].")

    parser.add_option("-l",
                      "--max-length",
                      dest="max_length",
                      type="string",
                      help="maximum segment length [default=None].")

    parser.add_option(
        "-m",
        "--merge",
        dest="merge",
        action="store_true",
        help="merge overlapping bed segments [default=%default].")

    parser.add_option("-s",
                      "--section",
                      dest="section",
                      type="choice",
                      choices=("segments", "annotations", "workspace"),
                      help="annotator section [default=None].")

    parser.add_option(
        "--subset",
        dest="subsets",
        type="string",
        action="append",
        help="add filenames to delimit subsets within the gff files. "
        "The syntax is filename.gff,label,filename.ids [default=None].")

    parser.set_defaults(
        genome_file=None,
        features=None,
        remove_random=True,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        merge=False,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0:
        options.files.append("-")
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section == "annotations":
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    if options.max_length:
        max_length = options.max_length
    else:
        max_length = 0

    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    if options.section in ("annotations"):
        contigs = set()
        it = itertools.groupby(Bed.iterator(options.stdin),
                               key=lambda x: x.track["name"])

        map_track2segments = {}
        for track, beds in it:
            ntracks += 1
            map_track2segments[track] = []
            first_segment = nsegments

            beds = list(beds)

            if options.merge:
                beds = Bed.merge(beds)

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if options.remove_random and "random" in contig:
                    continue

                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                map_track2segments[track].append(nsegments)
                options.stdout.write("%s\t%i\t%s\t(%i,%i)\n" %
                                     (prefix, nsegments, contig, start, end))
                nsegments += 1

            options.stdout.write("##Ann\t%s\t%s\n" % (track, "\t".join(
                ["%i" % x for x in range(first_segment, nsegments)])))
            E.info("track %s: annotated with %i segments" %
                   (track, nsegments - first_segment))

        ncontigs = len(contigs)
        E.info(
            "ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
            (ninput, ntracks, ncontigs, nsegments, ndiscarded))

    E.Stop()
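The two record types written in the 'annotations' branch above, rendered with illustrative values (one track, two kept segments):

# the segment line: prefix, running segment id, contig, coordinates
print("%s\t%i\t%s\t(%i,%i)" % ("##Id", 0, "chr1", 1000, 2000))
# prints: ##Id <tab> 0 <tab> chr1 <tab> (1000,2000)

# the annotation line: track name followed by its segment ids
print("##Ann\t%s\t%s" % ("trackA", "\t".join("%i" % x for x in range(0, 2))))
# prints: ##Ann <tab> trackA <tab> 0 <tab> 1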
Example #34
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome [default=%default].")

    parser.add_option(
        "-a", "--aggregate-by", dest="aggregate", type="choice",
        choices=("name", "contig", "track", "none"),
        help="aggregate counts by feature [default=%default].")

    parser.add_option(
        "-p", "--add-percent", dest="add_percent", action="store_true",
        help="add percentages [default=%default].")

    parser.set_defaults(
        genome_file=None,
        aggregate="none",
        add_percent=False,
    )

    (options, args) = E.Start(parser, argv)

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        if options.add_percent:
            raise ValueError("--add-percent option requires --genome-file")
        fasta = None

    if options.add_percent and not options.aggregate == "contig":
        raise NotImplementedError(
            "--add-percent option requires --aggregate=contig")

    counts = collections.defaultdict(Counter)
    total = Counter()
    output_totals = True

    if options.aggregate == "track":
        keyf = lambda x: x.track
    elif options.aggregate == "name":
        keyf = lambda x: x.name
    elif options.aggregate == "contig":
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"
        output_totals = False

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add(bed)
        total.add(bed)

    outf = options.stdout

    key = "track"
    if options.add_percent:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers_percent)))
    else:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    total_bases = 0
    for key, count in sorted(counts.items()):
        if options.add_percent:
            total_bases += fasta.getLength(key)
            count.setSize(fasta.getLength(key))

        outf.write("%s\t%s\n" % (key, str(count)))

    if output_totals:
        if options.add_percent:
            total.setSize(total_bases)
        outf.write("%s\t%s\n" % ("total", str(total)))
    E.Stop()
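The Counter aggregated here (not to be confused with E.Counter) is defined elsewhere in the script. A minimal sketch consistent with the calls above (add(), setSize(), headers, headers_percent, str()); the real class likely tracks more statistics:

class Counter(object):
    '''sketch: per-key interval statistics with an optional percentage.'''

    headers = ["ncounts", "nbases"]
    headers_percent = ["ncounts", "nbases", "pbases"]

    def __init__(self):
        self.ncounts, self.nbases, self.size = 0, 0, None

    def add(self, bed):
        self.ncounts += 1
        self.nbases += bed.end - bed.start

    def setSize(self, size):
        # denominator (e.g. contig length) for the percentage column
        self.size = size

    def __str__(self):
        if self.size:
            return "%i\t%i\t%5.2f" % (
                self.ncounts, self.nbases, 100.0 * self.nbases / self.size)
        return "%i\t%i" % (self.ncounts, self.nbases)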
Example #35
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--as-gtf",
                      dest="as_gtf",
                      action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-f",
        "--id-format",
        dest="id_format",
        type="string",
        help="format for numeric identifier if --as-gtf is set and "
        "no name in bed file [%default].")

    parser.set_defaults(as_gtf=False, id_format="%08i", test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    as_gtf = options.as_gtf
    id_format = options.id_format

    # both branches create the same entry type; the gtf-specific attributes
    # (gene_id/transcript_id) are only set later when as_gtf is true
    gff = GTF.Entry()

    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(options.stdin):

        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end
        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.fields:
                gff.source = bed.fields[0]

        options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
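A hypothetical invocation of this converter, turning a bed file into gtf with numeric identifiers for unnamed intervals:

import subprocess

with open("intervals.bed") as inf, open("intervals.gtf", "w") as outf:
    subprocess.check_call(
        ["python", "bed2gff.py", "--as-gtf", "--id-format", "%08i"],
        stdin=inf, stdout=outf)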
Example #36
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2table.py 2888 2010-04-07 08:48:36Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-n",
                      "--per-name",
                      dest="per_name",
                      action="store_true",
                      help="compute counts per name [default=%default].")

    parser.add_option("-c",
                      "--per-contig",
                      dest="per_contig",
                      action="store_true",
                      help="compute counts per contig [default=%default].")

    parser.add_option("-t",
                      "--per-track",
                      dest="per_track",
                      action="store_true",
                      help="compute counts per track [default=%default].")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentages [default=%default].")

    parser.set_defaults(
        genome_file=None,
        per_name=False,
        per_contig=False,
        per_track=False,
        add_percent=False,
    )

    (options, args) = E.Start(parser, argv)

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        if options.add_percent:
            raise ValueError("--add-percent option requires --genome-file")
        fasta = None

    if options.add_percent and not options.per_contig:
        raise NotImplementedError("--add-percent option requires --per-contig")

    counts = collections.defaultdict(Counter)

    if options.per_track:
        keyf = lambda x: x.track
    elif options.per_name:
        keyf = lambda x: x.name
    elif options.per_contig:
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add(bed)

    outf = options.stdout

    key = "track"
    if options.add_percent:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers_percent)))
    else:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    total_bases = 0
    for key, count in counts.items():
        if options.add_percent:
            total_bases += fasta.getLength(key)
            count.setSize(fasta.getLength(key))

        outf.write("%s\t%s\n" % (key, str(count)))

    E.Stop()
Example #37
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gtf2table.py 2888 2010-04-07 08:48:36Z andreas $", usage = globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default]."  )

    parser.add_option("-b", "--bam-file", dest="bam_files", type="string",
                      help="filename with read mapping information. Multiple files can be submitted in a comma-separated list [default=%default]."  )

    parser.add_option( "--control-bam-file", dest="control_bam_files", type="string",
                      help="filename with read mapping information for input/control. Multiple files can be submitted in a comma-separated list [default=%default]."  )

    parser.add_option( "--filename-format", dest="filename_format", type="choice",
                       choices=("bed", "gff", "gtf" ),
                       help="format of secondary stream [default=%default]."  )

    parser.add_option("-c", "--counter", dest="counters", type="choice", action="append",
                      choices=( "overlap", 
                                "peaks", 
                                "composition-na", 
                                "composition-cpg",
                                "classifier-chipseq"),
                      help="select counters to apply [default=%default]."  )

    parser.add_option("-o", "--offset", dest="offsets", type="int", action="append",
                      help="tag offsets for tag counting - supply as many as there are bam-files [default=%default]."  )

    parser.add_option( "--control-offset", dest="control_offsets", type="int", action="append",
                      help="control tag offsets for tag counting - supply as many as there are bam-files [default=%default]."  )

    parser.add_option("-a", "--all-fields", dest="all_fields", action = "store_true",
                      help="output all fields in original bed file, by default only the first 4 are output [default=%default]."  )

    parser.add_option("--bed-headers", dest="bed_headers", type="string",
                      help="supply ',' separated list of headers for bed component [default=%default]."  )

    parser.add_option("-f", "--filename-gff", dest="filename_gff", type="string", action="append", metavar='bed',
                      help="filename with extra gff files. The order is important [default=%default]."  )

    parser.set_defaults(
        genome_file = None,
        counters = [],
        bam_files = None,
        offsets = [],
        control_bam_files = None,
        control_offsets = [],
        all_fields = False,
        filename_format = None,
        bed_headers = "contig,start,end,name",
        filename_gff = [],
        )

    (options, args) = E.Start( parser )

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None

    if options.bam_files:
        bam_files = []
        for bamfile in options.bam_files.split(","):
            bam_files.append( pysam.Samfile(bamfile, "rb" ) )
    else:
        bam_files = None

    if options.control_bam_files:
        control_bam_files = []
        for bamfile in options.control_bam_files.split(","):
            control_bam_files.append( pysam.Samfile(bamfile, "rb" ) )
    else:
        control_bam_files = None

    counters = []

    for c in options.counters:
        if c == "overlap":
            # there is no --filename-bed option in this script; the extra
            # interval files are supplied via --filename-gff (cf. the newer
            # version of this tool above)
            counters.append( CounterOverlap( filename = options.filename_gff[0],
                                             fasta = fasta,
                                             options = options) )
        elif c == "peaks":
            counters.append( CounterPeaks( bam_files,
                                           options.offsets,
                                           control_bam_files,
                                           options.control_offsets,
                                           options = options ) )
        elif c == "composition-na":
            counters.append( CounterCompositionNucleotides( fasta=fasta,
                                                            options = options ))
        elif c == "composition-cpg":
            counters.append( CounterCompositionCpG( fasta=fasta,
                                                    options = options ) )
        elif c == "classifier-chipseq":
            counters.append( ClassifierChIPSeq( filename_gff = options.filename_gff,
                                                fasta = fasta,
                                                options = options, 
                                                prefix = None) )


    options.stdout.write( "\t".join( [x.strip() for x in options.bed_headers.split(",") ] ) )

    options.stdout.write( "\t" + "\t".join( 
            [ x.getHeader() for x in counters] ) + "\n" )

    for bed in Bed.iterator(options.stdin):

        for counter in counters:
            counter.update(bed)

        if options.all_fields:
            options.stdout.write( str(bed) )
        else:
            options.stdout.write( "\t".join( (bed.contig, 
                                              str(bed.start), 
                                              str(bed.end), 
                                              bed.fields[0]) ) )
        for counter in counters: 
            options.stdout.write("\t%s" % str(counter) )

        options.stdout.write("\n")

    E.Stop()
Example #38
def buildResults(bedfile, fg_file, control_file, counter, options):
    '''compute densities and peakshape parameters.'''

    options.stdout.write("\t".join(("contig",
                                    "start",
                                    "end",
                                    "name",
                                    "\t".join(_bam2peakshape.PeakShapeResult._fields))) + "\n")

    if options.window_size:
        # bins are centered at the peak-center and stretch outwards.
        bins = numpy.arange(-options.window_size + options.bin_size // 2,
                            +options.window_size,
                            options.bin_size)
    else:
        # `bins` is used unconditionally below
        raise ValueError("a window size is required to define bins")

    #contigs = set(pysam_in.references)

    strand_specific = options.strand_specific

    result = []
    c = E.Counter()
    c.input = 0

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        c.input += 1

        # if bed.contig not in contigs:
        #    c.skipped += 1
        #    continue

        if c.input % options.report_step == 0:
            E.info("iteration: %i" % c.input)

        features = counter.countInInterval(
            fg_file,
            bed.contig, bed.start, bed.end,
            window_size=options.window_size,
            bins=bins,
            only_interval=options.only_interval,
            centring_method=options.centring_method)

        if features is None:
            c.skipped += 1
            continue

        if control_file:
            control = counter.countAroundPos(control_file,
                                             bed.contig,
                                             features.peak_center,
                                             bins=features.bins)

        else:
            control = None

        if options.random_shift:
            direction = numpy.random.randint(0, 2)
            if direction:
                pos = features.peak_center + 2 * bins[0]
            else:
                pos = features.peak_center + 2 * bins[-1]
            shifted = counter.countAroundPos(fg_file,
                                             bed.contig,
                                             pos,
                                             bins=features.bins)
        else:
            shifted = None

        if strand_specific and bed.strand == "-":
            # _replace returns a new namedtuple, so the result must be
            # re-assigned; reverse each histogram for minus-strand intervals
            features = features._replace(hist=features.hist[::-1])
            if control:
                control = control._replace(hist=control.hist[::-1])
            if shifted:
                shifted = shifted._replace(hist=shifted.hist[::-1])

        result.append((features, bed, control, shifted))
        c.added += 1

    E.info("interval processing: %s" % c)

    return result, bins
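The bin layout computed at the top of buildResults centers the bins on the peak and steps outwards in bin_size increments; a quick check with window_size=1000 and bin_size=100 (values hypothetical):

import numpy

window_size, bin_size = 1000, 100
bins = numpy.arange(-window_size + bin_size // 2, +window_size, bin_size)
# twenty bin centres symmetric around the peak: -950, -850, ..., 850, 950
assert len(bins) == 20 and bins[0] == -950 and bins[-1] == 950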
Example #39
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--min-overlap", dest="min_overlap",
                      type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-k", "--keep-temp", dest="keep_temp",
                      action="store_true",
                      help="do not delete temporary files [%default]")

    parser.add_option("-a", "--filename-bam", dest="filename_bam",
                      metavar="bam", type="string",
                      help="bam-file to use [%default]")

    parser.add_option("-b", "--filename-bed", dest="filename_bed",
                      metavar="bed", type="string",
                      help="bed-file to use [%default]")

    parser.set_defaults(
        min_overlap=0.5,
        keep_temp=False,
        filename_bam=None,
        filename_bed=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.close()
    tmpfilename = tmpfile.name

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file
    for bed in Bed.iterator(IOTools.openFile(filename_bed)):
        ncolumns_bed = bed.columns
        break
    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.Samfile(filename_bam, "rb")
        total = samfile.mapped
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.getNumLines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.openFile(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name",
        "score", "strand", "thickstart", "thickend", "rgb",
        "blockcount", "blockstarts", "blockends"][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2",
        "score2", "strand2", "thickstart2", "thickend2", "rgb2",
        "blockcount2", "blockstarts2", "blockends2"][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    # IMS: newer versions of intersectBed have a very high memory
    #     requirement unless passed sorted bed files.
    statement = """intersectBed %(format)s %(filename_bam)s
    -b <( zcat %(filename_bed)s | sort -k1,1 -k2,2n)
    -sorted -bed -wo -f %(min_overlap)f > %(tmpfilename)s""" % locals()

    E.info("running %s" % statement)
    retcode = E.run(statement)

    if retcode != 0:
        raise ValueError("error while executing statement %s" % statement)

    infile = open(tmpfilename, "r")
    counts_per_alignment = collections.defaultdict(int)

    E.info("counting")

    take_columns = len(data._fields)

    def iter(infile):
        for line in infile:
            if not line.strip():
                continue
            yield data._make(line[:-1].split()[:take_columns])

    for read, overlaps in itertools.groupby(iter(infile), key=sort_key):
        annotations = [x.name2 for x in overlaps]
        for anno in annotations:
            counts_per_alignment[anno] += 1
    infile.close()

    for key, counts in counts_per_alignment.items():
        options.stdout.write("%s\t%i\n" % (key, counts))

    if not options.keep_temp:
        os.unlink(tmpfilename)

    # write footer and output benchmark information.
    E.Stop()
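A hypothetical invocation; note that the script shells out to bedtools' intersectBed, which must be on the PATH, and that the reference bed file needs a name column (bed4 or more):

import subprocess

with open("counts.tsv", "w") as outf:
    subprocess.check_call(
        ["python", "bam_vs_bed.py", "--min-overlap", "0.5",
         "reads.bam", "annotations.bed.gz"],
        stdout=outf)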
Example #40
def readSegments(infile,
                 indexed_workspace,
                 truncate=False,
                 format="gtf",
                 keep_ambiguous=False,
                 remove_overhangs=False):
    """read segments from infile.

    segments not overlapping with indexed_workspace are removed.

    If :attr: truncate is given, segments extending beyond the workspace
    are truncated.

    returns a list of segments for each contig in a dictionary
    """
    counter = E.Counter()

    segments = collections.defaultdict(list)

    def addSegment(contig, start, end, counter):
        if contig in indexed_workspace:
            r = indexed_workspace[contig].find(start, end)
            if not r:
                counter.nskipped += 1
                return
            if len(r) > 1:
                counter.nambiguous += 1
                if not keep_ambiguous:
                    return
            if truncate:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.ntruncated += 1
                    segments[contig].append((rstart, rend))
                    counter.added += 1
            elif remove_overhangs:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.overhangs += 1
                        break
                else:
                    segments[contig].append((start, end))
            else:
                segments[contig].append((start, end))
                counter.added += 1

            counter.nkept += 1

    if format == "gtf":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(infile))

        for gene in gtf_iterator:
            # get start and end ignoring introns
            # contig, start, end = gene[0].contig, min( [x.start for x in gene] ), max( [x.end for x in gene] )

            contig, coords = gene[0].contig, [(x.start, x.end) for x in gene]
            counter.ninput += 1
            for start, end in coords:
                addSegment(contig, start, end, counter)

    elif format == "bed":
        bed_iterator = Bed.iterator(infile)
        for bed in bed_iterator:
            counter.ninput += 1
            addSegment(bed.contig, bed.start, bed.end, counter)

    E.info("read segments: %s" % str(counter))

    return segments
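readSegments expects indexed_workspace to map contigs to an interval index with a find(start, end) method, as returned by Bed.readAndIndex (used the same way in annotateCpGIslands above). A sketch, assuming the surrounding module's imports:

# restrict segments to a workspace, truncating overhanging intervals
workspace = Bed.readAndIndex(IOTools.openFile("workspace.bed.gz"))

with IOTools.openFile("segments.bed.gz") as inf:
    segments = readSegments(inf, workspace, truncate=True, format="bed")

for contig, intervals in sorted(segments.items()):
    print(contig, len(intervals))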
Example #41
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    # IMS: new method: extend intervals by set amount
    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("merge", "filter-genome", "bins", "block",
                               "sanitize-genome", "shift", "extend",
                               "filter-names"),
                      help="method to apply [default=%default]")

    parser.add_option("--num-bins",
                      dest="num_bins",
                      type="int",
                      help="number of bins into which to merge (used for "
                      "method `bins) [default=%default]")

    parser.add_option("--bin-edges",
                      dest="bin_edges",
                      type="string",
                      help="bin_edges for binning method [default=%default]")

    parser.add_option(
        "--binning-method",
        dest="binning_method",
        type="choice",
        choices=("equal-bases", "equal-intervals", "equal-range"),
        help="method used for binning (used for method `bins` if no "
        "bin_edges is given) [default=%default]")

    parser.add_option(
        "--merge-distance",
        dest="merge_distance",
        type="int",
        help="distance in bases over which to merge that are not "
        "directly adjacent [default=%default]")

    parser.add_option(
        "--merge-min-intervals",
        dest="merge_min_intervals",
        type="int",
        help="only output merged intervals that are build from at least "
        "x intervals [default=%default]")

    parser.add_option(
        "--merge-by-name",
        dest="merge_by_name",
        action="store_true",
        help="only merge intervals with the same name [default=%default]")

    parser.add_option(
        "--merge-and-resolve-blocks",
        dest="resolve_blocks",
        action="store_true",
        help="When merging bed12 entrys, should blocks be resolved?")

    parser.add_option("--merge-stranded",
                      dest="stranded",
                      action="store_true",
                      help="Only merge intervals on the same strand")

    parser.add_option(
        "--remove-inconsistent-names",
        dest="remove_inconsistent_names",
        action="store_true",
        help="when merging, do not output intervals where the names of "
        "overlapping intervals do not match [default=%default]")

    parser.add_option("--offset",
                      dest="offset",
                      type="int",
                      help="offset for shifting intervals [default=%default]")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-b",
                      "--bam-file",
                      dest="bam_file",
                      type="string",
                      help="bam-formatted filename with genome.")

    parser.add_option("--filter-names-file",
                      dest="names",
                      type="string",
                      help="list of names to keep. One per line")

    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        merge_by_name=False,
                        genome_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000,
                        remove_inconsistent_names=False,
                        resolve_blocks=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    contigs = None

    # Why provide full indexed genome, when a tsv of contig sizes would do?
    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.bam_file:
        samfile = pysam.AlignmentFile(options.bam_file)
        contigs = dict(list(zip(samfile.references, samfile.lengths)))

    processor = Bed.iterator(options.stdin)

    for method in options.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)
        elif method == "merge":
            processor = merge(
                processor,
                options.merge_distance,
                by_name=options.merge_by_name,
                min_intervals=options.merge_min_intervals,
                remove_inconsistent=options.remove_inconsistent_names,
                resolve_blocks=options.resolve_blocks,
                stranded=options.stranded)
        elif method == "bins":
            if options.bin_edges:
                bin_edges = list(map(float, options.bin_edges.split(",")))
                # IMS: check bin edges are valid
                if not (len(bin_edges) == options.num_bins + 1):
                    raise ValueError(
                        "Number of bin edge must be one more than "
                        "number of bins")
            else:
                bin_edges = None
            processor, bin_edges = Bed.binIntervals(
                processor,
                num_bins=options.num_bins,
                method=options.binning_method,
                bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))

        elif method == "block":
            processor = Bed.blocked_iterator(processor)
        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(processor,
                                       contigs,
                                       offset=options.offset)
        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            processor = extendInterval(processor, contigs, options.offset)
        elif method == "filter-names":
            if not options.names:
                raise ValueError("please supply list of names to filter")
            names = [name.strip() for name in open(options.names)]
            processor = filterNames(processor, names)

    noutput = 0
    for bed in processor:
        options.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.Stop()
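The methods above are chained as generators over the stdin stream, so they apply in the order given on the command line. A hypothetical invocation that merges nearby intervals and then removes intervals falling off contig ends (file names and genome prefix are illustrative):

import subprocess

with open("raw.bed") as inf, open("clean.bed", "w") as outf:
    subprocess.check_call(
        ["python", "bed2bed.py",
         "--method", "merge", "--merge-distance", "100",
         "--method", "filter-genome", "--genome-file", "hg19"],
        stdin=inf, stdout=outf)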
Example #42
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.read_map(
            IOTools.open_file(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome,
                                                  "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
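A hypothetical invocation that strips reads overlapping blacklisted regions and keeps only uniquely mapping reads, writing sam to stdout:

import subprocess

with open("filtered.sam", "w") as outf:
    subprocess.check_call(
        ["python", "bams2bam.py",
         "--filename-regions", "blacklist.bed.gz",
         "--unique", "--output-sam",
         "input.bam"],
        stdout=outf)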
Example #43
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m",
                      "--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode",
                      dest="output_mode",
                      type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length",
                      dest="min_length",
                      type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length",
                      dest="max_length",
                      type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at",
        dest="extend_at",
        type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand",
        dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [
                fasta.getSequence(bed.contig, strand, start, end)
                for start, end in bed.toIntervals()
            ]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals"
              or options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.stop()
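
A quick worked example of the "leftright" window arithmetic used above (plain Python, toy coordinates, no CGAT imports needed):

bed_start, bed_end, lcontig = 100, 200, 1000
l = bed_end - bed_start                             # interval length: 100
left = (max(0, bed_start - l), bed_end - l)         # (0, 100): upstream flank
right = (bed_start + l, min(lcontig, bed_end + l))  # (200, 300): downstream flank
assert left == (0, 100) and right == (200, 300)
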
Example #44
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--min-overlap", dest="min_overlap",
                      type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-a", "--bam-file", dest="filename_bam",
                      metavar="bam", type="string",
                      help="bam-file to use (required) [%default]")

    parser.add_option("-b", "--bed-file", dest="filename_bed",
                      metavar="bed", type="string",
                      help="bed-file to use (required) [%default]")

    parser.add_option(
        "-s", "--sort-bed", dest="sort_bed",
        action="store_true",
        help="sort the bed file by chromosomal location before "
        "processing. "
        "[%default]")

    parser.add_option(
        "--assume-sorted", dest="sort_bed",
        action="store_false",
        help="assume that the bed-file is sorted by chromosomal location. "
        "[%default]")

    parser.add_option(
        "--split-intervals", dest="split_intervals",
        action="store_true",
        help="treat split BAM intervals, for example spliced intervals, "
        "as separate intervals. Note that a single alignment might be "
        "counted several times as a result. "
        "[%default]")

    parser.set_defaults(
        min_overlap=0.5,
        filename_bam=None,
        filename_bed=None,
        sort_bed=True,
        split_intervals=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file
    for bed in Bed.iterator(IOTools.openFile(filename_bed)):
        ncolumns_bed = bed.columns
        break
    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.Samfile(filename_bam, "rb")
        total = samfile.mapped
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.getNumLines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.openFile(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name",
        "score", "strand", "thickstart", "thickend", "rgb",
        "blockcount", "blockstarts", "blockends"][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2",
        "score2", "strand2", "thickstart2", "thickend2", "rgb2",
        "blockcount2", "blockstarts2", "blockends2"][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    # SNS: sorting optional, off by default
    if options.sort_bed:
        bedcmd = "<( zcat %s | sort -k1,1 -k2,2n)" % filename_bed
    else:
        bedcmd = filename_bed

    if options.split_intervals:
        split = "-split"
    else:
        split = ""

    # IMS: newer versions of intersectBed have a very high memory
    #      requirement unless passed sorted bed files.
    statement = """bedtools intersect %(format)s %(filename_bam)s
    -b %(bedcmd)s
    %(split)s
    -sorted -bed -wo -f %(min_overlap)f""" % locals()

    E.info("starting counting process: %s" % statement)
    proc = E.run(statement,
                 return_popen=True,
                 stdout=subprocess.PIPE)

    E.info("counting")
    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate(infile):
        for line in infile:
            if not line.strip():
                continue
            yield data._make(line[:-1].split()[:take_columns])

    for read, overlaps in itertools.groupby(
            iterate(IOTools.force_str(proc.stdout)), key=sort_key):
        annotations = [x.name2 for x in overlaps]
        for anno in annotations:
            counts_per_alignment[anno] += 1

    for key, counts in sorted(counts_per_alignment.items()):
        options.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.Stop()
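
The counting loop above assumes the intersectBed output arrives grouped by the sort key; a minimal self-contained illustration of the same groupby pattern, using made-up (read, annotation) pairs in place of the parsed bed records:

import collections
import itertools

rows = [("read1", "promoter"), ("read1", "exon"), ("read2", "exon")]
counts_per_alignment = collections.defaultdict(int)
for read, overlaps in itertools.groupby(rows, key=lambda x: x[0]):
    # every annotation overlapped by this read contributes one count
    for _, name2 in overlaps:
        counts_per_alignment[name2] += 1
assert counts_per_alignment == {"promoter": 1, "exon": 2}
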
Example #45
def buildResults(bedfile, fg_file, control_file, counter, options):
    '''compute densities and peakshape parameters.'''

    options.stdout.write("\t".join(
        ("contig", "start", "end", "name",
         "\t".join(_bam2peakshape.PeakShapeResult._fields))) + "\n")

    if options.window_size:
        # bins are centered at peak-center and then stretching outwards.
        bins = numpy.arange(-options.window_size + options.bin_size // 2,
                            +options.window_size, options.bin_size)

    #contigs = set(pysam_in.references)

    strand_specific = options.strand_specific

    result = []
    c = E.Counter()
    c.input = 0

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        c.input += 1

        #if bed.contig not in contigs:
        #    c.skipped += 1
        #    continue

        if c.input % options.report_step == 0:
            E.info("iteration: %i" % c.input)

        features = counter.countInInterval(
            fg_file,
            bed.contig,
            bed.start,
            bed.end,
            window_size=options.window_size,
            bins=bins,
            only_interval=options.only_interval,
            centring_method=options.centring_method)

        if control_file:
            control = counter.countAroundPos(control_file,
                                             bed.contig,
                                             features.peak_center,
                                             bins=features.bins)

        else:
            control = None

        if options.random_shift:
            direction = numpy.random.randint(0, 2)
            if direction:
                pos = features.peak_center + 2 * bins[0]
            else:
                pos = features.peak_center + 2 * bins[-1]
            shifted = counter.countAroundPos(fg_file,
                                             bed.contig,
                                             pos,
                                             bins=features.bins)
        else:
            shifted = None

        if strand_specific and bed.strand == "-":
            features._replace(hist=hist[::-1])
            if control: control._replace(hist=hist[::-1])
            if shifted: shift._replace(hist=hist[::-1])

        result.append((features, bed, control, shifted))
        c.added += 1

    E.info("interval processing: %s" % c)

    return result, bins
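
For reference, the bins constructed above are positioned symmetrically around the peak centre; e.g. with window_size=10 and bin_size=5 (toy values, assuming both options are set):

import numpy
bins = numpy.arange(-10 + 5 // 2, 10, 5)
# array([-8, -3,  2,  7]): bin coordinates relative to the peak centre
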
Example #46
def loadIntervalsFromBed(bedfile, track, outfile,
                         bamfiles, offsets):
    '''load intervals from :term:`bed` formatted files into database.

    Re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned. In particular, the meaning of the
    columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    '''

    tmpfile = P.getTempFile(".")

    headers = ("AvgVal", "DisttoStart", "GeneList", "Length", "PeakCenter", "PeakVal", "Position",
               "interval_id", "nCpGs", "nGenes", "nPeaks", "nProbes", "nPromoters", "contig", "start", "end")

    tmpfile.write("\t".join(headers) + "\n")

    avgval, contig, disttostart, end, genelist, length, peakcenter, \
        peakval, position, start, interval_id, ncpgs, ngenes, npeaks, \
        nprobes, npromoters = 0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    mlength = int(PARAMS["calling_merge_min_interval_length"])

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(IOTools.openFile(infile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        # remove very short intervals
        if bed.end - bed.start < mlength:
            c.skipped_length += 1
            continue

        if bamfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelineChipseq.countPeaks(
                    bed.contig, bed.start, bed.end,
                    bamfiles, offsets)

            # nreads can be 0 if the intervals overlap only slightly
            # and due to the binning, no reads are actually in the
            # overlap region.  However, most of these intervals should
            # be small and have already be deleted via the
            # merge_min_interval_length cutoff.  do not output
            # intervals without reads.
            if nprobes == 0:
                c.skipped_reads += 1
                continue

        else:
            npeaks, peakcenter, length, avgval, peakval, nprobes = (
                1,
                bed.start +
                (bed.end - bed.start) // 2,
                bed.end - bed.start,
                1,
                1,
                1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             ncpgs, ngenes, npeaks, nprobes, npromoters,
             bed.contig, bed.start, bed.end))) + "\n")

    if c.output == 0:
        E.warn("%s - no intervals")

    tmpfile.close()

    tmpfilename = tmpfile.name
    tablename = "%s_intervals" % track.asTable()

    statement = '''
    cgat csv2db %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
Example #47
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gff2histogram.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size."  )
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")
    parser.add_option("--no-empty-bins", dest="no_empty_bins", action="store_true",
                      help="do not display empty bins.")
    parser.add_option("--with-empty-bins", dest="no_empty_bins", action="store_false",
                      help="display empty bins.")
    parser.add_option("--ignore-out-of-range", dest="ignore_out_of_range", action="store_true",
                      help="ignore values that are out of range (as opposed to truncating them to range border.")
    parser.add_option("--missing", dest="missing_value", type="string",
                      help="entry for missing values [%default]." )
    parser.add_option("--dynamic-bins", dest="dynamic_bins", action="store_true",
                      help="each value constitutes its own bin." )
    parser.add_option( "--format", dest="format", type="choice", 
                       choices=( "gff", "gtf", "bed" ),
                       help="input file format [%default].")
    parser.add_option( "--method", dest="methods", type="choice", action="append",
                       choices=( "all", "hist", "stats", "overlaps", "values" ),
                       help="methods to apply [%default].")
    parser.add_option( "--data", dest="data", type="choice",
                       choices=( "all", "size", "distance" ),
                       help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins = True,
        bin_size = None,
        dynamic_bins = False,
        ignore_out_of_range = False,
        min_value = None,
        max_value = None,
        nonull = None,
        missing_value = "na",
        output_filename_pattern="%s",
        methods = [],
        data = "all",
        format = "gff",
        )

    (options, args) = E.Start( parser, add_output_options = True )

    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")
        if not options.output_filename_pattern: options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError( "please provide counting method using --method option" )

    if options.format in ( "gff", "gtf" ):
        gffs = GTF.iterator( options.stdin )
    elif options.format == "bed":
        gffs = Bed.iterator( options.stdin )

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        if not options.output_filename_pattern: 
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.openOutputFile( "overlaps" )
    else:
        outfile_overlaps = None

    last = None
    ninput, noverlaps = 0,0
    for this in gffs:
        ninput += 1
        values_within.append( this.end - this.start )

        if last and last.contig == this.contig:
            if this.start < last.end:
                noverlaps += 1
                if outfile_overlaps:
                    outfile_overlaps.write( "%s\t%s\n" % (str(last), str(this)) )
                values_overlaps.append( min(this.end, last.end) - max(last.start, this.start) )
                if this.end > last.end:
                    last = this
                continue
            else:
                values_between.append( this.start - last.end )
                # if this.start - last.end < 10: 
                #     print str(last)
                #     print str(this)
                #     print "=="
                values_overlaps.append( 0 )

        last = this

    if "hist" in options.methods:
        outfile = E.openOutputFile( "hist" )
        h_within = Histogram.Calculate( values_within,
                                        no_empty_bins = options.no_empty_bins,
                                        increment = options.bin_size,
                                        min_value = options.min_value,
                                        max_value = options.max_value,
                                        dynamic_bins = options.dynamic_bins,
                                        ignore_out_of_range = options.ignore_out_of_range )

        h_between = Histogram.Calculate( values_between,
                                         no_empty_bins = options.no_empty_bins,
                                         increment = options.bin_size,
                                         min_value = options.min_value,
                                         max_value = options.max_value,
                                         dynamic_bins = options.dynamic_bins,
                                         ignore_out_of_range = options.ignore_out_of_range )

        if "all" == options.data:
            outfile.write( "residues\tsize\tdistance\n" )
            combined_histogram = Histogram.Combine( [h_within, h_between], missing_value = options.missing_value )
            Histogram.Write( outfile, combined_histogram, nonull = options.nonull )        
        elif options.data == "size":
            outfile.write( "residues\tsize\n" )
            Histogram.Write( outfile, h_within, nonull = options.nonull )        
        elif options.data == "distance":
            outfile.write( "residues\tdistance\n" )
            Histogram.Write( outfile, h_between, nonull = options.nonull )        

        outfile.close()

    if "stats" in options.methods:
        outfile = E.openOutputFile( "stats" )
        outfile.write( "data\t%s\n" % Stats.Summary().getHeader() )
        if options.data in ("size", "all"):
            outfile.write( "size\t%s\n" % str(Stats.Summary(values_within)) )
        if options.data in ("distance", "all"):
            outfile.write( "distance\t%s\n" % str(Stats.Summary(values_between)) )
        outfile.close()

    if "values" in options.methods:
        outfile = E.openOutputFile( "distances" )
        outfile.write( "distance\n%s\n" % "\n".join( map(str, values_between) ) )
        outfile.close()
        outfile = E.openOutputFile( "sizes" )
        outfile.write( "size\n%s\n" % "\n".join( map(str, values_within) ) )
        outfile.close()
        outfile = E.openOutputFile( "overlaps" )
        outfile.write( "overlap\n%s\n" % "\n".join( map(str, values_overlaps) ) )
        outfile.close()

    E.info( "ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" % (ninput, 
                                                                len(values_between),
                                                                len(values_within),
                                                                noverlaps) )

    E.Stop()
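
A worked example of the interval bookkeeping above, for features on the same contig (made-up coordinates):

# overlapping case: last = (10, 20), this = (15, 30)
overlap = min(30, 20) - max(10, 15)   # 5 -> appended to values_overlaps
# non-overlapping case: last = (10, 20), this = (25, 30)
distance = 25 - 20                    # 5 -> appended to values_between
size = 30 - 25                        # 5 -> this interval's size, values_within
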
Example #48
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
                            usage=globals()["__doc__"])

    # IMS: new method: extend intervals by set amount
    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=("merge", "filter-genome", "bins",
                               "block", "sanitize-genome", "shift", "extend"),
                      help="method to apply [default=%default]")

    parser.add_option("--num-bins", dest="num_bins", type="int",
                      help="number of bins into which to merge (used for method `bins) [default=%default]")

    parser.add_option("--bin-edges", dest="bin_edges", type="string",
                      help="bin_edges for binning method [default=%default]")

    parser.add_option("--binning-method", dest="binning_method", type="choice",
                      choices=(
                          "equal-bases", "equal-intervals", "equal-range"),
                      help="method used for binning (used for method `bins` if no bin_edges is given) [default=%default]")

    parser.add_option("--merge-distance", dest="merge_distance", type="int",
                      help="distance in bases over which to merge that are not directly adjacent [default=%default]")

    parser.add_option("--merge-min-intervals", dest="merge_min_intervals", type="int",
                      help="only output merged intervals that are build from at least x intervals [default=%default]")

    parser.add_option("--merge-by-name", dest="merge_by_name", action="store_true",
                      help="only merge intervals with the same name [default=%default]")

    parser.add_option("--remove-inconsistent", dest="remove_inconsistent", action="store_true",
                      help="when merging, do not output intervals where the names of overlapping intervals "
                      "do not match [default=%default]")

    parser.add_option("--offset", dest="offset",  type="int",
                      help="offset for shifting intervals [default=%default]")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-b", "--bam-file", dest="bam_file", type="string",
                      help="bam-formatted filename with genome.")

    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        merge_by_name=False,
                        genome_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000,
                        remove_inconsistent=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    contigs = None

    # Why provide full indexed genome, when a tsv of contig sizes would do?
    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.bam_file:
        samfile = pysam.Samfile(options.bam_file)
        contigs = dict(zip(samfile.references, samfile.lengths))

    processor = Bed.iterator(options.stdin)

    for method in options.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)
        elif method == "merge":
            processor = merge(processor,
                              options.merge_distance,
                              by_name=options.merge_by_name,
                              min_intervals=options.merge_min_intervals,
                              remove_inconsistent=options.remove_inconsistent)
        elif method == "bins":
            if options.bin_edges:
                bin_edges = map(float, options.bin_edges.split(","))
                # IMS: check bin edges are valid
                if not(len(bin_edges) == options.num_bins + 1):
                    raise ValueError(
                        "Number of bin edge must be one more than number of bins")
            else:
                bin_edges = None
            processor, bin_edges = Bed.binIntervals(processor,
                                                    num_bins=options.num_bins,
                                                    method=options.binning_method,
                                                    bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))

        elif method == "block":
            processor = Bed.blocked_iterator(processor)
        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(
                processor, contigs, offset=options.offset)
        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            processor = extendInterval(processor, contigs, options.offset)

    noutput = 0
    for bed in processor:
        options.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.Stop()
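
Each method above wraps processor in a further generator, so the filters compose lazily and stream the bed records; a hypothetical extra stage in the same style (filterBySize is illustrative and not part of the script):

def filterBySize(iterator, min_size):
    # drop intervals shorter than min_size bases
    for bed in iterator:
        if bed.end - bed.start >= min_size:
            yield bed

# processor = filterBySize(processor, min_size=50)
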
Example #49
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("--bed-file",
                      dest="infiles",
                      type="string",
                      metavar="bed",
                      help="supply list of bed files",
                      action="append")

    parser.set_defaults(infiles=[])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    options.infiles.extend(args)
    if len(options.infiles) == 0:
        raise ValueError('please provide at least 1 bed file')

    E.info("concatenating bed files")
    # concatenate the list of files
    tmp = tempfile.NamedTemporaryFile(delete=False, mode="w")
    tmp_merge = tempfile.NamedTemporaryFile(delete=False, mode="w")
    infs = options.infiles
    for inf in infs:
        for bed in Bed.iterator(IOTools.open_file(inf)):
            tmp.write("%s\n" % bed)
    tmp.close()

    E.info("merging bed entries")
    # merge the bed entries in the file
    name = tmp.name
    tmp_bed = pybedtools.BedTool(name)
    tmp_bed.sort().merge().saveas(tmp_merge.name)
    tmp_merge.close()

    E.info("indexing bed entries")
    # index the bed entries
    merged = IndexedGenome.Simple()
    for bed in Bed.iterator(IOTools.open_file(tmp_merge.name)):
        merged.add(bed.contig, bed.start, bed.end)

    counts = collections.defaultdict(int)
    # list of samples
    samples = options.infiles

    E.info("counting no. samples overlapping each interval")
    for sample in samples:
        found = set()
        for bed in Bed.iterator(IOTools.open_file(sample)):
            if merged.contains(bed.contig, bed.start, bed.end):
                key = [bed.contig] + \
                    [x for x in merged.get(bed.contig, bed.start, bed.end)]
                key = (key[0], key[1][0], key[1][1])
                if key in found:
                    continue
                found.add(key)

                # tuple of interval description as key - (contig, start, end)
                counts[key] += 1

    # open outfile
    options.stdout.write("contig\tstart\tend\tcount\n")

    E.info("outputting result")
    for interval, count in sorted(counts.items()):
        options.stdout.write("\t".join(map(str, interval)) + "\t" +
                             str(count) + "\n")

    # write footer and output benchmark information.
    E.stop()
Example #50
def main(argv=sys.argv):

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option("-n", "--new-instance", dest="new_instance",
                      action="store_true",
                      help="create a new IGV instance [%default]")

    parser.add_option("-s", "--session", dest="session",
                      type="string",
                      help="load session before creating plots "
                      "[%default]")

    parser.add_option("-d", "--snapshot-dir", dest="snapshotdir",
                      type="string",
                      help="directory to save snapshots in [%default]")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("png", "eps", "svg"),
                      help="output file format [%default]")

    parser.add_option("-o", "--host", dest="host", type="string",
                      help="host that IGV is running on [%default]")

    parser.add_option("-p", "--port", dest="port", type="int",
                      help="port that IGV listens at [%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend each interval by a number of bases "
                      "[%default]")

    parser.add_option("-x", "--expand", dest="expand", type="float",
                      help="expand each region by a certain factor "
                      "[%default]")

    parser.add_option("--session-only", dest="session_only",
                      action="store_true",
                      help="plot session after opening, "
                      "ignore intervals "
                      "[%default]")

    parser.add_option("--keep", dest="keep_open",
                      action="store_true",
                      help="keep a newly created IGV session open "
                      "[%default]")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        keep_open=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    igv_process = None
    if options.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=options.command,
                                   port=options.port)
        E.info("new IGV process started")

    E.info("connection to process on %s:%s" % (options.host, options.port))
    E.info("saving images in %s" % options.snapshotdir)
    igv = IGV.IGV(host=options.host,
                  port=options.port,
                  snapshot_dir=os.path.abspath(options.snapshotdir))

    if options.session:
        E.info('loading session from %s' % options.session)
        igv.load(options.session)
        E.info('loaded session')

    if options.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(options.session), options.format)
        E.info("writing snapshot to '%s'" %
               os.path.join(options.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in Bed.iterator(options.stdin):

            c.input += 1

            # IGV cannot deal with white-space in filenames
            name = re.sub(r"\s", "_", bed.name)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = options.extend
            if options.expand:
                d = end - start
                extend = max(extend, (options.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = "%s.%s" % (name, options.format)
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not options.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.Stop()
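
A worked example of the --expand arithmetic above (toy coordinates):

start, end, expand = 1000, 2000, 2.0
d = end - start                  # 1000
extend = (expand * d - d) // 2   # 500.0 extra bases on each side
# the region sent to IGV becomes 500-2500, twice the original width
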
Example #51
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no, 3', 5' or both ends. If 3only or 5only are set, only the added sequence is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--use-strand", dest="ignore_strand", action="store_false",
                      help="use strand information and return reverse complement [default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
Example #52
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
Example #53
def readSegments(infile, indexed_workspace,
                 truncate=False,
                 format="gtf",
                 keep_ambiguous=False,
                 remove_overhangs=False):
    """read segments from infile.

    segments not overlapping with indexed_workspace are removed.

    If :attr: truncate is given, segments extending beyond the workspace
    are truncated.

    returns a list of segments for each contig in a dictionary
    """
    counter = E.Counter()

    segments = collections.defaultdict(list)

    def addSegment(contig, start, end, counter):
        if contig in indexed_workspace:
            r = indexed_workspace[contig].find(start, end)
            if not r:
                counter.nskipped += 1
                return
            if len(r) > 1:
                counter.nambiguous += 1
                if not keep_ambiguous:
                    return
            if truncate:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.ntruncated += 1
                    segments[contig].append((rstart, rend))
                    counter.added += 1
            elif remove_overhangs:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.overhangs += 1
                        break
                else:
                    segments[contig].append((start, end))
            else:
                segments[contig].append((start, end))
                counter.added += 1

            counter.nkept += 1

    if format == "gtf":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(infile))

        for gene in gtf_iterator:
            # get start and end ignoring introns
            # contig, start, end = gene[0].contig, min( [x.start for x in gene] ), max( [x.end for x in gene] )

            contig, coords = gene[0].contig, [(x.start, x.end) for x in gene]
            counter.ninput += 1
            for start, end in coords:
                addSegment(contig, start, end, counter)

    elif format == "bed":
        bed_iterator = Bed.iterator(infile)
        for bed in bed_iterator:
            counter.ninput += 1
            addSegment(bed.contig, bed.start, bed.end, counter)

    E.info("read segments: %s" % str(counter))

    return segments
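
A minimal sketch of driving readSegments, assuming a workspace index whose per-contig entries offer find(start, end) returning objects with start/end attributes; the stand-in classes below are illustrative only, not the real CGAT index:

class ToyInterval:
    def __init__(self, start, end):
        self.start, self.end = start, end

class ToyIndex:
    def __init__(self, intervals):
        self.intervals = [ToyInterval(s, e) for s, e in intervals]

    def find(self, start, end):
        # return all stored intervals overlapping [start, end)
        return [x for x in self.intervals
                if start < x.end and end > x.start]

workspace = {"chr1": ToyIndex([(0, 1000)])}
# segments = readSegments(IOTools.openFile("segments.bed.gz"), workspace,
#                         truncate=True, format="bed")
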
Example #54
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--min-overlap",
                      dest="min_overlap",
                      type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-a",
                      "--bam-file",
                      dest="filename_bam",
                      metavar="bam",
                      type="string",
                      help="bam-file to use (required) [%default]")

    parser.add_option("-b",
                      "--bed-file",
                      dest="filename_bed",
                      metavar="bed",
                      type="string",
                      help="bed-file to use (required) [%default]")

    parser.add_option("-s",
                      "--sort-bed",
                      dest="sort_bed",
                      action="store_true",
                      help="sort the bed file by chromosomal location before "
                      "processing. "
                      "[%default]")

    parser.add_option(
        "--assume-sorted",
        dest="sort_bed",
        action="store_false",
        help="assume that the bed-file is sorted by chromosomal location. "
        "[%default]")

    parser.add_option(
        "--split-intervals",
        dest="split_intervals",
        action="store_true",
        help="treat split BAM intervals, for example spliced intervals, "
        "as separate intervals. Note that a single alignment might be "
        "counted several times as a result. "
        "[%default]")

    parser.set_defaults(
        min_overlap=0.5,
        filename_bam=None,
        filename_bed=None,
        sort_bed=True,
        split_intervals=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file
    for bed in Bed.iterator(IOTools.open_file(filename_bed)):
        ncolumns_bed = bed.columns
        break
    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.AlignmentFile(filename_bam, "rb")
        total = samfile.mapped
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.get_num_lines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.open_file(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name", "score", "strand", "thickstart",
        "thickend", "rgb", "blockcount", "blockstarts", "blockends"
    ][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2", "score2", "strand2",
        "thickstart2", "thickend2", "rgb2", "blockcount2", "blockstarts2",
        "blockends2"
    ][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    # SNS: sorting optional, off by default
    if options.sort_bed:
        bedcmd = "<( gunzip < %s | sort -k1,1 -k2,2n)" % filename_bed
    else:
        bedcmd = filename_bed

    if options.split_intervals:
        split = "-split"
    else:
        split = ""

    # IMS: newer versions of intersectBed have a very high memory
    #      requirement unless passed sorted bed files.
    statement = """bedtools intersect %(format)s %(filename_bam)s
    -b %(bedcmd)s
    %(split)s
    -sorted -bed -wo -f %(min_overlap)f""" % locals()

    E.info("starting counting process: %s" % statement)
    proc = E.run(statement, return_popen=True, stdout=subprocess.PIPE)

    E.info("counting")
    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate(infile):
        for line in infile:
            if not line.strip():
                continue
            yield data._make(line[:-1].split()[:take_columns])

    for read, overlaps in itertools.groupby(
            iterate(IOTools.force_str(proc.stdout)), key=sort_key):
        annotations = [x.name2 for x in overlaps]
        for anno in annotations:
            counts_per_alignment[anno] += 1

    for key, counts in sorted(counts_per_alignment.items()):
        options.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.stop()
Example #55
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    parser = buildOptionParser(argv)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError(
            "please specify one bam- or wig-file and one bed file")

    if options.control_files:
        E.info("using control files: %s" % ",".join(options.control_files))

    infile, bedfile = args
    control_files = []

    if options.format == "bigwig":
        fg_file = pyBigWig.open(infile)
        for control_file in options.control_files:
            control_files.append(pyBigWig.open(control_file))
        counter = _bam2peakshape.CounterBigwig(
            smooth_method=options.smooth_method)

    elif options.format == "bam":
        fg_file = pysam.AlignmentFile(infile, "rb")
        for control_file in options.control_files:
            control_files.append(pysam.AlignmentFile(control_file, "rb"))
        counter = _bam2peakshape.CounterBam(
            shift=options.shift, smooth_method=options.smooth_method)

    features_per_interval, bins = buildDensityMatrices(
        Bed.iterator(IOTools.open_file(bedfile)),
        fg_file,
        control_files,
        counter,
        window_size=options.window_size,
        bin_size=options.bin_size,
        strand_specific=options.strand_specific,
        centring_method=options.centring_method,
        use_interval=options.use_interval,
        random_shift=options.random_shift,
        smooth_method=options.smooth_method,
        report_step=options.report_step)

    if len(features_per_interval) == 0:
        E.warn("no data - no output")
        E.stop()
        return

    outputFeatureTable(options.stdout, features_per_interval, bins)

    # apply normalization
    # Note: does not normalize control?
    # Needs reworking, currently it does not normalize across
    # all samples nor does the work "sum" reflect the per million
    # normalization.
    if options.normalization == "sum":
        E.info("starting sum normalization")
        # get total counts across all intervals
        norm = 0.0
        for foreground, bed, controls, shifted in features_per_interval:
            norm += sum(foreground.counts)
        # per million
        norm /= float(1000000)
        E.info("sum/million normalization with %f" % norm)

        # normalise
        new_data = []
        for foreground, bed, controls, shifted in features_per_interval:

            foreground = foreground._replace(
                counts=numpy.array(foreground.counts, dtype=float) /
                norm)
            new_controls = []
            for control in controls:
                new_controls.append(
                    control._replace(
                        counts=numpy.array(control.counts, dtype=float) /
                        norm))
            if shifted:
                shifted = shifted._replace(
                    counts=numpy.array(shifted.counts, dtype=float) /
                    norm)
            new_data.append(
                IntervalData._make((foreground, bed, new_controls, shifted)))
        features_per_interval = new_data
    else:
        E.info("no normalization performed")

    # center bins
    out_bins = bins[:-1] + options.bin_size

    # build tracks
    def _toTrack(filename):
        return os.path.splitext(os.path.basename(filename))[0]

    outputMatrices(features_per_interval,
                   out_bins,
                   foreground_track=_toTrack(infile),
                   control_tracks=[_toTrack(x) for x in options.control_files],
                   shifted=options.random_shift,
                   sort_orders=options.sort_orders)

    # write footer and output benchmark information.
    E.stop()
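
A minimal sketch of the sum normalization above, assuming plain numpy
arrays stand in for the per-interval count vectors (all names and values
are illustrative):

import numpy

# toy count vectors for three intervals
counts_per_interval = [numpy.array([10.0, 20.0, 30.0]),
                       numpy.array([5.0, 5.0, 0.0]),
                       numpy.array([1.0, 2.0, 3.0])]

# total counts across all intervals, scaled to "per million"
norm = sum(c.sum() for c in counts_per_interval) / 1000000.0

# every vector is divided by the same factor
normalized = [c / norm for c in counts_per_interval]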
Example #56
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--transcripts-gtf-file) to transcript names "
        "(used by --gtf-file) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_regions=None,
        filename_stats=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.Samfile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.readMap(
            IOTools.openFile(options.filename_map), has_header=True)
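        # invert the map: the file gives transcript number -> name, while
        # the lookup below goes from gtf transcript name to number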
        id_map = dict([(y, x) for x, y in id_map.iteritems()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.openFile(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.Samfile(options.filename_transcriptome,
                                            "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.Samfile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.Samfile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.Samfile(options.filename_mismapped,
                                         "wb",
                                         template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.Samfile(options.filename_junctions,
                                          "rb")
    else:
        junctions_samfile = None

    c = _bams2bam.filter(genome_samfile,
                         output_samfile,
                         output_mismapped,
                         transcripts_samfile,
                         junctions_samfile,
                         transcripts,
                         regions=regions_to_remove,
                         unique=options.unique,
                         remove_contigs=options.remove_contigs,
                         colour_mismatches=options.colour_mismatches,
                         ignore_mismatches=options.ignore_mismatches,
                         ignore_transcripts=transcripts_samfile is None,
                         ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.openFile(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.Stop()
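
A small usage sketch for the region index built above, assuming the CGAT
IndexedGenome module is importable (coordinates are illustrative):

import IndexedGenome

regions = IndexedGenome.Simple()
regions.add("chr1", 1000, 2000)
regions.add("chr2", 500, 800)

# contains() reports whether an interval overlaps any indexed region;
# the filter above presumably queries the index in the same way to drop
# reads falling into removed regions
print(regions.contains("chr1", 1500, 1600))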
Example #57
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: bed2annotator2tsv.py 2885 2010-04-07 08:46:50Z andreas $", 
                             usage = globals()["__doc__"])
    
    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome."  )

    parser.add_option( "-f", "--features", dest="features", type="string", 
                       help="feature to collect [default=None]."  )

    parser.add_option( "-i", "--files", dest="files", action="append",
                       help="use multiple annotations [default=None]."  )

    parser.add_option(  "-a", "--annotations", dest="annotations", type="string", 
                       help="aggregate name for annotations if only single file is provided from STDIN [default=None]."  )

    parser.add_option(  "--input-filename-map", dest="input_filename_map", type="string", 
                       help="filename with a map of gene_ids to categories [default=None]."  )

    parser.add_option( "-l", "--max-length", dest="max_length", type="string", 
                       help="maximum segment length [default=None]."  )

    parser.add_option( "-m", "--merge", dest="merge", action="store_true",
                       help="merge overlapping bed segments [default=%default]."  )

    parser.add_option( "-s", "--section", dest="section", type="choice", 
                       choices=("segments", "annotations", "workspace" ),
                       help="annotator section [default=None]."  )

    parser.add_option( "--subset", dest="subsets", type="string", action="append",
                       help="add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."  )

    parser.set_defaults(
        genome_file = None,
        features = None,
        remove_random = True,
        section = "segments",
        annotations = "annotations",
        max_length = 100000,
        files = [],
        subsets = [],
        input_filename_map = None,
        merge = False,
        )

    (options, args) = E.Start( parser )

    options.files += args
    if len(options.files) == 0: options.files.append("-")
    options.files = list( itertools.chain( *[ re.split( "[,; ]+", x) for x in options.files ] ) )

    if options.subsets:
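        # each subset spec is filename.gff,label,filename.ids;
        # group the (label, ids) pairs by gff filename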
        subsets = collections.defaultdict( list )
        for s in options.subsets: 
            filename_gff,label,filename_ids = s.split( "," )
            subsets[filename_gff].append( (label,filename_ids) )
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section == "annotations":
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    if options.max_length:
        max_length = options.max_length
    else:
        max_length = 0

    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    if options.section in ("annotations"):
        contigs = set()
        it = itertools.groupby( Bed.iterator( options.stdin ), key=lambda x: x.track["name"])

        map_track2segments = {}
        for track, beds in it:
            ntracks += 1
            map_track2segments[track] = []
            first_segment = nsegments

            beds = list(beds)

            if options.merge: beds = Bed.merge( beds )

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if options.remove_random and "random" in contig: continue
                
                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                map_track2segments[track].append( nsegments )
                options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                nsegments += 1
                
            options.stdout.write("##Ann\t%s\t%s\n" % (track, "\t".join( ["%i" % x for x in range(first_segment, nsegments) ] ) ) )
            E.info( "track %s: annotated with %i segments" % (track, nsegments - first_segment) )

        ncontigs = len(contigs)
        E.info( "ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" % (ninput, ntracks, ncontigs, nsegments, ndiscarded))

    E.Stop()
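
A self-contained sketch of the groupby pattern used in the annotations
section above; toy tuples stand in for Bed entries, and the output mirrors
the ##Id/##Ann lines written by the script:

import itertools

# toy (track, contig, start, end) records, already grouped by track
records = [("trackA", "chr1", 0, 100),
           ("trackA", "chr1", 200, 300),
           ("trackB", "chr2", 50, 150)]

prefix = "##Id"
nsegments = 0
for track, beds in itertools.groupby(records, key=lambda x: x[0]):
    first_segment = nsegments
    for _, contig, start, end in beds:
        print("%s\t%i\t%s\t(%i,%i)" % (prefix, nsegments, contig, start, end))
        nsegments += 1
    print("##Ann\t%s\t%s" % (
        track, "\t".join(["%i" % x for x in range(first_segment, nsegments)])))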
Example #58
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--infiles", dest="infiles", type="string",
                      metavar="bed",
                      action="append", help="supply list of bed files")

    parser.set_defaults(infiles=[])
    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    options.infiles.extend(args)
    if len(options.infiles) == 0:
        raise ValueError('please provide at least 1 bed file')

    E.info("concatenating bed files")
    # concatenate the list of files
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp_merge = tempfile.NamedTemporaryFile(delete=False)
    infs = options.infiles
    for inf in infs:
        for bed in Bed.iterator(IOTools.openFile(inf)):
            tmp.write("%s\n" % bed)
    tmp.close()

    E.info("merging bed entries")
    # merge the bed entries in the file
    name = tmp.name
    tmp_bed = pybedtools.BedTool(name)
    # bedtools merge requires position-sorted input
    tmp_bed.sort().merge().saveas(tmp_merge.name)
    tmp_merge.close()

    E.info("indexing bed entries")
    # index the bed entries
    merged = IndexedGenome.Simple()
    for bed in Bed.iterator(IOTools.openFile(tmp_merge.name)):
        merged.add(bed.contig, bed.start, bed.end)

    counts = collections.defaultdict(int)
    # list of samples
    samples = options.infiles

    E.info("counting no. samples overlapping each interval")
    for sample in samples:
        found = set()
        for bed in Bed.iterator(IOTools.openFile(sample)):
            if merged.contains(bed.contig, bed.start, bed.end):
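                # merged.get() yields the (start, end) pairs of merged
                # intervals overlapping this entry; key on the first pair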
                key = [bed.contig] + \
                    [x for x in merged.get(bed.contig, bed.start, bed.end)]
                key = (key[0], key[1][0], key[1][1])
                if key in found:
                    continue
                found.add(key)

                # tuple of interval description as key - (contig, start, end)
                counts[key] += 1

    # open outfile
    options.stdout.write("contig\tstart\tend\tcount\n")

    E.info("outputting result")
    for interval, count in counts.iteritems():
        options.stdout.write(
            "\t".join(map(str, interval)) + "\t" + str(count) + "\n")

    # write footer and output benchmark information.
    E.Stop()
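
A minimal sketch of the concatenate-and-merge step, assuming pybedtools is
installed (the from_string input is illustrative):

import pybedtools

# two overlapping intervals collapse into a single merged interval
raw = pybedtools.BedTool("chr1\t0\t100\nchr1\t50\t200\n", from_string=True)
merged = raw.sort().merge()
print(merged)  # chr1  0  200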
Example #59
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--min-overlap", dest="min_overlap", type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-k", "--keep-temp", dest="keep_temp", action="store_true",
                      help="do not delete temporary files [%default]")

    parser.add_option("-a", "--filename-bam", dest="filename_bam", metavar="bam", type="string",
                      help="bam-file to use [%default]")

    parser.add_option("-b", "--filename-bed", dest="filename_bed", metavar="bam", type="string",
                      help="bed-file to use [%default]")

    parser.set_defaults(
        min_overlap=0.5,
        keep_temp=False,
        filename_bam=None,
        filename_bed=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.close()
    tmpfilename = tmpfile.name

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file
    ncolumns_bed = 0
    for bed in Bed.iterator(IOTools.openFile(filename_bed)):
        ncolumns_bed = bed.columns
        break
    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # determine the format of the first input file (bam or bed)
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.Samfile(filename_bam, "rb")
        total = samfile.mapped
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.getNumLines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.openFile(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = ["contig", "start", "end", "name",
                   "score", "strand", "thickstart", "thickend", "rgb",
                   "blockcount", "blockstarts", "blockends"][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend(["contig2", "start2", "end2", "name2",
                        "score2", "strand2", "thickstart2", "thickend2", "rgb2",
                        "blockcount2", "blockstarts2", "blockends2"][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    # IMS: newer versions of intersectBed have a very high memory requirement unless
    #     passed sorted bed files.
    statement = """intersectBed %(format)s %(filename_bam)s -b <( zcat %(filename_bed)s | sort -k1,1 -k2,2n) -sorted -bed -wo -f %(min_overlap)f > %(tmpfilename)s""" % locals()

    E.info("running %s" % statement)
    retcode = E.run(statement)

    if retcode != 0:
        raise ValueError("error while executing statement %s" % statement)

    infile = open(tmpfilename, "r")
    counts_per_alignment = collections.defaultdict(int)

    E.info("counting")

    take_columns = len(data._fields)

    def iterate(infile):
        for line in infile:
            if not line.strip():
                continue
            yield data._make(line[:-1].split()[:take_columns])

    for read, overlaps in itertools.groupby(iterate(infile), key=sort_key):
        annotations = [x.name2 for x in overlaps]
        for anno in annotations:
            counts_per_alignment[anno] += 1
    infile.close()

    for key, counts in counts_per_alignment.iteritems():
        options.stdout.write("%s\t%i\n" % (key, counts))

    if not options.keep_temp:
        os.unlink(tmpfilename)

    # write footer and output benchmark information.
    E.Stop()
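
A small sketch of the counting step, assuming a few intersectBed -wo
output lines with a bed3 query and a named bed on the annotation side
(field names and values are illustrative):

import collections
import itertools

fields = ["contig", "start", "end",
          "contig2", "start2", "end2", "name2", "bases_overlap"]
data = collections.namedtuple("data", fields)

lines = ["chr1\t0\t100\tchr1\t50\t150\tpromoter\t50",
         "chr1\t0\t100\tchr1\t80\t120\tenhancer\t20"]

counts = collections.defaultdict(int)
rows = [data._make(l.split("\t")[:len(fields)]) for l in lines]

# group successive rows by query interval, then count each annotation once
for read, overlaps in itertools.groupby(
        rows, key=lambda x: (x.contig, x.start, x.end)):
    for x in overlaps:
        counts[x.name2] += 1

for key in sorted(counts):
    print("%s\t%i" % (key, counts[key]))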