Exemplo n.º 1
0
def main(argv=None):
    """Report entries of one bed file that overlap entries of another.

    Command line options are parsed from sys.argv unless *argv* is
    given.  Exactly two positional arguments are required: the query
    bed file (``-`` reads from stdin) and the bed file that is indexed
    and searched.  For every overlapping pair one tab-separated line is
    written to stdout, holding either the full entries or, with
    ``--output-section=name``, only the first optional field of each.

    Raises
    ------
    ValueError
        If the number of positional arguments is not two.
    """

    argv = argv or sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)
    parser.add_argument("--version", action='version', version="1.0")
    parser.add_argument(
        "-o", "--output-section",
        dest="output",
        type=str,
        choices=("full", "name"),
        help="output either ``full`` overlapping entries, only the ``name``s.")
    parser.set_defaults(output="full")

    # add common options (-h/--help, ...) and parse command line
    args, positional = E.start(parser, argv=argv, unknowns=True)

    if len(positional) != 2:
        raise ValueError("two arguments required")

    query_name, reference_name = positional

    # "-" selects stdin for the query intervals
    if query_name == "-":
        query = args.stdin
    else:
        query = iotools.open_file(query_name, "r")

    reference = iotools.open_file(reference_name, "r")
    index = Bed.readAndIndex(reference, with_values=True)

    sink = args.stdout
    names_only = args.output == "name"
    if names_only:
        sink.write("name1\tname2\n")

    def render(entry):
        # either the first optional bed field or the complete entry
        if names_only:
            return entry.fields[0]
        return str(entry)

    for entry in Bed.iterator(query):
        try:
            hits = index[entry.contig].find(entry.start, entry.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for hit in hits:
            # the indexed bed entry is the third element of each hit
            sink.write("\t".join((render(entry), render(hit[2]))) + "\n")

    E.stop()
Exemplo n.º 2
0
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*

    The genome is located through the ``genome_dir`` and ``genome``
    entries of ``P.get_params()``.

    Arguments
    ---------
    infile : string
        Bed file; the track name used in the sequence identifiers is
        *infile* with the ``.bed.gz`` suffix snipped off.
    outfile : string
        Fasta file to write.
    masker :
        Passed through unchanged to ``maskSequences``.
    mode : string
        ``intervals`` exports the sequence of each interval itself.
        ``leftright`` exports, for each interval, the two flanking
        segments of the same length as the interval (clipped at the
        contig boundaries).  Any other value exports nothing.
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    ids, seqs = [], []

    # context managers make sure both handles are released even if
    # sequence retrieval or masking fails half-way through
    with iotools.open_file(outfile, "w") as outs:
        with iotools.open_file(infile) as inf:
            for bed in Bed.setName(Bed.iterator(inf)):
                lcontig = fasta.getLength(bed.contig)

                if mode == "intervals":
                    seqs.append(
                        fasta.getSequence(bed.contig, "+", bed.start, bed.end))
                    ids.append("%s_%s %s:%i..%i" %
                               (track, bed.name, bed.contig,
                                bed.start, bed.end))

                elif mode == "leftright":
                    width = bed.end - bed.start

                    # left flank: ends where the interval starts
                    start, end = max(0, bed.start - width), bed.end - width
                    ids.append("%s_%s_l %s:%i..%i" %
                               (track, bed.name, bed.contig, start, end))
                    seqs.append(
                        fasta.getSequence(bed.contig, "+", start, end))

                    # right flank: starts where the interval ends
                    start, end = (bed.start + width,
                                  min(lcontig, bed.end + width))
                    ids.append("%s_%s_r %s:%i..%i" %
                               (track, bed.name, bed.contig, start, end))
                    seqs.append(
                        fasta.getSequence(bed.contig, "+", start, end))

        masked = maskSequences(seqs, masker)
        outs.write(
            "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))
Exemplo n.º 3
0
def buildQuicksectMask(bed_file):
    '''build a Quicksect index of the regions to mask.

    Every interval read from *bed_file* is added to the index widened
    by one base on either side.  The number of regions added is
    logged.  Returns the populated Quicksect object.
    '''
    quicksect = IndexedGenome.Quicksect()

    region_count = 0
    intervals = Bed.iterator(iotools.openFile(bed_file))
    for region_count, interval in enumerate(intervals, start=1):
        # it is necessary to extend the region to make an accurate mask
        quicksect.add(interval.contig,
                      interval.start - 1,
                      interval.end + 1,
                      1)

    E.info("Built Quicksect mask for %i regions" % region_count)

    return quicksect
Exemplo n.º 4
0
def transcript2bed12(transcript):
    """Convert a transcript (sequence of GTF entries) into a bed12 entry.

    The entry spans the smallest start to the largest end of all
    entries.  The thick region spans the ``CDS`` entries; if there are
    none it collapses onto the first base of the transcript (the end
    coordinate for ``-`` strand transcripts, the start otherwise).
    Blocks are derived from the ``exon`` ranges.  Contig, strand and
    name (``transcript_id``) are taken from the first entry.
    """

    tx_start = min(part.start for part in transcript)
    tx_end = max(part.end for part in transcript)

    coding = [part for part in transcript if part.feature == "CDS"]
    if coding:
        thick_start = min(part.start for part in coding)
        thick_end = max(part.end for part in coding)
    elif transcript[0].strand == "-":
        # no CDS: collapse the thick region onto the first base
        # of the transcript
        thick_start = thick_end = tx_end
    else:
        thick_start = thick_end = tx_start

    exon_ranges = GTF.asRanges(transcript, "exon")
    block_starts = ",".join(
        str(exon_start - tx_start) for exon_start, _ in exon_ranges)
    block_sizes = ",".join(
        str(exon_end - exon_start) for exon_start, exon_end in exon_ranges)

    head = transcript[0]

    bed12 = Bed.Bed()
    bed12.contig = head.contig
    bed12.start = tx_start
    bed12.end = tx_end
    bed12["strand"] = head.strand
    bed12["name"] = head.transcript_id

    bed12["thickStart"] = thick_start
    bed12["thickEnd"] = thick_end

    bed12["blockCount"] = len(exon_ranges)
    bed12["blockStarts"] = block_starts
    bed12["blockSizes"] = block_sizes

    return bed12
Exemplo n.º 5
0
    def __init__(self, filename, *args, **kwargs):
        """Set up the counter and index the intervals in *filename*.

        *filename* is read as a bed file and indexed per track.  For
        every track two column headers are registered:
        ``<track>_nover`` and ``<track>_bases``.  Remaining arguments
        are forwarded to ``Counter.__init__``.
        """

        assert filename is not None, (
            "please supply filename for CounterOverlap")

        Counter.__init__(self, *args, **kwargs)

        self.filename = filename

        E.info("reading intervals from %s" % self.filename)

        bed_handle = iotools.open_file(self.filename, "r")
        self.index = Bed.readAndIndex(bed_handle, per_track=True)

        E.info("read intervals for %s tracks" % len(self.index))

        self.tracks = list(self.index.keys())
        # two output columns per track, in track order
        self.headers = [template % track
                        for track in self.tracks
                        for template in ("%s_nover", "%s_bases")]
Exemplo n.º 6
0
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''tabulate an interval property across several interval sets.

    For every bed file in *infiles* the column *field* of the
    corresponding ``<track>_intervals`` database table is loaded into
    a genomic index.  Each interval in the bed file *reference* is then
    written to *outfile* followed by one column per track holding the
    maximum of the values retrieved for that interval, or an empty
    string if the track has no hit (or no such contig).
    '''

    connection = sqlite3.connect(PARAMS["database_name"])
    query = "SELECT contig, start, end, %(field)s FROM %(tablename)s"

    track_names, indices = [], []
    for bed_name in infiles:
        track_name = P.snip(bed_name, ".bed.gz")
        table = "%s_intervals" % P.tablequote(track_name)
        cursor = connection.cursor()
        cursor.execute(query % {"field": field, "tablename": table})

        genome_index = IndexedGenome.IndexedGenome()
        for contig, start, end, value in cursor:
            genome_index.add(contig, start, end, value)

        indices.append(genome_index)
        track_names.append(track_name)

    def best_value(genome_index, interval):
        # largest third element among the hits, "" when nothing is found
        try:
            hits = list(genome_index.get(interval.contig,
                                         interval.start,
                                         interval.end))
        except KeyError:
            return ""
        if not hits:
            return ""
        return str(max(hit[2] for hit in hits))

    outs = iotools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(track_names) + "\n")

    for interval in Bed.iterator(infile=iotools.openFile(reference, "r")):
        columns = [best_value(genome_index, interval)
                   for genome_index in indices]
        outs.write(str(interval) + "\t" + "\t".join(columns) + "\n")

    outs.close()
Exemplo n.º 7
0
    def _count(self, filename, idx):
        '''count the bed intervals in *filename* against *idx*.

        *idx* maps contig names to objects offering
        ``find(start, end)``, which yields
        ``(start, end, value)`` triples of overlapping intervals.

        Returns a tuple ``(nexons, nexons_overlapping, nbases,
        nbases_overlapping)``: the number of intervals read, how many
        of them overlap at least one indexed interval, the total
        number of bases read and how many of those bases are covered
        by at least one indexed interval.

        Raises ``Exception`` (with the file name in the message) if the
        index lookup fails for any reason other than a missing contig.
        '''

        nexons, nexons_overlapping = 0, 0
        nbases, nbases_overlapping = 0, 0

        infile = iotools.open_file(filename, "r")
        try:
            # iterate over exons
            for this in Bed.bed_iterator(infile):
                nexons += 1
                nbases += this.end - this.start

                try:
                    intervals = list(
                        idx[this.contig].find(max(0, this.start), this.end))
                except KeyError:
                    # contig absent from the index: nothing overlaps
                    continue
                except Exception as msg:
                    raise Exception("error while processing %s, msg=%s" %
                                    (filename, msg)) from msg
                if not intervals:
                    continue

                nexons_overlapping += 1
                start, end = this.start, this.end

                # per-base coverage of this interval; the builtin int
                # replaces the numpy.int alias removed in NumPy 1.24
                counts = numpy.zeros(end - start, int)
                for other_start, other_end, other_value in intervals:
                    lo = max(start, other_start) - start
                    hi = min(end, other_end) - start
                    # guard keeps a non-positive hi from being read as
                    # a wrap-around slice bound
                    if hi > lo:
                        counts[lo:hi] += 1
                nbases_overlapping += int(numpy.count_nonzero(counts))
        finally:
            # release the handle even if the lookup above raised
            infile.close()

        return nexons, nexons_overlapping, nbases, nbases_overlapping
Exemplo n.º 8
0
def getExonLocations(filename):
    '''return a list of exon locations as Bed entries
    from a file containing one ensembl gene ID per line.

    The locations are the ``contig``, ``start`` and ``end`` columns of
    all rows in table ``geneset_cds_gtf`` of the database
    ``PARAMS["annotations_database"]`` whose ``gene_id`` is one of the
    identifiers in *filename*.  Blank lines are ignored.  If the file
    holds no identifiers an empty list is returned without querying
    the database.
    '''
    fh = iotools.openFile(filename, "r")
    try:
        ensembl_ids = [line.strip() for line in fh]
    finally:
        fh.close()
    ensembl_ids = [gene_id for gene_id in ensembl_ids if gene_id]

    n_ids = len(ensembl_ids)
    region_list = []

    if n_ids > 0:
        # Bind the identifiers as parameters instead of pasting them
        # into the statement: values read from a file must not be able
        # to alter the SQL, and a double-quoted value could be taken
        # for a column name by SQLite.
        # NOTE: SQLite limits the number of bound parameters per
        # statement, so very long ID lists may need batching.
        placeholders = ",".join("?" * n_ids)
        statement = ("select contig,start,end from geneset_cds_gtf "
                     "where gene_id in (%s)" % placeholders)

        dbhandle = sqlite3.connect(PARAMS["annotations_database"])
        try:
            cc = dbhandle.cursor()
            cc.execute(statement, ensembl_ids)

            for contig, start, end in cc:
                b = Bed.Bed()
                b.contig, b.start, b.end = contig, start, end
                region_list.append(b)

            cc.close()
        finally:
            dbhandle.close()

    E.info("Retrieved exon locations for %i genes. Got %i regions" %
           (n_ids, len(region_list)))

    return region_list
Exemplo n.º 9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    All bed files (given via ``--bed-file`` and/or as positional
    arguments) are concatenated and merged with bedtools.  For each
    merged interval the number of input files with at least one entry
    inside it is written to stdout as a table with the columns
    ``contig``, ``start``, ``end`` and ``count``.

    Raises
    ------
    ValueError
        If no bed file is supplied.
    """
    # local import: only needed to remove the temporary files
    import os

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--bed-file",
                        dest="infiles",
                        type=str,
                        metavar="bed",
                        help="supply list of bed files",
                        action="append")

    parser.set_defaults(infiles=[])

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    args.infiles.extend(unknown)
    if len(args.infiles) == 0:
        raise ValueError('please provide at least 1 bed file')

    samples = args.infiles

    E.info("concatenating bed files")
    # both files are created with delete=False so that they can be
    # re-opened by name; they are removed explicitly below
    tmp = tempfile.NamedTemporaryFile(delete=False, mode="w")
    tmp_merge = tempfile.NamedTemporaryFile(delete=False, mode="w")
    try:
        # concatenate the list of files
        for inf in samples:
            for bed in Bed.iterator(iotools.open_file(inf)):
                tmp.write("%s\n" % bed)
        tmp.close()

        E.info("merging bed entries")
        # merge the bed entries in the file
        tmp_bed = pybedtools.BedTool(tmp.name)
        tmp_bed.sort().merge().saveas(tmp_merge.name)
        tmp_merge.close()

        E.info("indexing bed entries")
        # index the bed entries
        merged = IndexedGenome.Simple()
        for bed in Bed.iterator(iotools.open_file(tmp_merge.name)):
            merged.add(bed.contig, bed.start, bed.end)
    finally:
        for handle in (tmp, tmp_merge):
            handle.close()
            try:
                os.unlink(handle.name)
            except OSError:
                # best-effort cleanup; a file that is already gone
                # must not mask the original error
                pass

    counts = collections.defaultdict(int)

    E.info("counting no. samples overlapping each interval")
    for sample in samples:
        # merged intervals already credited to this sample
        found = set()
        for bed in Bed.iterator(iotools.open_file(sample)):
            if not merged.contains(bed.contig, bed.start, bed.end):
                continue

            hits = list(merged.get(bed.contig, bed.start, bed.end))
            # tuple of interval description as key - (contig, start, end)
            key = (bed.contig, hits[0][0], hits[0][1])
            if key in found:
                continue
            found.add(key)

            counts[key] += 1

    # write header
    args.stdout.write("contig\tstart\tend\tcount\n")

    E.info("outputting result")
    for interval, count in sorted(counts.items()):
        args.stdout.write("\t".join(map(str, interval)) + "\t" + str(count) +
                          "\n")

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 10
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intervals are read from stdin in gff/gtf or bed format (see
    ``--format``).  Each interval is compared with the previously
    retained interval on the same contig, which presumably requires
    the input to be sorted by contig and start - TODO confirm.  Three
    lists are collected:

    * sizes: ``end - start`` of every interval,
    * distances: gap between consecutive non-overlapping intervals,
    * overlaps: size of the overlap of consecutive intervals
      (0 if they do not overlap).

    Depending on ``--method`` these are written as histograms
    (``hist``), summary statistics (``stats``), raw values
    (``values``) and/or as pairs of overlapping entries
    (``overlaps``).

    Raises
    ------
    ValueError
        If no ``--method`` is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="string",
                      help="bin size.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")

    parser.add_option("--no-empty-bins",
                      dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")

    parser.add_option("--with-empty-bins",
                      dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")

    parser.add_option(
        "--ignore-out-of-range",
        dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating "
        "them to range border.")

    parser.add_option("--missing-value",
                      dest="missing_value",
                      type="string",
                      help="entry for missing values [%default].")

    parser.add_option("--use-dynamic-bins",
                      dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")

    parser.add_option("--format",
                      dest="format",
                      type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")

    parser.add_option("--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")

    parser.add_option("--output-section",
                      dest="output_section",
                      type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    # argv must be handed on explicitly, otherwise a caller-supplied
    # argument list is silently ignored in favour of sys.argv
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.open_output_file("overlaps")
    else:
        outfile_overlaps = None

    # `last` is the interval the next one is compared against; within
    # a run of overlapping intervals it is the one reaching furthest
    last = None
    ninput, noverlaps = 0, 0
    for this in gffs:
        ninput += 1
        values_within.append(this.end - this.start)

        if last and last.contig == this.contig:
            if this.start < last.end:
                noverlaps += 1
                if outfile_overlaps:
                    outfile_overlaps.write("%s\t%s\n" % (str(last), str(this)))
                values_overlaps.append(
                    min(this.end, last.end) - max(last.start, this.start))
                if this.end > last.end:
                    last = this
                continue
            else:
                values_between.append(this.start - last.end)
                values_overlaps.append(0)

        last = this

    # NOTE(review): the "values" method below opens an output file that
    # is also named "overlaps"; requesting both methods looks like a
    # name clash - confirm the intended file names.
    if outfile_overlaps is not None:
        outfile_overlaps.close()

    if "hist" in options.methods:
        outfile = E.open_output_file("hist")
        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram, nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.open_output_file("stats")
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        outfile = E.open_output_file("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.open_output_file("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        outfile = E.open_output_file("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput, len(values_between), len(values_within), noverlaps))

    E.stop()
Exemplo n.º 11
0
def main(argv=sys.argv):
    """Convert bed intervals read from stdin to gff/gtf on stdout.

    Every bed interval becomes one ``exon`` record.  Strand and score
    are taken from the third and second optional bed field when
    present.  With ``--as-gtf`` the first optional field (the name) is
    used as ``gene_id`` and ``transcript_id``; intervals without a
    name receive a running number formatted with ``--id-format``.
    Without ``--as-gtf`` the name, if present, is written as the
    source, which otherwise is ``bed``.
    """

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-a",
                        "--as-gtf",
                        dest="as_gtf",
                        action="store_true",
                        help="output as gtf.")

    parser.add_argument(
        "-f",
        "--id-format",
        dest="id_format",
        type=str,
        help="format for numeric identifier if --as-gtf is set and "
        "no name in bed file .")

    parser.set_defaults(as_gtf=False, id_format="%08i", test=None)

    # argv must be handed on explicitly, otherwise a caller-supplied
    # argument list is silently ignored in favour of sys.argv
    args = E.start(parser, argv=argv, add_pipe_options=True)

    as_gtf = args.as_gtf
    id_format = args.id_format

    # nskipped is reported but no interval is ever skipped here
    ninput, noutput, nskipped = 0, 0, 0

    # running number for intervals without a name
    unnamed = 0
    for bed in Bed.iterator(args.stdin):

        ninput += 1

        # a fresh entry per interval, so that score/source of an
        # earlier record cannot leak into one with fewer columns
        gff = GTF.Entry()
        gff.source = "bed"
        gff.feature = "exon"

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end
        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                unnamed += 1
                gff.gene_id = id_format % unnamed
                gff.transcript_id = id_format % unnamed
        elif bed.fields:
            gff.source = bed.fields[0]

        args.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
Exemplo n.º 12
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Only ``--section=annotations`` produces output: bed intervals read
    from stdin are grouped by the ``name`` of their track, optionally
    merged (``--merge-overlapping``), filtered (contigs containing
    ``random``, segments longer than ``--max-length``) and written as
    numbered segment lines followed by one ``##Ann`` line per track
    listing its segment numbers.

    Raises
    ------
    ValueError
        If the section is not one of the known sections.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-f",
                        "--features",
                        dest="features",
                        type=str,
                        help="feature to collect.")

    parser.add_argument("-i",
                        "--files",
                        dest="files",
                        action="append",
                        help="use multiple annotations.")

    parser.add_argument(
        "-a",
        "--annotations",
        dest="annotations",
        type=str,
        help=
        "aggregate name for annotations if only single file is provided from STDIN."
    )

    parser.add_argument("--map-tsv-file",
                        dest="input_filename_map",
                        type=str,
                        help="filename with a map of gene_ids to categories.")

    # parsed as int: the value is compared against segment lengths
    parser.add_argument("-l",
                        "--max-length",
                        dest="max_length",
                        type=int,
                        help="maximum segment length.")

    parser.add_argument("-m",
                        "--merge-overlapping",
                        dest="merge",
                        action="store_true",
                        help="merge overlapping bed segments.")

    parser.add_argument("-s",
                        "--section",
                        dest="section",
                        type=str,
                        choices=("segments", "annotations", "workspace"),
                        help="annotator section.")

    parser.add_argument(
        "--subset",
        dest="subsets",
        type=str,
        action="append",
        help=
        "add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids."
    )

    parser.set_defaults(
        genome_file=None,
        feature=None,
        remove_random=True,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        merge=False,
    )

    # argv must be handed on explicitly, otherwise a caller-supplied
    # argument list is silently ignored in favour of sys.argv
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    args.files += unknown
    if len(args.files) == 0:
        args.files.append("-")
    args.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in args.files]))

    if args.subsets:
        subsets = collections.defaultdict(list)
        for s in args.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        args.subsets = subsets

    # the genome is opened (and thereby checked) but not used further
    # in this function
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.section == "segments":
        prefix = "##Segs"
    elif args.section == "annotations":
        prefix = "##Id"
    elif args.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % args.section)

    # 0 disables the length filter
    max_length = args.max_length or 0

    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    if args.section == "annotations":
        contigs = set()
        it = itertools.groupby(Bed.iterator(args.stdin),
                               key=lambda x: x.track["name"])

        for track, beds in it:
            ntracks += 1
            first_segment = nsegments

            beds = list(beds)
            ninput += len(beds)

            if args.merge:
                beds = Bed.merge(beds)

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if args.remove_random and "random" in contig:
                    continue

                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                args.stdout.write("%s\t%i\t%s\t(%i,%i)\n" %
                                  (prefix, nsegments, contig, start, end))
                nsegments += 1

            # the segments of a track are numbered consecutively
            args.stdout.write("##Ann\t%s\t%s\n" % (track, "\t".join(
                ["%i" % x for x in range(first_segment, nsegments)])))
            E.info("track %s: annotated with %i segments" %
                   (track, nsegments - first_segment))

        ncontigs = len(contigs)
        E.info(
            "ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
            (ninput, ntracks, ncontigs, nsegments, ndiscarded))

    E.stop()
Exemplo n.º 13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intersects the alignments/intervals of a bam or bed file with the
    intervals of a reference bed file using ``bedtools intersect`` and
    writes to stdout, per name in the reference bed file, the number
    of alignments (or intervals/interval categories) overlapping it,
    preceded by a ``total`` row.

    Raises
    ------
    ValueError
        If the two input files are not supplied, if the reference bed
        file is empty or has no name column, or if no bed interval can
        be read from a non-empty bed file given in place of the bam
        file.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-m",
                        "--min-overlap",
                        dest="min_overlap",
                        type=float,
                        help="minimum overlap ")

    parser.add_argument("-a",
                        "--bam-file",
                        dest="filename_bam",
                        metavar="bam",
                        type=str,
                        help="bam-file to use (required) ")

    parser.add_argument("-b",
                        "--bed-file",
                        dest="filename_bed",
                        metavar="bed",
                        type=str,
                        help="bed-file to use (required) ")

    parser.add_argument(
        "-s",
        "--sort-bed",
        dest="sort_bed",
        action="store_true",
        help="sort the bed file by chromosomal location before "
        "processing. ")

    parser.add_argument(
        "--assume-sorted",
        dest="sort_bed",
        action="store_false",
        help="assume that the bed-file is sorted by chromosomal location. ")

    parser.add_argument(
        "--split-intervals",
        dest="split_intervals",
        action="store_true",
        help="treat split BAM intervals, for example spliced intervals, "
        "as separate intervals. Note that a single alignment might be "
        "counted several times as a result. ")

    parser.set_defaults(
        min_overlap=0.5,
        filename_bam=None,
        filename_bed=None,
        sort_bed=True,
        split_intervals=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    filename_bam = args.filename_bam
    filename_bed = args.filename_bed

    if filename_bam is None and filename_bed is None:
        if len(unknown) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = unknown

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = args.min_overlap

    args.stdout.write("category\talignments\n")

    # get number of columns of reference bed file
    ncolumns_bed = None
    for bed in Bed.iterator(iotools.open_file(filename_bed)):
        ncolumns_bed = bed.columns
        break
    if ncolumns_bed is None:
        # without this check an empty file surfaces as a NameError
        raise ValueError("no intervals found in bed file %s" % filename_bed)
    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # key by which the bedtools output is grouped; stays None if the
    # format of the first file cannot be determined
    sort_key = None

    # get information about the file to count with
    if filename_bam.endswith(".bam"):
        input_option = "-abam"
        samfile = pysam.AlignmentFile(filename_bam, "rb")
        try:
            total = samfile.mapped
        finally:
            samfile.close()
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        input_option = "-a"
        total = iotools.get_num_lines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(iotools.open_file(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name", "score", "strand", "thickstart",
        "thickend", "rgb", "blockcount", "blockstarts", "blockends"
    ][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2", "score2", "strand2",
        "thickstart2", "thickend2", "rgb2", "blockcount2", "blockstarts2",
        "blockends2"
    ][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    args.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    if sort_key is None:
        # lines were counted but none of them parsed as a bed interval
        raise ValueError(
            "could not read any bed interval from %s" % filename_bam)

    # SNS: sorting is optional; it is on by default (see set_defaults)
    # and switched off with --assume-sorted.  The sort command reads
    # the bed file through gunzip.
    if args.sort_bed:
        bedcmd = "<( gunzip < %s | sort -k1,1 -k2,2n)" % filename_bed
    else:
        bedcmd = filename_bed

    if args.split_intervals:
        split = "-split"
    else:
        split = ""

    # IMS: newer versions of intersectBed have a very high memory
    #      requirement unless passed sorted bed files.
    statement = """bedtools intersect %(format)s %(filename_bam)s
    -b %(bedcmd)s
    %(split)s
    -sorted -bed -wo -f %(min_overlap)f""" % {
        "format": input_option,
        "filename_bam": filename_bam,
        "bedcmd": bedcmd,
        "split": split,
        "min_overlap": min_overlap,
    }

    E.info("starting counting process: %s" % statement)
    proc = E.run(statement, return_popen=True, stdout=subprocess.PIPE)

    E.info("counting")
    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate(infile):
        # turn each non-empty output line into a `data` tuple
        for line in infile:
            if not line.strip():
                continue
            yield data._make(line[:-1].split()[:take_columns])

    # consecutive output lines with the same key belong to one
    # alignment/interval; each reference name they hit is counted
    for read, overlaps in itertools.groupby(iterate(
            iotools.force_str(proc.stdout)),
                                            key=sort_key):
        annotations = [x.name2 for x in overlaps]
        for anno in annotations:
            counts_per_alignment[anno] += 1

    for key, counts in sorted(counts_per_alignment.items()):
        args.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 14
0
def loadIntervals(infile, outfile):
    '''load intervals from :term:`bed` formatted files into
    the database.

    If a :term:`bam` file is associated with a :term:`bed`
    file, re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned.

       nprobes: number of reads in interval
       peakcenter: position with maximum number of reads in interval
       avgval: average coverage within interval

    Without associated bam files, ``length`` is the summed length of
    the blocks of the bed entry, ``peakcenter`` is the position at half
    that length along the blocks, and ``npeaks``, ``avgval``,
    ``peakval`` and ``nprobes`` are all set to 1.

    Arguments
    ---------
    infile : string
        Filename of the bed file; the track name is derived by removing
        the ``.bed.gz`` suffix.
    outfile : string
        Passed on to ``P.load`` as the output of the database load.
    '''

    tmpfile = P.get_temp_file(".")

    headers = ("avgval", "disttostart",
               "genelist", "length",
               "peakcenter", "peakval",
               "position", "interval_id",
               "npeaks", "nprobes",
               "contig", "start", "end", "score", "strand")

    tmpfile.write("\t".join(headers) + "\n")

    # columns that are part of the table layout but never computed here
    disttostart, genelist, position = 0, "", 0
    # per-interval statistics; overwritten for every bed entry below
    avgval, length, peakcenter, peakval, npeaks, nprobes = 0, 0, 0, 0, 0, 0

    track = Sample(filename=P.snip(infile, ".bed.gz"))

    bamfiles, offsets = getAssociatedBAMFiles(track)

    if bamfiles:
        E.info("%s: associated bamfiles = %s" % (track, bamfiles))
    else:
        E.info("%s: no bamfiles associated" % (track))

    # open all bamfiles
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    c = E.Counter()

    try:
        # count tags
        for bed in Bed.iterator(IOTools.open_file(infile, "r")):

            c.input += 1

            # unnamed intervals are numbered by their position in the file
            if "name" not in bed:
                bed.name = c.input

            try:
                strand = bed["strand"]
            except IndexError:
                strand = "."

            # The fifth field of a bed file can be used to supply a
            # score. Our iterator returns the optional fields as a
            # "fields array". The first of these is the interval name,
            # and the second the score. The score may be more is better
            # or less is better. A missing or empty score defaults to 1.
            score = 1
            if len(bed.fields) > 1 and bed.fields[1] != "":
                score = bed.fields[1]

            if samfiles:
                npeaks, peakcenter, length, avgval, peakval, nprobes = \
                    PipelinePeakcalling.countPeaks(
                        bed.contig,
                        bed.start,
                        bed.end,
                        samfiles,
                        offsets)
                # intervals without reads are counted but still written
                if nprobes == 0:
                    c.skipped_reads += 1

            else:
                # deal with bed12: walk along the blocks until the
                # block containing the half-length position is found
                bed_intervals = bed.toIntervals()
                length = sum(e - s for s, e in bed_intervals)
                # integer division keeps the peak centre a whole
                # base position
                mid_point = length // 2
                for s, e in bed_intervals:
                    peakcenter = s + mid_point
                    if peakcenter >= e:
                        # carry the remaining offset into the next block
                        mid_point = peakcenter - e
                    else:
                        break

                npeaks, avgval, peakval, nprobes = 1, 1, 1, 1

            c.output += 1
            tmpfile.write("\t".join(map(
                str,
                (avgval, disttostart, genelist, length,
                 peakcenter, peakval, position, bed.name,
                 npeaks, nprobes,
                 bed.contig, bed.start, bed.end, score, strand))) + "\n")
    finally:
        # release the bam file handles even if counting fails
        for samfile in samfiles:
            samfile.close()

    if c.output == 0:
        E.warn("%s - no aggregate intervals" % track)

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           tablename=os.path.basename("%s_intervals" % track.asTable()),
           options="--allow-empty-file "
           "--add-index=interval_id")

    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
from cgatcore import experiment as E

# Command line interface: an indexed genome and an output file name;
# the bed intervals themselves are read from stdin.
parser = E.OptionParser(version="%prog version: $1.0$",
                        usage=globals()["__doc__"])
parser.add_option("-g", "--genome", dest="genome",
                  help="index fasta genome sequence")
parser.add_option("-O", "--per-utron-out", dest="outfile",
                  help="File name for output file that will contain one row "
                       "per entry in the input")
options, args = E.start(parser, sys.argv)

# parsed options are read as attributes, like ``options.stdin`` below
genome = IndexedFasta.IndexedFasta(options.genome)


bedfile = Bed.iterator(options.stdin)
splice_site_dict = dict()
outfile = iotools.open_file(options.outfile, "w")
# str.join takes a single iterable; terminate the header so the first
# data row starts on its own line
outfile.write("\t".join(("transcript_id",
                         "strand",
                         "ss5",
                         "ss3",
                         "contig",
                         "splice_site_start",
                         "splice_site_end",
                         "utron_size")) + "\n")
for utron in bedfile:
    
    ss5_sequence = genome.getSequence(utron.contig, "+", utron.start, utron.start+2)
    ss3_sequence = genome.getSequence(utron.contig, "+", utron.end-2, utron.end)
    if utron.strand == "+":
Exemplo n.º 16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Exactly one positional argument, the genome bam file, is required.
    The optional gene set, region, transcriptome and junction inputs
    are opened as requested and handed to ``bams2bam_filter`` together
    with the output files. The filtered alignments are written to
    ``-`` in bam format, or in sam format with ``--output-sam``.

    Raises
    ------
    ValueError
        If the number of positional arguments is not one.
    IOError
        If the file for mismapped reads exists and ``--force-output``
        was not given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = iotools.read_map(
            iotools.open_file(options.filename_map), has_header=True)
        # invert the map so that it can be looked up by the ids
        # found in the gene set
        id_map = {y: x for x, y in id_map.items()}
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(iotools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                # transcripts without an entry in the map are dropped
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(iotools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome,
                                                  "rb")
    else:
        transcripts_samfile = None

    # the output reuses the header of the genome bam file
    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = iotools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    # the junctions file is opened above and has to be released as well
    if junctions_samfile is not None:
        junctions_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 17
0
 def buildIndex(self, filename):
     """Return the index built by ``Bed.readAndIndex`` for *filename*."""
     # NOTE(review): assumes readAndIndex consumes the stream before
     # returning, so the file can be closed as soon as it is done.
     with iotools.open_file(filename, "r") as infile:
         return Bed.readAndIndex(infile)
Exemplo n.º 18
0
 def __init__(self, filename):
     """Index the bed file *filename* per track into ``self.mIndices``."""
     # NOTE(review): assumes readAndIndex consumes the stream before
     # returning, so the file can be closed as soon as it is done.
     with iotools.open_file(filename, "r") as infile:
         self.mIndices = Bed.readAndIndex(infile, per_track=True)
Exemplo n.º 19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Bed intervals are read from stdin. For every interval the selected
    counters are updated and one tab-separated row is written to
    stdout: the bed columns followed by the output of each counter.
    The header row is written when the first interval is seen.

    Raises
    ------
    ValueError
        If fewer than three bed headers are supplied, if the ``motif``
        counter is selected without ``--motif-sequence``, or if more
        headers are given than the bed entries have columns.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument(
        "-b",
        "--bam-file",
        dest="bam_files",
        type=str,
        help="filename with read mapping information. Multiple files can be "
        "submitted in a comma-separated list.")

    parser.add_argument(
        "--control-bam-file",
        dest="control_bam_files",
        type=str,
        help="filename with read mapping information for input/control. "
        "Multiple files can be submitted in a comma-separated list ")

    parser.add_argument("--filename-format",
                        dest="filename_format",
                        type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("-c",
                        "--counter",
                        dest="counters",
                        type=str,
                        action="append",
                        choices=("length", "overlap", "peaks",
                                 "composition-na", "composition-cpg",
                                 "classifier-chipseq", "motif"),
                        help="select counters to apply.")

    parser.add_argument("--motif-sequence",
                        dest="motif_sequence",
                        type=str,
                        help="specify a sequence to search for")

    parser.add_argument(
        "-o",
        "--offset",
        dest="offsets",
        type=int,
        action="append",
        help="tag offsets for tag counting - supply as many as there "
        "are bam-files")

    parser.add_argument(
        "--control-offset",
        dest="control_offsets",
        type=int,
        action="append",
        help="control tag offsets for tag counting - supply as many as "
        "there are bam-files.")

    parser.add_argument(
        "-a",
        "--output-all-fields",
        dest="all_fields",
        action="store_true",
        help="output all fields in original bed file, by default only "
        "the first 4 are output.")

    parser.add_argument(
        "--output-bed-headers",
        dest="bed_headers",
        type=str,
        help="supply ',' separated list of headers for bed component ")

    parser.add_argument(
        "-f",
        "--gff-file",
        dest="filename_gff",
        type=str,
        action="append",
        metavar='bed',
        help="filename with extra gff files. The order is important")

    parser.add_argument(
        "--has-header",
        dest="has_header",
        action="store_true",
        help="bed file with headers. Headers and first columns are "
        "preserved ")

    parser.set_defaults(genome_file=None,
                        counters=[],
                        bam_files=None,
                        offsets=[],
                        control_bam_files=None,
                        control_offsets=[],
                        all_fields=False,
                        filename_format=None,
                        bed_headers=None,
                        filename_gff=[],
                        has_header=False,
                        motif_sequence=None)

    # forward argv so that an explicitly supplied argument list is
    # parsed instead of being silently ignored
    args = E.start(parser, argv=argv)

    if args.bed_headers is not None:
        bed_headers = [x.strip() for x in args.bed_headers.split(",")]
        if len(bed_headers) < 3:
            raise ValueError("a bed file needs at least three columns")
    else:
        bed_headers = None

    if args.has_header:
        # skip comment lines; the first other line holds the headers
        while True:
            line = args.stdin.readline()
            if not line:
                E.warn("empty bed file with no header")
                E.stop()
                return
            if not line.startswith("#"):
                break
        bed_headers = line[:-1].split("\t")

    if "motif" in args.counters and not args.motif_sequence:
        raise ValueError("if using motif must specify a motif-sequence")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.bam_files:
        bam_files = [pysam.AlignmentFile(bamfile, "rb")
                     for bamfile in args.bam_files.split(",")]
    else:
        bam_files = None

    if args.control_bam_files:
        control_bam_files = [pysam.AlignmentFile(bamfile, "rb")
                             for bamfile in args.control_bam_files.split(",")]
    else:
        control_bam_files = None

    counters = []

    # counters are built in command line order; those using a gff file
    # remove the first entry from the list of gff files afterwards
    for c in args.counters:
        if c == "length":
            counters.append(CounterLength(fasta=fasta, options=args))

        elif c == "overlap":
            counters.append(
                CounterOverlap(filename=args.filename_gff[0],
                               fasta=fasta,
                               options=args))
            del args.filename_gff[0]
        elif c == "peaks":
            counters.append(
                CounterPeaks(bam_files,
                             args.offsets,
                             control_bam_files,
                             args.control_offsets,
                             options=args))
        elif c == "composition-na":
            counters.append(
                CounterCompositionNucleotides(fasta=fasta, options=args))
        elif c == "composition-cpg":
            counters.append(CounterCompositionCpG(fasta=fasta, options=args))
        elif c == "classifier-chipseq":
            counters.append(
                ClassifierChIPSeq(filename_gff=args.filename_gff,
                                  fasta=fasta,
                                  options=args,
                                  prefix=None))
            del args.filename_gff[0]

        elif c == "motif":
            counters.append(
                CounterMotif(fasta=fasta, motif=args.motif_sequence))

    # stays None until the header row has been written
    extra_fields = None

    for bed in Bed.iterator(args.stdin):

        if extra_fields is None:

            # output explicitly given headers
            if bed_headers:
                if len(bed_headers) > bed.columns:
                    raise ValueError(
                        "insufficient columns (%i, expected %i) in %s" %
                        (bed.columns, len(bed_headers), str(bed)))

            else:
                bed_headers = Bed.Headers[:bed.columns]

            args.stdout.write("\t".join(bed_headers))
            args.stdout.write("\t" +
                              "\t".join([x.getHeader()
                                         for x in counters]) + "\n")

            # indices into bed.fields of the columns after contig,
            # start and end
            extra_fields = list(range(len(bed_headers) - 3))

        for counter in counters:
            counter.update(bed)

        if args.all_fields:
            args.stdout.write(str(bed))
        else:
            args.stdout.write("\t".join(
                [bed.contig, str(bed.start),
                 str(bed.end)] + [bed.fields[x] for x in extra_fields]))
        for counter in counters:
            args.stdout.write("\t%s" % str(counter))

        args.stdout.write("\n")

    E.stop()
Exemplo n.º 20
0
def loadIntervalsFromBed(bedfile, track, outfile,
                         bamfiles, offsets):
    '''load intervals from :term:`bed` formatted files into database.

    Re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned. In particular, the meaning of the
    columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    Without bam files, ``PeakCenter`` is the midpoint of the interval,
    ``Length`` its size, and ``nPeaks``, ``AvgVal``, ``PeakVal`` and
    ``nProbes`` are all set to 1.

    Arguments
    ---------
    bedfile : string
        Filename of the bed file to read intervals from.
    track : object
        Track whose ``asTable()`` result names the database table.
    outfile : string
        Output filename the load statement redirects to.
    bamfiles : list
        Filenames of bam files to count reads in; may be empty.
    offsets : list
        Tag offsets passed on to ``PipelineChipseq.countPeaks``.
    '''

    tmpfile = P.getTempFile(".")

    headers = ("AvgVal", "DisttoStart", "GeneList", "Length", "PeakCenter", "PeakVal", "Position",
               "interval_id", "nCpGs", "nGenes", "nPeaks", "nProbes", "nPromoters", "contig", "start", "end")

    tmpfile.write("\t".join(headers) + "\n")

    # columns that are part of the table layout but never computed here
    disttostart, genelist, position = 0, "", 0
    ncpgs, ngenes, npromoters = 0, 0, 0
    # per-interval statistics; overwritten for every bed entry below
    avgval, length, peakcenter, peakval, npeaks, nprobes = 0, 0, 0, 0, 0, 0

    mlength = int(PARAMS["calling_merge_min_interval_length"])

    # open all bamfiles, in the same way as loadIntervals does
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    c = E.Counter()

    try:
        # count tags
        for bed in Bed.iterator(iotools.openFile(bedfile, "r")):

            c.input += 1

            # unnamed intervals are numbered by their position in the file
            if "name" not in bed:
                bed.name = c.input

            # remove very short intervals
            if bed.end - bed.start < mlength:
                c.skipped_length += 1
                continue

            if samfiles:
                npeaks, peakcenter, length, avgval, peakval, nprobes = \
                    PipelineChipseq.countPeaks(
                        bed.contig, bed.start, bed.end,
                        samfiles, offsets)

                # nreads can be 0 if the intervals overlap only slightly
                # and due to the binning, no reads are actually in the
                # overlap region.  However, most of these intervals
                # should be small and have already be deleted via the
                # merge_min_interval_length cutoff.
                # NOTE(review): such intervals are only counted here,
                # they are still written below.
                if nprobes == 0:
                    c.skipped_reads += 1

            else:
                npeaks, peakcenter, length, avgval, peakval, nprobes = (
                    1,
                    bed.start +
                    (bed.end - bed.start) // 2,
                    bed.end - bed.start,
                    1,
                    1,
                    1)

            c.output += 1
            tmpfile.write("\t".join(map(
                str,
                (avgval, disttostart, genelist, length,
                 peakcenter, peakval, position, bed.name,
                 ncpgs, ngenes, npeaks, nprobes, npromoters,
                 bed.contig, bed.start, bed.end))) + "\n")
    finally:
        # release the bam file handles even if counting fails
        for samfile in samfiles:
            samfile.close()

    if c.output == 0:
        E.warn("%s - no intervals" % track)

    tmpfile.close()

    tmpfilename = tmpfile.name
    tablename = "%s_intervals" % track.asTable()

    statement = '''
    cgat csv2db %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''

    # NOTE(review): called without arguments; presumably the statement
    # and its placeholders are picked up from the local variables above,
    # so their names must not change - confirm against the P module.
    P.run()
    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
Exemplo n.º 21
0
def main(argv=sys.argv):
    """script main.

    parses command line options in *argv*, which defaults to sys.argv.

    Bed intervals are read from stdin, passed through the methods
    selected with ``--method`` in the order given, and written to
    stdout one interval per line.

    Raises
    ------
    ValueError
        If a selected method lacks its required input (contig sizes,
        names file, mapping file), if the chromosome mapping table is
        empty or has rows that are not two columns wide, or if the
        number of bin edges does not match the number of bins.
    """

    parser = E.ArgumentParser(description=__doc__)

    # IMS: new method: extend intervals by set amount
    parser.add_argument("-m",
                        "--method",
                        dest="methods",
                        type=str,
                        action="append",
                        choices=("merge", "filter-genome", "bins", "block",
                                 "sanitize-genome", "shift", "extend",
                                 "filter-names", "rename-chr"),
                        help="method to apply")

    parser.add_argument("--num-bins",
                        dest="num_bins",
                        type=int,
                        help="number of bins into which to merge (used for "
                        "method `bins)")

    parser.add_argument("--bin-edges",
                        dest="bin_edges",
                        type=str,
                        help="bin_edges for binning method")

    parser.add_argument(
        "--binning-method",
        dest="binning_method",
        type=str,
        choices=("equal-bases", "equal-intervals", "equal-range"),
        help="method used for binning (used for method `bins` if no "
        "bin_edges is given)")

    parser.add_argument(
        "--merge-distance",
        dest="merge_distance",
        type=int,
        help="distance in bases over which to merge that are not "
        "directly adjacent")

    parser.add_argument(
        "--merge-min-intervals",
        dest="merge_min_intervals",
        type=int,
        help="only output merged intervals that are build from at least "
        "x intervals")

    parser.add_argument("--merge-by-name",
                        dest="merge_by_name",
                        action="store_true",
                        help="only merge intervals with the same name")

    parser.add_argument(
        "--merge-and-resolve-blocks",
        dest="resolve_blocks",
        action="store_true",
        help="When merging bed12 entrys, should blocks be resolved?")

    parser.add_argument("--merge-stranded",
                        dest="stranded",
                        action="store_true",
                        help="Only merge intervals on the same strand")

    parser.add_argument(
        "--remove-inconsistent-names",
        dest="remove_inconsistent_names",
        action="store_true",
        help="when merging, do not output intervals where the names of "
        "overlapping intervals do not match")

    parser.add_argument("--offset",
                        dest="offset",
                        type=int,
                        help="offset for shifting intervals")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-b",
                        "--bam-file",
                        dest="bam_file",
                        type=str,
                        help="bam-formatted filename with genome.")

    parser.add_argument("--filter-names-file",
                        dest="names",
                        type=str,
                        help="list of names to keep. One per line")

    parser.add_argument(
        "--rename-chr-file",
        dest="rename_chr_file",
        type=str,
        help="mapping table between old and new chromosome names."
        "TAB separated 2-column file.")

    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        merge_by_name=False,
                        genome_file=None,
                        rename_chr_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000,
                        remove_inconsistent_names=False,
                        resolve_blocks=False)

    # forward argv so that an explicitly supplied argument list is
    # parsed instead of being silently ignored
    args = E.start(parser, argv=argv, add_pipe_options=True)

    contigs = None
    chr_map = None

    # Why provide full indexed genome, when a tsv of contig sizes would do?
    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    # contig sizes from a bam file take precedence over the genome file
    if args.bam_file:
        with pysam.AlignmentFile(args.bam_file) as samfile:
            contigs = dict(zip(samfile.references, samfile.lengths))

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not chr_map:
            raise ValueError("Empty mapping dictionnary")

    processor = Bed.iterator(args.stdin)

    # each method wraps the previous iterator, building a processing chain
    for method in args.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)
        elif method == "merge":
            processor = merge(
                processor,
                args.merge_distance,
                by_name=args.merge_by_name,
                min_intervals=args.merge_min_intervals,
                remove_inconsistent=args.remove_inconsistent_names,
                resolve_blocks=args.resolve_blocks,
                stranded=args.stranded)
        elif method == "bins":
            if args.bin_edges:
                bin_edges = list(map(float, args.bin_edges.split(",")))
                # IMS: check bin edges are valid
                if len(bin_edges) != args.num_bins + 1:
                    raise ValueError(
                        "Number of bin edge must be one more than "
                        "number of bins")
            else:
                bin_edges = None
            processor, bin_edges = Bed.binIntervals(processor,
                                                    num_bins=args.num_bins,
                                                    method=args.binning_method,
                                                    bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))

        elif method == "block":
            processor = Bed.blocked_iterator(processor)
        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(processor, contigs, offset=args.offset)
        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            processor = extendInterval(processor, contigs, args.offset)
        elif method == "filter-names":
            if not args.names:
                raise ValueError("please supply list of names to filter")
            # read the names eagerly so the file is closed right away
            with open(args.names) as names_file:
                names = [name.strip() for name in names_file]
            processor = filterNames(processor, names)
        elif method == "rename-chr":
            if not chr_map:
                raise ValueError("please supply mapping file")
            processor = renameChromosomes(processor, chr_map)

    noutput = 0
    for bed in processor:
        args.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.stop()
Exemplo n.º 22
0
def aggregateWindowsTagCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    This method autodetects the number of columns in the :term:`infiles`
    and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    Consecutive rows with the same interval key are collapsed: only
    the first one is written.

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
    outfile : string
        Output filename in :term:`tsv` format.
    regex : string
        Regular expression used to extract the track name from the
        filename.  The default removes any suffix.

    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one process substitution per input file, each emitting
    # "contig:start-end<TAB>count"
    src = " ".join([
        """<( zcat %s |
              awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" %
        (x, column) for x in infiles
    ])
    tmpfile = P.get_temp_filename(".")
    # the placeholders refer to the local variables ``src`` and
    # ``tmpfile``; keep these names in sync with the statement
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run(statement)

    # build track names
    tracks = [
        re.search(regex, os.path.basename(x)).groups()[0] for x in infiles
    ]

    c = E.Counter()
    # filter for uniqueness - keys with the same value as the
    # previous line will be ignored.
    last_gene = None

    # context managers make sure both files are closed on errors as well
    with iotools.open_file(outfile, "w") as outf, \
            open(tmpfile, "r") as inf:
        outf.write("interval_id\t%s\n" % "\t".join(tracks))

        for line in inf:
            c.input += 1
            data = line[:-1].split("\t")
            # even columns hold the interval key of each input file,
            # odd columns the corresponding count
            genes = set(data[0::2])
            values = [int(x) for x in data[1::2]]

            assert len(genes) == 1, \
                "paste command failed, wrong number of genes per line: '%s'" % line
            gene = next(iter(genes))
            if gene == last_gene:
                c.duplicates += 1
                continue
            c.output += 1
            outf.write("%s\t%s\n" % (gene, "\t".join(map(str, values))))
            last_gene = gene

    os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)
Exemplo n.º 23
0
def main(argv=None):
    """Write the sequences of bed intervals read from stdin as fasta.

    Parses command line options in sys.argv, unless *argv* is given.

    Depending on ``--output-mode`` the following sequences are collected
    for each bed entry:

    * ``intervals``: one sequence spanning ``start`` to ``end``.
    * ``segments``: for 12-column entries, the concatenated sequence of
      the intervals returned by ``bed.toIntervals()``; other entries are
      treated like ``intervals``.
    * ``leftright``: two sequences, one for the interval-sized window to
      the left and one for the window to the right of the entry, clipped
      at the contig boundaries.

    All collected sequences are passed through ``Masker.maskSequences``
    and written to stdout as ``>id`` / sequence pairs.

    Raises
    ------
    ValueError
        If no genome was supplied with ``--genome-file``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    # NOTE(review): the length and extension options below are parsed but
    # never applied anywhere in this function.
    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    # hand the caller-supplied argv on; previously it was silently ignored
    (options, args) = E.start(parser, argv=argv)

    # Every code path below needs the genome; fail early with a clear
    # message instead of a NameError on the first bed entry.
    if not options.genome_file:
        raise ValueError("please supply a genome with --genome-file")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        strand = "+" if options.ignore_strand else bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif options.output_mode in ("intervals", "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            length = bed.end - bed.start

            # window of interval size directly left of the interval
            start, end = max(0, bed.start - length), bed.end - length
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            # window of interval size directly right of the interval
            start, end = bed.start + length, min(lcontig, bed.end + length)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.stop()
from cgat import Bed
from cgat import IndexedFasta
from cgatcore import iotools
from cgat import Genomics


# In[3]:


genome = IndexedFasta.IndexedFasta("/shared/sudlab1/General/mirror/genomes/plain/hg38.fasta")


# In[7]:


# For every interval in the bed file given as first argument, look up the
# first and last two bases of the interval on the forward strand and write
# one tab-separated row to the file given as second argument:
# id, strand, 5' dinucleotide, 3' dinucleotide, contig, start, end, length.
splice_site_dict = dict()
with iotools.open_file(sys.argv[1]) as infile:
    bedfile = Bed.iterator(infile)
    # the context managers make sure both files are closed (and the output
    # flushed) even if a lookup fails part-way through
    with iotools.open_file(sys.argv[2], "w") as outfile:
        for utron in bedfile:

            ss5_sequence = genome.getSequence(utron.contig, "+", utron.start, utron.start+2)
            ss3_sequence = genome.getSequence(utron.contig, "+", utron.end-2, utron.end)

            # NOTE(review): only plus-strand intervals are reported;
            # minus-strand intervals are skipped without any output --
            # confirm this is intended.
            if utron.strand != "+":
                continue

            splice_site_dict[utron.name] = (ss5_sequence, ss3_sequence)

            if ":" in utron.name:
                # names of the form "<id>:<matched id>": report the first part
                transcript_id = utron.name.split(":")[0]
                match_transcript_id = utron.name.split(":")[1]
                record_id = transcript_id
            else:
                record_id = utron.name

            outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (record_id, utron.strand, ss5_sequence, ss3_sequence, utron.contig, utron.start, utron.end, utron.end-utron.start))
Exemplo n.º 25
0
def main(argv=None):
    """Convert GFF/GTF records read from stdin into BED records on stdout.

    Parses command line options in sys.argv, unless *argv* is given.

    With ``--bed12-from-transcripts`` the input is grouped with
    ``GTF.transcript_iterator`` and each group is converted with
    ``transcript2bed12``; otherwise each record is converted with
    ``Bed.fromGTF`` using the field selected by ``--set-name`` as name.
    With ``--track`` all records are loaded into memory, sorted by the
    chosen attribute and written in groups, each preceded by a
    ``track name=...`` line.

    Parameters
    ----------
    argv : list, optional
        Command line to parse.  Defaults to ``sys.argv``, looked up at
        call time rather than at import time.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    # NOTE(review): --is-gtf is accepted but not consulted in this function.
    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="input file is in gtf format")

    parser.add_argument("--set-name",
                        dest="name",
                        type=str,
                        help="field from the GFF/GTF file to use as the "
                        "name field in the BED file ",
                        choices=("gene_id", "transcript_id", "class", "family",
                                 "feature", "source", "repName",
                                 "gene_biotype"))

    parser.add_argument("--track",
                        dest="track",
                        type=str,
                        choices=("feature", "source", None),
                        help="use feature/source field to define BED tracks ")

    parser.add_argument(
        "--bed12-from-transcripts",
        dest="bed12",
        action="store_true",
        default=False,
        help="Process GTF file into Bed12 entries, with blocks as exons "
        "and thick/thin as coding/non-coding")

    parser.set_defaults(track=None, name="gene_id", is_gtf=False)

    # pass argv on so that callers supplying their own command line are
    # honoured; previously it was ignored
    (args) = E.start(parser, argv=argv, add_pipe_options=True)

    ninput, noutput = 0, 0

    iterator = GTF.iterator(args.stdin)

    if args.bed12:
        iterator = GTF.transcript_iterator(iterator)

    if args.track:
        # args.track is restricted to "feature" or "source" by the parser,
        # so it can be used directly as the attribute to group on
        def grouper(entry):
            return getattr(entry, args.track)

        # groupby only merges adjacent items, hence the sort on the same key
        groups = itertools.groupby(sorted(iterator, key=grouper), grouper)
    else:
        # single anonymous group, streamed without loading into memory
        groups = [(None, iterator)]

    # one Bed object is re-filled for every plain record
    bed = Bed.Bed()
    for key, entries in groups:
        if args.track:
            args.stdout.write("track name=%s\n" % key)

        for gff in entries:
            ninput += 1

            if args.bed12:
                bed = transcript2bed12(gff)
            else:
                bed.fromGTF(gff, name=args.name)

            args.stdout.write(str(bed) + "\n")
            noutput += 1

    E.info("ninput=%i, noutput=%i" % (ninput, noutput))
    E.stop()
Exemplo n.º 26
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects two positional arguments: a bam or bigwig file (selected by
    ``options.format``) and a bed file.  Density matrices are built for
    the bed intervals with ``buildDensityMatrices``, a feature table is
    written to stdout and, after optional "sum" normalization, the
    matrices are written with ``outputMatrices``.

    Raises
    ------
    ValueError
        If the number of positional arguments is not two or the input
        format is not supported.
    """

    parser = buildOptionParser(argv)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError(
            "please specify one bam- or wig-file and one bed file")

    if options.control_files:
        E.info("using control files: %s" % ",".join(options.control_files))

    infile, bedfile = args
    control_files = []

    if options.format == "bigwig":
        fg_file = pyBigWig.open(infile)
        for control_file in options.control_files:
            control_files.append(pyBigWig.open(control_file))
        counter = bam2peakshape.CounterBigwig(
            smooth_method=options.smooth_method)

    elif options.format == "bam":
        fg_file = pysam.AlignmentFile(infile, "rb")
        for control_file in options.control_files:
            control_files.append(pysam.AlignmentFile(control_file, "rb"))
        counter = bam2peakshape.CounterBam(shift=options.shift,
                                           smooth_method=options.smooth_method)

    else:
        # without this, fg_file/counter would be unbound further down
        raise ValueError("unsupported input format: %s" % options.format)

    features_per_interval, bins = buildDensityMatrices(
        Bed.iterator(iotools.open_file(bedfile)),
        fg_file,
        control_files,
        counter,
        window_size=options.window_size,
        bin_size=options.bin_size,
        strand_specific=options.strand_specific,
        centring_method=options.centring_method,
        use_interval=options.use_interval,
        random_shift=options.random_shift,
        smooth_method=options.smooth_method,
        report_step=options.report_step)

    if len(features_per_interval) == 0:
        E.warn("no data - no output")
        E.stop()
        return

    outputFeatureTable(options.stdout, features_per_interval, bins)

    # apply normalization
    # Note: does not normalize control?
    # Needs reworking, currently it does not normalize across
    # all samples nor does the work "sum" reflect the per million
    # normalization.
    if options.normalization == "sum":
        E.info("starting sum normalization")
        # get total counts across all intervals
        norm = 0.0
        for foreground, bed, controls, shifted in features_per_interval:
            norm += sum(foreground.counts)
        # per million
        norm /= float(1000000)
        E.info("sum/million normalization with %f" % norm)

        def _scaled(entry):
            # The builtin float replaces the numpy.float alias, which was
            # deprecated in NumPy 1.20 and removed in 1.24.
            return entry._replace(
                counts=numpy.array(entry.counts, dtype=float) / norm)

        # normalise foreground, controls and (if present) shifted counts
        new_data = []
        for foreground, bed, controls, shifted in features_per_interval:
            foreground = _scaled(foreground)
            new_controls = [_scaled(control) for control in controls]
            if shifted:
                shifted = _scaled(shifted)
            new_data.append(
                IntervalData._make((foreground, bed, new_controls, shifted)))
        features_per_interval = new_data
    else:
        E.info("no normalization performed")

    # center bins
    # NOTE(review): this offsets by a full bin_size rather than half a
    # bin -- confirm against the bin layout from buildDensityMatrices.
    out_bins = bins[:-1] + options.bin_size

    # build tracks
    def _toTrack(filename):
        return os.path.splitext(os.path.basename(filename))[0]

    outputMatrices(features_per_interval,
                   out_bins,
                   foreground_track=_toTrack(infile),
                   control_tracks=[_toTrack(x) for x in options.control_files],
                   shifted=options.random_shift,
                   sort_orders=options.sort_orders)

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 27
0
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    For every junction in the bed file on stdin, fetches the reads from
    the BAM file given as ``args[0]`` that overlap the junction start and
    classifies each uniquely mapped read (``NH`` tag <= 1) as exactly one
    of:

    * retained: an aligned block of the read spans the junction start,
    * spliced: two consecutive blocks end/start within 3 bases of the
      junction start/end,
    * incompatible: neither of the above.

    One tab-separated line is written per junction with at least one
    fetched read: contig, start, end, name, retained, spliced,
    incompatible, total fetched reads and
    ``retained / (spliced + retained)`` (``NA`` if that sum is zero).
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    bam = pysam.AlignmentFile(args[0])

    for junction in Bed.iterator(options.stdin):

        reads = list(bam.fetch(junction.contig,
                               junction.start - 1,
                               junction.start + 1))
        total = len(reads)

        if total == 0:
            continue

        retained_reads = 0
        spliced_reads = 0
        incompatible = 0

        for read in reads:

            # skip multi-mapping reads
            if read.get_tag("NH") > 1:
                continue

            # unspliced read running across the junction start
            if 'N' not in read.cigarstring and \
               read.reference_start < junction.start and \
               read.reference_end > junction.start:
                retained_reads += 1
                continue

            segments = read.get_blocks()

            found = False
            # inspect each block together with the block following it
            for block, next_block in zip(segments, segments[1:]):

                if block[0] < junction.start and \
                   block[1] > junction.start:
                    found = True
                    retained_reads += 1
                elif abs(block[1] - junction.start) < 3 and \
                        abs(next_block[0] - junction.end) < 3:
                    found = True
                    spliced_reads += 1

            # Reads classified above must not be counted a second time.
            # Previously the else-branch below also fired for them, so
            # every retained/spliced read was also counted as incompatible.
            if found:
                continue

            # the last block is not covered by the pairwise loop above
            if segments[-1][0] < junction.start and \
               segments[-1][1] > junction.end:
                retained_reads += 1
            else:
                incompatible += 1

        if spliced_reads + retained_reads > 0:
            psi = retained_reads/float(spliced_reads + retained_reads)
        else:
            psi = "NA"

        options.stdout.write("\t".join(map(str, [junction.contig,
                                                 junction.start,
                                                 junction.end,
                                                 junction.name,
                                                 retained_reads,
                                                 spliced_reads,
                                                 incompatible,
                                                 total,
                                                 psi])) +
                             "\n")

    bam.close()

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 28
0
def annotateCpGIslands(infiles, outfile):
    '''annotate transcript by absence/presence of CpG islands

    For every entry in the TSS bed file a search window is built around
    the TSS using ``PARAMS["cpg_search_upstream"]`` and
    ``PARAMS["cpg_search_downstream"]`` (swapped for entries not on the
    ``+`` strand).  Every interval of the CpG bed file found in that
    window is written as one row with its genomic coordinates and its
    coordinates relative to the TSS.  Counts are additionally written
    to ``outfile + ".summary"``.

    Arguments
    ---------
    infiles : tuple
        ``(cpgfile, tssfile)``, both :term:`bed` formatted.
    outfile : string
        Output filename in :term:`tsv` format.
    '''
    cpgfile, tssfile = infiles

    # the index is built eagerly, so the file can be closed right away
    with iotools.openFile(cpgfile) as inf:
        cpg = Bed.readAndIndex(inf)

    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()

    # context managers close both files even if a record fails to parse
    with iotools.openFile(outfile, "w") as outf:
        outf.write(
            "transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

        with iotools.openFile(tssfile) as tss_inf:
            for tss in Bed.iterator(tss_inf):
                c.tss_total += 1

                if tss.strand == "+":
                    start = tss.start - extension_upstream
                    end = tss.start + extension_downstream
                else:
                    start = tss.end - extension_downstream
                    end = tss.end + extension_upstream

                try:
                    matches = list(cpg[tss.contig].find(start, end))
                except KeyError:
                    # no CpG island at all on this contig
                    matches = []

                if not matches:
                    c.promotor_without_matches += 1
                    continue

                c.promotor_output += 1
                for genome_start, genome_end, _ in matches:
                    c.matches_total += 1

                    length = genome_end - genome_start

                    # get relative location of match
                    if tss.strand == "+":
                        relative_start = genome_start - tss.start
                    else:
                        relative_start = tss.end - genome_end

                    relative_end = relative_start + length

                    outf.write("\t".join(
                        map(str, (tss.name, tss.strand,
                                  genome_start, genome_end,
                                  relative_start, relative_end))) + "\n")
                    c.matches_output += 1

    with iotools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
0
def _bed_from_intervals(intervals, nfields, contig, name, strand,
                        thick_start=None):
    """Return a Bed entry covering *intervals* with *nfields* extra fields.

    The fields are initialised to ``'.'`` before the coordinates, contig,
    name and strand are set; ``thickStart`` is only set if *thick_start*
    is given.
    """
    bed = Bed.Bed()
    bed.fields = ['.'] * nfields
    bed.fromIntervals(intervals)
    bed.contig = contig
    bed["name"] = name
    bed["strand"] = strand
    if thick_start is not None:
        bed["thickStart"] = thick_start
    return bed


def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    Reads assembled transcripts (GTF) from stdin, looks up the gene each
    one matches in the class file and compares it against every
    reference transcript of that gene whose start codon lies in one of
    its exons.  If the introns inside the reference CDS are identical
    and no intron of the assembled transcript overlaps the CDS
    boundaries, the introns of the assembled transcript located in the
    3' UTR of the reference are collected and written as bed entries:

    * ``--outfile``: one entry per transcript pair with all 3' UTR introns,
    * ``--indivfile``: one entry per 3' UTR intron,
    * ``--partfile`` / ``--indivpartfile``: introns absent from the
      reference transcript, provided the reference has no 3' UTR intron
      missing from the assembled transcript,
    * ``--novel-file``: introns sharing neither start nor end with any
      reference intron of the gene.

    ``--novel-transcript`` and ``--target-transcript`` switch on debug
    messages for one assembled/reference transcript pair.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $1.0$",
                            usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--reffile",
                      dest="reffile",
                      type="string",
                      help="Supply reference gtf file name")

    parser.add_option("-d",
                      "--class-file",
                      dest="classfile",
                      type="string",
                      help="Supply database name")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="Supply output bed file name")

    parser.add_option("-u",
                      "--indivfile",
                      dest="indivfile",
                      type="string",
                      help="Supply output bed file name for individual utrons")

    parser.add_option("-p",
                      "--partfile",
                      dest="partfile",
                      type="string",
                      help="Supply output bed file name for partnered utrons")
    parser.add_option(
        "-q",
        "--indivpartfile",
        dest="indivpartfile",
        type="string",
        help="Supply output bed file name for individual partnered utrons")
    parser.add_option("-n",
                      "--novel-file",
                      dest="novelfile",
                      type="string",
                      help="Supply output bed file name for novel introns")
    parser.add_option(
        "--novel-transcript",
        dest="novel_id",
        type="string",
        help="DEBUG: Output info for this transcript from the STDIN")
    parser.add_option(
        "--target-transcript",
        dest="target_id",
        type="string",
        help="DEBUG: Output info for this transcript from ref-file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    outlines = []
    individuals = []
    partnered = []
    individualpartnered = []
    novel = []

    db = pandas.read_csv(options.classfile, sep="\t")

    # This keeps just one entry per-transcript - why?
    #db = db.groupby("transcript_id").first()
    db = db.set_index("transcript_id")
    enshashtable = getGeneTable(options.reffile)

    def _in_exon(position, exons):
        return any(e[0] <= position <= e[1] for e in exons)

    for novel_transcript in GTF.transcript_iterator(GTF.iterator(
            options.stdin)):

        # Why do it on a gene by gene basis rather than transcript by transcript basis?
        transcript_id = novel_transcript[0].transcript_id
        novel_contig = novel_transcript[0].contig
        novel_strand = novel_transcript[0].strand

        output_novel = transcript_id == options.novel_id

        try:
            geneid = db.loc[transcript_id].match_gene_id
        except KeyError:
            if output_novel:
                E.debug("Transcript %s not in class table" % transcript_id)
            continue

        if pandas.isnull(geneid):
            if output_novel:
                E.debug("Transcript %s matches no gene in class table" %
                        transcript_id)
            continue

        ens_gene = enshashtable[geneid]

        all_ref_introns = set()
        novel_transcript_exons = GTF.asRanges(novel_transcript, "exon")
        novel_transcript_introns = GTF.toIntronIntervals(novel_transcript)
        for ref_transcript in ens_gene["models"].values():
            all_ref_introns.update(GTF.toIntronIntervals(ref_transcript))

        # Start/end coordinates of all reference introns of the gene.
        # They only depend on the gene, so build them once per assembled
        # transcript, as sets for constant-time lookup.
        ens_starts = {intron[0] for intron in all_ref_introns}
        ens_ends = {intron[1] for intron in all_ref_introns}

        #Identify comparison set
        # check if this ever gets the wrong start_codon.
        filtered_starts = [
            s for s in ens_gene["start_codons"]
            if _in_exon(s, novel_transcript_exons)
        ]

        if len(filtered_starts) == 0:
            if output_novel:
                E.debug("No starts found for %s" % transcript_id)
            continue

        selected_models = list()
        for startc in filtered_starts:
            selected_models.extend(ens_gene["start_codons"][startc])

        if output_novel:
            E.debug("Transcripts with compatible starts are %s" %
                    selected_models)

        for ref_transcript_id in selected_models:

            output_ref = (output_novel and
                          ref_transcript_id == options.target_id)

            second = ens_gene["models"][ref_transcript_id]
            ens_CDS = GTF.asRanges(second, "CDS")

            # ensure only protein-coding transcripts
            if len(ens_CDS) == 0:
                if output_ref:
                    # the transcript id was previously missing from the
                    # message, which logged a literal "%s"
                    E.debug("%s is not coding" % ref_transcript_id)
                continue

            ens_exons = GTF.asRanges(second, "exon")
            cds_start, cds_end = ens_CDS[0][0], ens_CDS[-1][1]

            first_introns = set(novel_transcript_introns)
            second_introns = set(GTF.toIntronIntervals(second))

            # introns lying strictly inside the reference CDS
            first_CDSintrons = set([
                intron for intron in first_introns
                if (intron[0] > cds_start and intron[1] < cds_end)
            ])
            second_CDSintrons = set([
                intron for intron in second_introns
                if (intron[0] > cds_start and intron[1] < cds_end)
            ])

            # match CDS intron chain
            if first_CDSintrons != second_CDSintrons:
                if output_ref:
                    E.debug("CDS chains do not match. Chains are:")
                    output = "\n".join(
                        map(str, zip(sorted(first_CDSintrons),
                                     sorted(second_CDSintrons))))
                    E.debug(output)
                continue

            firstUTRintrons = first_introns - first_CDSintrons

            if len(firstUTRintrons) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            secondUTRintrons = second_introns - second_CDSintrons

            # ensure pruned transcript doesn't have introns overlapping
            # start or stop codons in ensembl transcript
            overlaps_cds_boundary = any(
                (intron[0] < cds_end and intron[1] > cds_end) or
                (intron[0] < cds_start and intron[1] > cds_start)
                for intron in first_introns)
            if overlaps_cds_boundary:
                if output_ref:
                    E.debug("Start or stop in intron")
                continue

            if second[0].strand == "+":
                ens_stop = cds_end
                UTR3introns = [
                    intron for intron in firstUTRintrons
                    if intron[0] >= cds_end
                    and intron[1] < ens_exons[-1][1]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons
                    if intron[0] >= cds_end
                    and intron[1] < ens_exons[-1][1]
                ]
            else:
                ens_stop = cds_start
                UTR3introns = [
                    intron for intron in firstUTRintrons if
                    intron[1] <= cds_start and intron[0] > ens_exons[0][0]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons if
                    intron[1] <= cds_start and intron[0] > ens_exons[0][0]
                ]

            if len(UTR3introns) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            pair_name = transcript_id + ":" + second[0].transcript_id

            # get output for each transcript
            outlines.append(_bed_from_intervals(
                UTR3introns, 9, novel_contig, transcript_id, novel_strand))

            # get output for each intron
            for item in UTR3introns:
                individuals.append(_bed_from_intervals(
                    [item], 4, novel_contig, transcript_id, novel_strand,
                    thick_start=ens_stop))

            UTR3introns = set(UTR3introns)
            secondUTR3introns = set(secondUTR3introns)
            extraUTR3introns = list(UTR3introns - secondUTR3introns)
            missing_in_novel = secondUTR3introns - UTR3introns

            if output_ref and len(missing_in_novel) > 0:
                E.debug("Following introns in UTR of %s but not %s" %
                        (options.target_id, options.novel_id))
                # report exactly the set tested above (previously a
                # different set, based on all reference UTR introns,
                # was printed)
                E.debug(missing_in_novel)

            # get only introns that are not in matched transcript
            if len(extraUTR3introns) != 0 and len(missing_in_novel) == 0:
                partnered.append(_bed_from_intervals(
                    extraUTR3introns, 9, novel_contig, pair_name,
                    novel_strand))

                for item in extraUTR3introns:
                    individualpartnered.append(_bed_from_intervals(
                        [item], 4, novel_contig, pair_name, novel_strand,
                        thick_start=ens_stop))

            # introns sharing neither boundary with any reference intron
            novelEvents = [
                i for i in UTR3introns
                if i[0] not in ens_starts and i[1] not in ens_ends
            ]

            for item in novelEvents:
                novel.append(_bed_from_intervals(
                    [item], 4, novel_contig, pair_name, novel_strand,
                    thick_start=ens_stop))

    with IOTools.open_file(options.outfile, "w") as outf:
        for line in outlines:
            outf.write(str(line) + "\n")

    # the remaining outputs are optional
    optional_outputs = (
        (options.indivfile, individuals),
        (options.partfile, partnered),
        (options.indivpartfile, individualpartnered),
        (options.novelfile, novel),
    )
    for filename, entries in optional_outputs:
        if filename is not None:
            with IOTools.open_file(filename, "w") as outf:
                for line in entries:
                    outf.write(str(line) + "\n")

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 30
0
def main(argv=None):
    """Summarise the bed intervals read from stdin in a table on stdout.

    Parses command line options in sys.argv, unless *argv* is given.

    Intervals are added to one ``Counter`` per aggregation key (track,
    name, contig, or a single ``all`` bucket) and to a grand total.  One
    row is written per key, followed by a ``total`` row unless no
    aggregation was requested.  With ``--add-percent`` each counter's
    size is set from the genome file before it is written.

    Raises
    ------
    ValueError
        If ``--add-percent`` is given without ``--genome-file``.
    NotImplementedError
        If ``--add-percent`` is given without ``--aggregate-by=contig``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "-g", "--genome-file", dest="genome_file", type=str,
        help="filename with genome.")

    parser.add_argument(
        "-a", "--aggregate-by", dest="aggregate", type=str,
        choices=("name", "contig", "track", "none"),
        help="aggregate counts by feature.")

    parser.add_argument(
        "-p", "--add-percent", dest="add_percent", action="store_true",
        help="add percentages.")

    parser.set_defaults(
        genome_file=None,
        aggregate="none",
        add_percent=False,
    )

    (args) = E.start(parser, argv)

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        if args.add_percent:
            raise ValueError("--add-percent option requires --genome-file")
        fasta = None

    if args.add_percent and not args.aggregate == "contig":
        raise NotImplementedError(
            "--add-percent option requires --aggregate=contig")

    counts = collections.defaultdict(Counter)
    total = Counter()
    output_totals = True

    if args.aggregate == "track":
        keyf = lambda x: x.track
    elif args.aggregate == "name":
        keyf = lambda x: x.name
    elif args.aggregate == "contig":
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"
        output_totals = False

    for bed in Bed.iterator(args.stdin):
        counts[keyf(bed)].add(bed)
        total.add(bed)

    outf = args.stdout

    key = "track"
    if args.add_percent:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers_percent)))
    else:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    total_bases = 0
    for key, count in sorted(counts.items()):
        if args.add_percent:
            # keys are contig names here (enforced above)
            contig_length = fasta.getLength(key)
            total_bases += contig_length
            count.setSize(contig_length)

        outf.write("%s\t%s\n" % (key, str(count)))

    if output_totals:
        if args.add_percent:
            # The size belongs on the grand total. Previously it was set
            # on the last per-key counter, so the total row was written
            # without its size, and empty input raised a NameError.
            total.setSize(total_bases)
        outf.write("%s\t%s\n" % ("total", str(total)))
    E.stop()