Exemplo n.º 1
0
def readAndIndex(iterator, with_value=True):
    '''read from gtf stream and index.

    If *with_value* is True, the gtf entry itself is stored as the
    value of each interval; otherwise only the interval coordinates
    are kept.

    returns an :class:`IndexedGenome.IndexedGenome`
    '''
    if with_value:
        index = IndexedGenome.IndexedGenome()
        store = lambda entry: index.add(
            entry.contig, entry.start, entry.end, entry)
    else:
        index = IndexedGenome.Simple()
        store = lambda entry: index.add(
            entry.contig, entry.start, entry.end)

    for entry in iterator:
        store(entry)

    return index
Exemplo n.º 2
0
    def __call__(self, track, slice = None ):
        """Compute reproducibility ROC data for each field.

        For every field in ``self.mFields``, intervals are collected
        from all replicates of *track*, indexed, and scored against the
        merged set of intervals.  An interval is "reproducible" if it
        is found in every replicate.

        :return: ordered dict mapping field -> {FPR, TPR-like series}.
        """

        result = odict()

        merged = None

        for field in self.mFields:
            # collect (contig, start, end, value) rows per replicate
            data = []
            for replicate in EXPERIMENTS.getTracks( track ):
                statement = "SELECT contig, start, end,%(field)s FROM %(replicate)s_intervals" % locals()
                data.append( self.get( statement) )

            # build one interval index per replicate
            idx = []
            for x in range(len(data)):
                i = IndexedGenome.IndexedGenome()
                for contig, start, end, peakval in data[x]:
                    i.add( contig, start, end, peakval )
                idx.append( i )

            def _iter( all ):
                '''yield merged (contig, start, end) intervals from a
                sorted list of (contig, start, end, value) tuples.'''
                all.sort()
                last_contig, first_start, last_end, _ = all[0]
                for contig, start, end, value in all[1:]:
                    if contig != last_contig or last_end < start:
                        yield (last_contig, first_start, last_end)
                        last_contig, first_start, last_end = contig, start, end
                    else:
                        last_end = max(last_end, end )
                yield (last_contig, first_start, last_end)

            # the merged interval set is field-independent; compute once
            if merged is None:
                all = list( itertools.chain( *data ) )
                merged = list( _iter(all) )

            roc_data = []
            for contig, start, end in merged:
                intervals = []
                for i in idx:
                    try:
                        intervals.append( list(i.get( contig, start, end )) )
                    except KeyError:
                        # contig absent from this replicate's index
                        continue

                if len(intervals) == 0:
                    continue

                # reproducible if every replicate has a hit
                is_repro = len( [ x for x in intervals if x != [] ] ) == len(data)
                value = max( [ x[2] for x in itertools.chain( *intervals )] )

                # fpr, tpr
                roc_data.append( (value, is_repro) )

            roc_data.sort()
            roc_data.reverse()

            # BUG FIX: zip() returns an iterator on Python 3 and cannot
            # be indexed; materialize it before subscripting.
            roc = list(zip(*Stats.computeROC( roc_data )))
            result[field] = odict( (("FPR", roc[0]), (field,roc[1])) )

        return result
Exemplo n.º 3
0
    def __init__(self, filename_junctions, *args, **kwargs):
        """Read splice junctions from *filename_junctions* and index them.

        Each junction contributes two indexed windows of width
        ``self.mSize``: one at the 5' end and one at the 3' end of the
        intron.  Coordinates on the negative strand are converted to
        positive-strand coordinates before indexing.  The resulting
        index is stored in ``self.mJunctions``.
        """
        BaseAnnotator.__init__(self, *args, **kwargs)

        junctions = IndexedGenome.IndexedGenome()

        infile = open(filename_junctions, "r")
        njunctions = 0

        for line in infile:
            # skip comment lines and a possible header row
            if line.startswith("#"): continue
            data = line[:-1].split("\t")
            if data[0] == "contig": continue
            # end, start are the positions of the last base of the codon
            # 5' of the intron and first base of codon 3' of the intron.
            contig, strand, end, start, frame, gene_id, transcript_id = data
            start, end, frame = int(start), int(end), int(frame)
            # convert to intron coordinates
            intron_start, intron_end = end + 1, start
            # convert to positive strand coordinates
            if strand == "-":
                lcontig = self.mFasta.getLength(contig)
                intron_start, intron_end = lcontig - intron_end, lcontig - intron_start
            # index a window of mSize at either end of the intron; both
            # windows carry the same junction description tuple.
            junctions.add(
                contig, intron_start, intron_start + self.mSize,
                (strand, intron_start, intron_end, gene_id, transcript_id))
            junctions.add(
                contig, intron_end - self.mSize, intron_end,
                (strand, intron_start, intron_end, gene_id, transcript_id))
            njunctions += 1
        infile.close()

        self.mJunctions = junctions

        E.info("read and indexed %i junctions for %i contigs" %
               (njunctions, len(junctions)))
Exemplo n.º 4
0
def readIntervals(infile, options):
    '''read intervals from *infile* and index them.

    In ``gtf`` format, each transcript is indexed as an alignment built
    from its exons; in ``gff`` format, plain intervals are indexed.

    :param infile: open input stream.
    :param options: needs ``format`` and ``report_step`` attributes.
    :return: an IndexedGenome (gtf) or IndexedGenome.Simple (gff).
    :raises ValueError: if ``options.format`` is not gtf or gff.
    '''

    ninput = 0
    t = time.time()

    if options.format == "gtf":

        index = IndexedGenome.IndexedGenome()

        for gffs in GTF.transcript_iterator(GTF.iterator(infile)):

            ali = alignlib_lite.py_makeAlignmentBlocks()
            for gff in gffs:
                if gff.feature != "exon":
                    continue
                ali.addDiagonal(gff.start, gff.end, 0)

            # BUG FIX: IndexedGenome.add takes (contig, start, end, value);
            # the contig argument was missing in the original call.
            index.add(gffs[0].contig,
                      min([x.start for x in gffs]),
                      max([x.end for x in gffs]),
                      ali)
            ninput += 1

            if ninput % options.report_step == 0:
                E.info(
                    "reading intervals - progress: ninput=%i, time=%i, avg=%f"
                    % (ninput,
                       time.time() - t, float(time.time() - t) / ninput))

    elif options.format == "gff":

        index = IndexedGenome.Simple()

        for g in GTF.iterator(infile):

            index.add(g.contig, g.start, g.end)
            ninput += 1

            if ninput % options.report_step == 0:
                E.info(
                    "reading intervals - progress: ninput=%i, time=%i, avg=%f"
                    % (ninput, time.time() - t,
                       float(time.time() - t) / ninput))

    else:
        # guard: without this, `index` would be unbound below
        raise ValueError("unknown format %s" % options.format)

    E.info("read intervals: %i contigs, %i intervals" % (len(index), ninput))
    return index
Exemplo n.º 5
0
    def __init__(self, filename_exons, *args, **kwargs):
        """Index exon intervals from the gtf file *filename_exons*.

        The index is stored in ``self.mExons``; each interval carries
        the full gtf entry as its value.
        """
        BaseAnnotator.__init__(self, *args, **kwargs)

        exons = IndexedGenome.IndexedGenome()
        nexons = 0
        # BUG FIX: use a context manager - the original opened the file
        # inline and never closed the handle.
        with open(filename_exons, "r") as inf:
            for g in GTF.iterator(inf):
                exons.add(g.contig, g.start, g.end, g)
                nexons += 1

        self.mExons = exons

        E.info("indexed %i exons on %i contigs" % (nexons, len(exons)))
Exemplo n.º 6
0
def buildQuicksectMask(bed_file):
    '''build and return a Quicksect object for the regions in *bed_file*.

    *bed_file* is a bed-formatted file listing the regions to mask.
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    for interval in Bed.iterator(IOTools.openFile(bed_file)):
        # extend the region by one base on either side so the mask
        # is accurate at its boundaries
        mask.add(interval.contig,
                 interval.start - 1,
                 interval.end + 1,
                 1)
        n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return mask
Exemplo n.º 7
0
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets
    '''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks = []
    idx = []
    for infile in infiles:
        track = P.snip(infile, ".bed.gz")
        tablename = "%s_intervals" % P.tablequote(track)
        cursor = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals()
        cursor.execute(statement)
        # index the field value by interval for this track
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cursor:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)

    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    # for each reference interval, report the maximum value per track
    for bed in Bed.iterator(infile=IOTools.openFile(reference, "r")):

        row = []
        for ix in idx:
            try:
                overlaps = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                # contig not present in this track's index
                row.append("")
                continue

            if overlaps:
                row.append(str(max([x[2] for x in overlaps])))
            else:
                row.append("")

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()
Exemplo n.º 8
0
def annotateWindows(contig, windows, gff_data, fasta, options):
    """annotate windows.

    For each (start, end) window on *contig*, collect overlapping
    intervals from *gff_data*, optionally transform them and compute a
    summary score with the configured decorator.  One tab-separated
    line per window is written to ``options.stdout``.

    :param contig: contig the windows are on.
    :param windows: iterable of (start, end) tuples.
    :param gff_data: gff/gtf entries used for annotation.
    :param fasta: indexed genome (used by interval decorators).
    :param options: command line options (transform, decorator, ...).
    """

    index = IndexedGenome.IndexedGenome()
    for g in gff_data:
        index.add(g.contig, g.start, g.end, g)

    is_gtf = options.is_gtf

    # select how overlapping intervals are transformed per window
    if options.transform == "none":
        def transform(start, end, intervals):
            return [(x[0], x[1]) for x in intervals]
    elif options.transform == "overlap":
        transform = transform_overlap
    elif options.transform == "complement":
        transform = transform_complement
    elif options.transform == "third_codon":
        transform = transform_third_codon
    else:
        raise ValueError("unknown transform %s" % options.transform)

    # select the scoring decorator; score-based decorators operate on
    # the raw values rather than on intervals.
    work_on_intervals = True
    if options.decorator == "counts":
        decorator = decorator_counts
    elif options.decorator == "mean-length":
        decorator = decorator_mean_length
    elif options.decorator == "median-length":
        decorator = decorator_median_length
    elif options.decorator == "percent-coverage":
        decorator = decorator_percent_coverage
    elif options.decorator == "gc":
        decorator = decorator_percent_gc
    elif options.decorator == "median-score":
        decorator = decorator_median_score
        work_on_intervals = False
    elif options.decorator == "mean-score":
        decorator = decorator_mean_score
        work_on_intervals = False
    elif options.decorator == "stddev-score":
        decorator = decorator_stddev_score
        work_on_intervals = False
    elif options.decorator == "min-score":
        decorator = decorator_min_score
        work_on_intervals = False
    elif options.decorator == "max-score":
        decorator = decorator_max_score
        work_on_intervals = False
    else:
        raise ValueError("unknown decorator %s" % options.decorator)

    for start, end in windows:

        # counts/length before/after transformation
        n1, l1, n2, l2 = 0, 0, 0, 0

        values, intervals_with_gff, genes, transcripts = [], [], set(), set()

        try:
            for istart, iend, value in index.get(contig, start, end):
                n1 += 1
                l1 += iend - istart
                intervals_with_gff.append((istart, iend, value))
                values.append(value.score)
                if is_gtf:
                    genes.add(value.gene_id)
                    transcripts.add(value.transcript_id)
        except KeyError:
            # contig has no indexed intervals
            pass

        if n1 == 0 and options.skip_empty:
            continue

        if work_on_intervals:

            if options.loglevel >= 3:
                # BUG FIX: the original referenced `intervals` before it
                # was assigned, raising NameError at loglevel >= 3.
                options.stdlog.write(
                    "# intervals in window %i:%i before transformation: %s\n" %
                    (start, end, str(intervals_with_gff)))

            intervals = transform(start, end, intervals_with_gff)

            for xstart, xend in intervals:
                n2 += 1
                l2 += xend - xstart

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# intervals in window %i:%i after transformation: %s\n" %
                    (start, end, str(intervals)))

            score, extra_info = decorator(intervals, start, end, contig, fasta)

        else:
            if len(values) > 0:
                values = list(map(float, values))
                score, extra_info = decorator(values, start, end, contig)
            else:
                score, extra_info = 0, None

            l2 = 0
            n2 = 0

        if is_gtf:
            ngenes, ntranscripts = len(genes), len(transcripts)
        else:
            ngenes, ntranscripts = 0, 0

        if extra_info:
            # extra_info is appended as a single column: tabs inside it
            # would break the output format
            extra_info = re.sub("\t", ";", extra_info)
        options.stdout.write("\t".join(
            map(str, (contig, start, end, ngenes, ntranscripts, n1, l1, n2, l2,
                      score, extra_info))) + "\n")
Exemplo n.º 9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Filters a genome-mapped bam file against transcript models,
    junction reads and masked regions, writing the accepted reads to
    stdout and, optionally, mismapped reads and statistics to files.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        # invert the map: we want transcript number -> transcript name
        id_map = IOTools.read_map(
            IOTools.open_file(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    # transcript not present in the map: skip it
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome,
                                                  "rb")
    else:
        transcripts_samfile = None

    # main output goes to stdout, either as sam ("wh") or bam ("wb")
    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()
    # BUG FIX: the junctions bam file was opened but never closed
    if junctions_samfile:
        junctions_samfile.close()

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 10
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Concatenates the supplied bed files, merges overlapping intervals,
    and counts for each merged interval in how many of the input
    samples it occurs.  The resulting table is written to stdout.
    """
    # local import: only needed for the temp-file cleanup below
    import os

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("--bed-file",
                      dest="infiles",
                      type="string",
                      metavar="bed",
                      help="supply list of bed files",
                      action="append")

    parser.set_defaults(infiles=[])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    options.infiles.extend(args)
    if len(options.infiles) == 0:
        raise ValueError('please provide at least 1 bed file')

    E.info("concatenating bed files")
    # concatenate the list of files
    tmp = tempfile.NamedTemporaryFile(delete=False, mode="w")
    tmp_merge = tempfile.NamedTemporaryFile(delete=False, mode="w")
    infs = options.infiles
    for inf in infs:
        for bed in Bed.iterator(IOTools.open_file(inf)):
            tmp.write("%s\n" % bed)
    tmp.close()

    E.info("merging bed entries")
    # merge the bed entries in the file
    name = tmp.name
    tmp_bed = pybedtools.BedTool(name)
    tmp_bed.sort().merge().saveas(tmp_merge.name)
    tmp_merge.close()

    E.info("indexing bed entries")
    # index the bed entries
    merged = IndexedGenome.Simple()
    for bed in Bed.iterator(IOTools.open_file(tmp_merge.name)):
        merged.add(bed.contig, bed.start, bed.end)

    # BUG FIX: the temporary files were created with delete=False and
    # never removed; clean them up once they have been consumed.
    os.unlink(tmp.name)
    os.unlink(tmp_merge.name)

    counts = collections.defaultdict(int)
    # list of samples
    samples = options.infiles

    E.info("counting no. samples overlapping each interval")
    for sample in samples:
        # count each merged interval at most once per sample
        found = set()
        for bed in Bed.iterator(IOTools.open_file(sample)):
            if merged.contains(bed.contig, bed.start, bed.end):
                key = [bed.contig] + \
                    [x for x in merged.get(bed.contig, bed.start, bed.end)]
                key = (key[0], key[1][0], key[1][1])
                if key in found:
                    continue
                found.add(key)

                # tuple of interval description as key - (contig, start, end)
                counts[key] += 1

    # open outfile
    options.stdout.write("contig\tstart\tend\tcount\n")

    E.info("outputting result")
    for interval, count in sorted(counts.items()):
        options.stdout.write("\t".join(map(str, interval)) + "\t" +
                             str(count) + "\n")

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 11
0
def annotateWindows(contig, windows, gff_data, fasta, options):
    """annotate windows.

    For each (start, end) window on *contig*, collect overlapping
    intervals from *gff_data*, optionally transform them and compute a
    score with the configured decorator.  One gff entry with feature
    ``count`` is written per window to ``options.stdout``.

    :param contig: contig the windows are on.
    :param windows: iterable of (start, end) tuples.
    :param gff_data: gff/gtf entries used for annotation.
    :param fasta: indexed genome (used by interval decorators).
    :param options: command line options (transform, decorator, ...).
    """

    index = IndexedGenome.IndexedGenome()
    for g in gff_data:
        index.add(g.contig, g.start, g.end, g)

    # template entry re-used for every output window
    w = GTF.Entry()
    w.contig = contig
    w.feature = "count"

    is_gtf = options.is_gtf

    # select how overlapping intervals are transformed per window
    if options.transform == "none":
        # BUG FIX: return a list rather than a lazy map() object so the
        # result can be iterated and logged more than once on Python 3.
        def transform(start, end, intervals):
            return [(x[0], x[1]) for x in intervals]
    elif options.transform == "overlap":
        transform = transform_overlap
    elif options.transform == "complement":
        transform = transform_complement
    elif options.transform == "third_codon":
        transform = transform_third_codon
    else:
        raise ValueError("unknown transform %s" % options.transform)

    # select the scoring decorator; score-based decorators operate on
    # the raw values rather than on intervals.
    work_on_intervals = True
    if options.decorator == "counts":
        decorator = decorator_counts
    elif options.decorator == "mean-length":
        decorator = decorator_mean_length
    elif options.decorator == "median-length":
        decorator = decorator_median_length
    elif options.decorator == "percent-coverage":
        decorator = decorator_percent_coverage
    elif options.decorator == "gc":
        decorator = decorator_percent_gc
    elif options.decorator == "median-score":
        decorator = decorator_median_score
        work_on_intervals = False
    elif options.decorator == "mean-score":
        decorator = decorator_mean_score
        work_on_intervals = False
    elif options.decorator == "stddev-score":
        decorator = decorator_stddev_score
        work_on_intervals = False
    elif options.decorator == "min-score":
        decorator = decorator_min_score
        work_on_intervals = False
    elif options.decorator == "max-score":
        decorator = decorator_max_score
        work_on_intervals = False
    else:
        raise ValueError("unknown decorator %s" % options.decorator)

    for start, end in windows:

        # counts/length before/after transformation
        n1, l1, n2, l2 = 0, 0, 0, 0

        values, intervals_with_gff, genes, transcripts = [], [], set(), set()

        try:
            for istart, iend, value in index.get(contig, start, end):
                n1 += 1
                l1 += iend - istart
                intervals_with_gff.append((istart, iend, value))
                values.append(value.score)
                if is_gtf:
                    genes.add(value.gene_id)
                    # NOTE(review): attribute is spelled ``mTransciptId``
                    # here but ``transcript_id`` in sibling code - confirm
                    # which name the entry class actually provides.
                    transcripts.add(value.mTransciptId)
        except KeyError:
            # contig has no indexed intervals
            pass

        if n1 == 0 and options.skip_empty:
            continue

        if work_on_intervals:

            if options.loglevel >= 3:
                # BUG FIX: the original referenced `intervals` before it
                # was assigned, raising NameError at loglevel >= 3.
                options.stdlog.write(
                    "# intervals in window %i:%i before transformation: %s\n" %
                    (start, end, str(intervals_with_gff)))

            intervals = transform(start, end, intervals_with_gff)

            for xstart, xend in intervals:
                n2 += 1
                l2 += xend - xstart

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# intervals in window %i:%i after transformation: %s\n" %
                    (start, end, str(intervals)))

            w.score, extra_info = decorator(intervals, start, end, contig,
                                            fasta)

        else:
            if len(values) > 0:
                # materialize: decorators may need len()/indexing, and a
                # bare map() would be exhausted after one pass (Python 3)
                values = list(map(float, values))
                w.score, extra_info = decorator(values, start, end, contig)
            else:
                w.score, extra_info = 0, None

            l2 = 0
            n2 = 0
        w.start = start
        w.end = end
        w.clearAttributes()
        w.addAttribute("n1", n1)
        w.addAttribute("l1", l1)
        w.addAttribute("n2", n2)
        w.addAttribute("l2", l2)
        if extra_info:
            w.addAttribute("extra", extra_info)
        options.stdout.write(str(w) + "\n")