Example #1
def plot_manhattan_plot(dataframe,
                        section,
                        filename_fasta,
                        map_key2label={},
                        **kwargs):

    plotter = ManhattanPlot(genome_size_file=filename_fasta)
    ax = plotter(dataframe, **kwargs)
    plt.savefig(E.get_output_file(section))
    plt.close()
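
A minimal usage sketch for the helper above, assuming a pandas dataframe with whatever columns ManhattanPlot expects (e.g. contig, position, p-value) and a FASTA file from which contig sizes can be derived; the filenames and input layout are hypothetical:

import pandas

df = pandas.read_csv("association_results.tsv", sep="\t")  # hypothetical input
plot_manhattan_plot(df,
                    section="manhattan",
                    filename_fasta="genome.fasta")  # extra kwargs reach the plotter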
Example #2
def plot_mutation_profile_bar_plot(dataframe,
                                   section,
                                   map_key2label={},
                                   **kwargs):

    for key, group_df in dataframe.groupby(by="sample"):
        if key == "unique":
            continue

        if group_df.empty:
            E.warn("no data for {}".format(key))
            continue

        ax = MutationProfileBarPlot()(group_df)

        label = map_key2label.get(key, key)
        plt.savefig(E.get_output_file("-".join((section, label))))
        plt.close()
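
A similar hedged sketch for the bar-plot helper; the only firm requirement visible in the code is a "sample" column to group on, while map_key2label optionally maps sample keys to nicer labels for the output filenames:

import pandas

df = pandas.read_csv("mutation_profiles.tsv", sep="\t")  # hypothetical input
plot_mutation_profile_bar_plot(df,
                               section="mutation-profile",
                               map_key2label={"S1": "sample-one"})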
Example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend",
                      dest="extension",
                      type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size",
                      dest="shift",
                      type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations",
                      type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment", "dmr",
                               "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w",
                      "--bigwig-file",
                      dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment",
                      dest="treatment_files",
                      type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control",
                      dest="control_files",
                      type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input",
                      dest="input_files",
                      type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip",
                      action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata",
                      dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file",
                      dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method",
                      dest="fdr_method",
                      type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa",
                      dest="bwa",
                      action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique",
                      dest="unique",
                      type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms",
                      dest="chroms",
                      type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" %
                        (line['chr'], int(line['start']), int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
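    # e.g. chroms="chr19,chr22" yields chrstring
    # ' chr.select=c("chr19","chr22"), ', which is spliced verbatim into
    # each MEDIPS call below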
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x
                                    for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.open_file(E.get_output_file("enrichment.tsv.gz"),
                                    "w")
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # slot missing from the R result: write an empty field
                    outfile.write("\t")
                else:
                    outfile.write("\t%s" % (pattern % value[0]))
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' % ",".join([
                "treatment_R%i" % x
                for x in range(len(options.treatment_files))
            ]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' % ",".join([
                    "control_R%i" % x
                    for x in range(len(options.control_files))
                ]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')), str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)
        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')), str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' % E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()
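
The script above calls an isPaired() helper that is not part of this excerpt. A minimal sketch of what it presumably does: peek at the first reads of the BAM file with pysam and return the string "TRUE"/"FALSE" so the value can be interpolated directly into the R calls; the real helper may differ:

import pysam

def isPaired(filename, nreads=1000):
    """Guess whether a BAM file contains paired-end reads.

    Hypothetical reimplementation; returns R-style "TRUE"/"FALSE".
    """
    with pysam.AlignmentFile(filename) as inf:
        for read in inf.head(nreads):
            if read.is_paired:
                return "TRUE"
    return "FALSE"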
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--input-bed-file",
        dest="input_bed_file",
        type="string",
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. [%default]")

    parser.add_option(
        "-m",
        "--merge-intervals",
        dest="merge_intervals",
        action="store_true",
        help="merge intervals in bed file. Useful if you have a site bed-file "
        "[%default]")

    parser.add_option("-f",
                      "--reference-fasta-file",
                      dest="reference_fasta_file",
                      help="reference genomic sequence in fasta format. "
                      "[%default]")

    parser.add_option(
        "-c",
        "--barcode-fasta-file",
        dest="barcode_fasta_file",
        help="barcode sequence in fasta format. Variable positions "
        "should be marked by N "
        "[%default]")

    parser.set_defaults(
        reference_fasta_file=None,
        barcode_fasta_file=None,
        merge_intervals=False,
        input_bed_file=None,
        anchor=5,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.stdin != sys.stdin:
        bamfile = options.stdin.name
    elif args:
        if len(args) > 1:
            raise ValueError("multiple bam files provided in arguments")
        bamfile = args[0]
    else:
        bamfile = "-"

    if options.barcode_fasta_file:
        with pysam.FastxFile(options.barcode_fasta_file) as inf:
            barcode_sequence = next(inf).sequence
    else:
        barcode_sequence = None

    if not os.path.exists(options.reference_fasta_file):
        raise OSError("reference fasta file {} does not exist".format(
            options.reference_fasta_file))

    if not os.path.exists(options.input_bed_file):
        raise OSError("input bed file {} does not exist".format(
            options.input_bed_file))

    bed_in = pysam.TabixFile(options.input_bed_file)
    pysam_in = pysam.AlignmentFile(bamfile)
    anchor = options.anchor

    for region_idx, vals in enumerate(
            iterate_bed(bed_in, options.merge_intervals)):

        if region_idx > 0:
            raise NotImplementedError(
                "output for multiple regions not yet implemented")

        contig, region_start, region_end = vals
        upstream_anchors, downstream_anchors = [], []
        counter = E.Counter()

        unaligned_fn = E.get_output_file(
            "unaligned_{}.fasta".format(region_idx))
        with IOTools.open_file(unaligned_fn, "w") as outf:
            for read in pysam_in.fetch(contig, region_start, region_end):
                counter.overlapping_reads += 1
                try:
                    pairs = read.get_aligned_pairs(with_seq=True)
                except ValueError:
                    counter.no_md_tag += 1
                    continue

                map_ref2read_pos = dict(
                    (x[1], x[0]) for x in pairs if x[0] is not None)
                map_ref2ref_base = dict(
                    (x[1], x[2]) for x in pairs if x[0] is not None)
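                # get_aligned_pairs(with_seq=True) reconstructs the
                # reference from the MD tag; mismatching reference bases
                # are returned in lowercase, which the isupper() anchor
                # checks below rely on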

                upstream_anchor = "".join(
                    map_ref2ref_base.get(x, "")
                    for x in range(region_start - anchor, region_start))

                downstream_anchor = "".join(
                    map_ref2ref_base.get(x, "")
                    for x in range(region_end, region_end + anchor))

                # check if at least one anchor is aligned
                upstream_matches = sum([x.isupper() for x in upstream_anchor])
                downstream_matches = sum(
                    [x.isupper() for x in downstream_anchor])

                if upstream_matches < anchor and downstream_matches < anchor:
                    counter.no_anchor += 1
                    continue
                seq = read.query_alignment_sequence

                # collect full length anchors
                upstream_anchor_start, upstream_anchor_end = region_start - anchor, region_start
                downstream_anchor_start, downstream_anchor_end = region_end, region_end + anchor

                if upstream_anchor_start in map_ref2read_pos and upstream_anchor_end in map_ref2read_pos:
                    upstream_anchors.append(
                        seq[map_ref2read_pos[upstream_anchor_start]:
                            map_ref2read_pos[upstream_anchor_end]])
                if downstream_anchor_start in map_ref2read_pos and downstream_anchor_end in map_ref2read_pos:
                    downstream_anchors.append(
                        seq[map_ref2read_pos[downstream_anchor_start]:
                            map_ref2read_pos[downstream_anchor_end]])

                # get region to align
                read_start = min(
                    (map_ref2read_pos.get(x, len(seq))
                     for x in range(region_start - anchor, region_start)))
                if read_start == len(seq):
                    read_start = 0
                read_end = max(
                    (map_ref2read_pos.get(x, 0) + 1
                     for x in range(region_end, region_end + anchor)))
                if read_end == 1:
                    read_end = len(seq)
                counter.collected_reads += 1
                outf.write(">{}/{}-{}\n{}\n".format(read.query_name,
                                                    read_start, read_end,
                                                    seq[read_start:read_end]))
        counter.downstream_anchors = len(downstream_anchors)
        counter.upstream_anchors = len(upstream_anchors)

        E.info(counter)

        if counter.overlapping_reads == 0:
            E.warn("no sequences overlapping region")
            continue

        if counter.downstream_anchors == 0 or counter.upstream_anchors == 0:
            E.warn("at least one anchor undefined")
            continue

        if counter.collected_reads == 1:
            E.warn("only single sequence, multiple aligment skipped")
            with IOTools.open_file(unaligned_fn) as inf:
                stdout = inf.read()
        else:
            # G-INS-i -> global alignment algorithm
            E.info("starting mafft multiple alignment")
            stdout = E.run(
                "mafft --globalpair --maxiterate 100 --quiet --op 2 --ep 0.5 {}"
                .format(unaligned_fn),
                return_stdout=True)

        aligned_fn = E.get_output_file("aligned_{}.fasta".format(region_idx))
        with IOTools.open_file(aligned_fn, "w") as outf:
            outf.write(stdout)

        mali = stdout.splitlines()
        identifiers = [mali[x] for x in range(0, len(mali), 2)]
        sequences = [mali[x].upper() for x in range(1, len(mali), 2)]
        consensus = get_consensus(sequences)

        E.info("after alignment: consensus={}".format(consensus))

        # gap filtering -> remove highly gappy columns
        consensus = get_consensus(sequences, min_gap_proportion=0.9)

        E.info("after anchor trimming: consensus={}".format(consensus))

        take = [idx for idx, x in enumerate(consensus) if x != "-"]
        sequences = ["".join([s[x] for x in take]) for s in sequences]
        consensus = get_consensus(sequences, min_gap_proportion=0.9)

        E.info("after gap filtering: consensus={}".format(consensus))

        # get anchor consensus and chop it off
        consensus = get_consensus(sequences, ignore_gaps=True)
        upstream_anchor = get_anchor_consensus(upstream_anchors)
        downstream_anchor = get_anchor_consensus(downstream_anchors)

        upstream_anchor_start = consensus.find(upstream_anchor)
        downstream_anchor_start = consensus.rfind(downstream_anchor)

        E.info(
            "anchor consensus (no gaps)={}, upstream={}, downstream={}, upstream_idx={}, downstream_idx={}"
            .format(consensus, upstream_anchor, downstream_anchor,
                    upstream_anchor_start, downstream_anchor_start))

        if upstream_anchor_start < 0 or downstream_anchor_start < 0:
            E.warn("can't locate anchor, no output produced")
            continue

        upstream_anchor_end = upstream_anchor_start + len(upstream_anchor)
        if upstream_anchor_end >= downstream_anchor_start:
            E.warn("anchor not in correct order, no output produced")
            continue

        sequences = [
            x[upstream_anchor_end:downstream_anchor_start] for x in sequences
        ]
        consensus = get_consensus(sequences)

        E.info("after anchor trimming: consensus={}".format(consensus))

        truncated_fn = E.get_output_file(
            "aligned_truncated_{}.fasta".format(region_idx))
        with IOTools.open_file(truncated_fn, "w") as outf:
            outf.write("\n".join("{}\n{}\n".format(x, y)
                                 for x, y in zip(identifiers, sequences)))

        positions = list(zip(*sequences))
        bases = ["A", "C", "G", "T"]
        df = pandas.DataFrame([collections.Counter(x)
                               for x in positions]).fillna(0)
        for missing_base in [x for x in bases if x not in df.columns]:
            df[missing_base] = 0
        df["gapped_depth"] = df.sum(axis=1)
        df["depth"] = df[bases].sum(axis=1)
        df["consensus"] = df[bases].idxmax(axis=1)
        df["consensus_counts"] = df.lookup(df.index, df.consensus)
        df["consensus_support"] = df.consensus_counts / df.depth
        df["offconsensus_counts"] = df.depth - df.consensus_counts
        df.loc[df.consensus_counts == 0, "consensus"] = "N"
        df["region_id"] = region_idx

        # replace "gap" consensus positions with + character
        alignment = global_align(re.sub("-", "+", consensus), barcode_sequence)
        E.info("alignment: consensus {}".format(alignment[0]))
        E.info("alignment: barcode   {}".format(alignment[1]))

        barcode_idx = 0
        deleted_barcode_bases = []
        rows = []
        for c, b in zip(*alignment):
            if c == "-":
                deleted_barcode_bases.append(barcode_idx)
                barcode_idx += 1
            elif b == "N":
                rows.append((barcode_idx, "variable"))
                barcode_idx += 1
            elif b == "-":
                rows.append(("", "insertion"))
            elif b == c:
                rows.append((barcode_idx, "fixed-match"))
                barcode_idx += 1
            else:
                rows.append((barcode_idx, "fixed-mismatch"))
                barcode_idx += 1

        alignment_df = pandas.DataFrame.from_records(
            rows, columns=["barcode_pos", "barcode_class"])

        assert len(alignment_df) == len(df)
        df = pandas.concat([df, alignment_df], axis=1)
        with E.open_output_file("pileup") as outf:
            df.to_csv(outf, sep="\t", index=True, index_label="position")

        observed_barcode_sequence = "".join(
            df[df.barcode_class == "variable"].consensus)
        headers = df.consensus_support.describe().index
        eval_df = df.loc[df.barcode_class.isin(
            ("variable", "fixed-match", "fixed-mismatch")), ]
        median_consensus_depth = eval_df.consensus_counts.median()
        # zero stuff out if depth is low
        if median_consensus_depth <= 2:
            deleted_barcode_bases = []

        outf = options.stdout
        # modules to recover partial bar-codes
        outf.write("\t".join(
            map(str, [
                "barcode", "ndeleted_barcode_bases", "deleted_barcode_bases"
            ] + ["support_{}".format(x)
                 for x in headers] + ["counts_{}".format(x) for x in headers] +
                ["offcounts_{}".format(x) for x in headers])) + "\n")

        outf.write("\t".join(
            map(str, [
                observed_barcode_sequence,
                len(deleted_barcode_bases), ",".join(
                    map(str, deleted_barcode_bases))
            ] + eval_df.consensus_support.describe().tolist() +
                eval_df.consensus_counts.describe().tolist() +
                eval_df.offconsensus_counts.describe().tolist())) + "\n")

    E.stop()
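
Example #4 leans on get_consensus and get_anchor_consensus, both defined outside this excerpt. A rough sketch of a column-wise majority-vote consensus consistent with the call sites above; the parameter names mirror those calls, but the real implementations may differ (get_anchor_consensus presumably takes a plain majority vote over the collected anchor strings):

import collections

def get_consensus(sequences, min_gap_proportion=None, ignore_gaps=False):
    """Hypothetical majority-vote consensus over alignment columns."""
    consensus = []
    for column in zip(*sequences):
        counts = collections.Counter(column)
        if ignore_gaps:
            counts.pop("-", None)
        # mark highly gappy columns so the caller can filter them out
        if min_gap_proportion is not None and \
           counts.get("-", 0) >= min_gap_proportion * len(column):
            consensus.append("-")
            continue
        consensus.append(counts.most_common(1)[0][0] if counts else "-")
    return "".join(consensus)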
Example #5
def main(argv=sys.argv):

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--session",
                      dest="session",
                      type="string",
                      help="load session before creating plots "
                      "[%default]")

    parser.add_option("-d",
                      "--snapshot-dir",
                      dest="snapshotdir",
                      type="string",
                      help="directory to save snapshots in [%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("png", "eps", "svg"),
                      help="output file format [%default]")

    parser.add_option("-o",
                      "--host",
                      dest="host",
                      type="string",
                      help="host that IGV is running on [%default]")

    parser.add_option("-p",
                      "--port",
                      dest="port",
                      type="int",
                      help="port that IGV listens at [%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extend",
                      type="int",
                      help="extend each interval by a number of bases "
                      "[%default]")

    parser.add_option("-x",
                      "--expand",
                      dest="expand",
                      type="float",
                      help="expand each region by a certain factor "
                      "[%default]")

    parser.add_option("--session-only",
                      dest="session_only",
                      action="store_true",
                      help="plot session after opening, "
                      "ignore intervals "
                      "[%default]")

    parser.add_option("-n",
                      "--name",
                      dest="name",
                      type="choice",
                      choices=("bed-name", "increment"),
                      help="name to use for snapshot "
                      "[%default]")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        keep_open=False,
        # options.new_instance is read below but no --new-instance option
        # is defined in this excerpt; a default keeps the script runnable
        new_instance=False,
        name="bed-name",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    igv_process = None
    if options.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=options.command, port=options.port)
        E.info("new IGV process started")

    E.info("connection to process on %s:%s" % (options.host, options.port))
    E.info("saving images in %s" % options.snapshotdir)
    igv = IGV(host=options.host,
              port=options.port,
              snapshot_dir=os.path.abspath(options.snapshotdir))

    if options.session:
        E.info('loading session from %s' % options.session)
        igv.load(options.session)
        E.info('loaded session')

    if options.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(options.session), options.format)
        E.info("writing snapshot to '%s'" %
               os.path.join(options.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in pysam.tabix_iterator(options.stdin, parser=pysam.asBed()):

            c.input += 1

            # IGV cannot deal with whitespace in filenames
            if options.name == "bed-name":
                name = re.sub(r"\s", "_", bed.name)
            elif options.name == "increment":
                name = str(c.input)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = options.extend
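            # --expand scales the interval to expand * width, so each
            # side gains (expand * width - width) / 2 bases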
            if options.expand:
                d = end - start
                extend = max(extend, (options.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = E.get_output_file("%s.%s" % (name, options.format))
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not options.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.stop()
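
The IGV wrapper used above drives a running IGV instance through its batch-command port. A bare-bones sketch of that protocol using IGV's documented batch commands (goto, snapshotDirectory, snapshot); the IGV class in the script presumably wraps something similar, and the port below is only this script's default:

import socket

def igv_command(command, host="127.0.0.1", port=61111):
    # send one batch command and read IGV's one-line reply
    with socket.create_connection((host, port)) as sock:
        sock.sendall(command.encode("ascii") + b"\n")
        return sock.makefile().readline().strip()

# igv_command("snapshotDirectory /tmp/snapshots")
# igv_command("goto chr1:10000-20000")
# igv_command("snapshot region.png")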
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
        """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=(
                          "geneprofile",
                          "tssprofile",
                          "utrprofile",
                          "intervalprofile",
                          "midpointprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "separateexonprofile",
                          "separateexonprofilewithintrons",
                      ),
                      help='counters to use. Counters describe the '
                      'meta-gene structure to use. '
                      'Note using geneprofilewithintrons, or '
                      'geneprofileabsolutedistancefromthreeprimeend will '
                      'automatically turn on the --use-base-accuracy option '
                      '[%default].')

    parser.add_option("-b",
                      "--bam-file",
                      "--bedfile",
                      "--bigwigfile",
                      dest="infiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="BAM/bed/bigwig files to use. Do not mix "
                      "different types [%default]")

    parser.add_option("-c",
                      "--control-bam-file",
                      dest="controlfiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="control/input to use. Should be of the same "
                      "type as the bam/bed/bigwig file"
                      " [%default]")

    parser.add_option("-g",
                      "--gtf-file",
                      dest="gtffile",
                      type="string",
                      metavar="GTF",
                      help="GTF file to use. "
                      "[%default]")

    parser.add_option("--normalize-transcript",
                      dest="transcript_normalization",
                      type="choice",
                      choices=("none", "max", "sum", "total-max", "total-sum"),
                      help="normalization to apply on each transcript "
                      "profile before adding to meta-gene profile. "
                      "[%default]")

    parser.add_option("--normalize-profile",
                      dest="profile_normalizations",
                      type="choice",
                      action="append",
                      choices=("all", "none", "area", "counts", "background"),
                      help="normalization to apply on meta-gene "
                      "profile normalization. "
                      "[%default]")

    parser.add_option(
        "-r",
        "--reporter",
        dest="reporter",
        type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'genes` is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option("-i",
                      "--shift-size",
                      dest="shifts",
                      type="int",
                      action="append",
                      help="shift reads in :term:`bam` formatted file "
                      "before computing densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-a",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge pairs in :term:`bam` formatted "
                      "file before computing "
                      "densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-u",
                      "--use-base-accuracy",
                      dest="base_accuracy",
                      action="store_true",
                      help="compute densities with base accuracy. The default "
                      "is to only use the start and end of the aligned region "
                      "(RNA-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extends",
                      type="int",
                      action="append",
                      help="extend reads in :term:`bam` formatted file "
                      "(ChIP-Seq). "
                      "[%default]")

    parser.add_option("--resolution-upstream",
                      dest="resolution_upstream",
                      type="int",
                      help="resolution of upstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream",
                      dest="resolution_downstream",
                      type="int",
                      help="resolution of downstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-upstream-utr",
                      dest="resolution_upstream_utr",
                      type="int",
                      help="resolution of upstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream-utr",
                      dest="resolution_downstream_utr",
                      type="int",
                      help="resolution of downstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-cds",
                      dest="resolution_cds",
                      type="int",
                      help="resolution of cds region in bp "
                      "[%default]")

    parser.add_option("--resolution-first-exon",
                      dest="resolution_first",
                      type="int",
                      help="resolution of first exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-last-exon",
                      dest="resolution_last",
                      type="int",
                      help="resolution of last exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-introns",
                      dest="resolution_introns",
                      type="int",
                      help="resolution of introns region in bp "
                      "[%default]")

    parser.add_option("--resolution-exons-absolute-distance-topolya",
                      dest="resolution_exons_absolute_distance_topolya",
                      type="int",
                      help="resolution of exons absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--resolution-introns-absolute-distance-topolya",
                      dest="resolution_introns_absolute_distance_topolya",
                      type="int",
                      help="resolution of introns absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--extension-exons-absolute-distance-topolya",
                      dest="extension_exons_absolute_distance_topolya",
                      type="int",
                      help="extension for exons from the absolute "
                      "distance from the topolya in bp "
                      "[%default]")

    parser.add_option(
        "--extension-introns-absolute-distance-topolya",
        dest="extension_introns_absolute_distance_topolya",
        type="int",
        help="extension for introns from the absolute distance from "
        "the topolya in bp [%default]")

    parser.add_option("--extension-upstream",
                      dest="extension_upstream",
                      type="int",
                      help="extension upstream from the first exon in bp"
                      "[%default]")

    parser.add_option("--extension-downstream",
                      dest="extension_downstream",
                      type="int",
                      help="extension downstream from the last exon in bp"
                      "[%default]")

    parser.add_option("--extension-inward",
                      dest="extension_inward",
                      type="int",
                      help="extension inward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--extension-outward",
                      dest="extension_outward",
                      type="int",
                      help="extension outward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--scale-flank-length",
                      dest="scale_flanks",
                      type="int",
                      help="scale flanks to (integer multiples of) gene length"
                      "[%default]")

    parser.add_option(
        "--control-factor",
        dest="control_factor",
        type="float",
        help="factor for normalizing control and foreground data. "
        "Computed from data if not set. "
        "[%default]")

    parser.add_option("--output-all-profiles",
                      dest="output_all_profiles",
                      action="store_true",
                      help="keep individual profiles for each "
                      "transcript and output. "
                      "[%default]")

    parser.add_option("--counts-tsv-file",
                      dest="input_filename_counts",
                      type="string",
                      help="filename with count data for each transcript. "
                      "Use this instead "
                      "of recomputing the profile. Useful for plotting the "
                      "meta-gene profile "
                      "from previously computed counts "
                      "[%default]")

    parser.add_option(
        "--background-region-bins",
        dest="background_region_bins",
        type="int",
        help="number of bins on either end of the profile "
        "to be considered for background meta-gene normalization "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3kb is a good balance: long enough to see 3' bias without
        # omitting too many genes. Tim, 31st Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        # introns are only used to assess the noise level and thus do not
        # need a long region; a long region has the side effect of
        # omitting more genes. Tim, 31st Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        # extension can simply just be the same as resolution
        extension_exons_absolute_distance_topolya=3000,
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        resolution_first=1000,
        resolution_last=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        transcript_normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region_bins=10,
        input_filename_counts=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # Keep for backwards compatibility
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = IOTools.open_file(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    for method_requiring_base_accuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        # Methods that should not report spliced-out introns or exons as
        # covered by non-existent reads must imply --use-base-accuracy;
        # add any such method to the list above.
        if method_requiring_base_accuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile))

    # Select rangecounter based on file type
    if len(options.infiles) > 0:
        if options.infiles[0].endswith(".bam"):
            bamfiles = [pysam.AlignmentFile(x, "rb") for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.AlignmentFile(x, "rb") for x in options.controlfiles
                ]
            else:
                controlfiles = None

            format = "bam"
            if options.merge_pairs:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    merge_pairs=options.merge_pairs,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.shifts or options.extends:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.base_accuracy:
                range_counter = geneprofile.RangeCounterBAMBaseAccuracy(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            else:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bed.gz"):
            bedfiles = [pysam.Tabixfile(x) for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.Tabixfile(x) for x in options.controlfiles
                ]
            else:
                controlfiles = None

            range_counter = geneprofile.RangeCounterBed(
                bedfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bw"):
            wigfiles = [BigWigFile(file=open(x, "rb")) for x in options.infiles]
            range_counter = geneprofile.RangeCounterBigWig(wigfiles)

        else:
            raise NotImplementedError("can't determine file type for %s" %
                                      str(options.infiles))

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                ))

        elif method == "geneprofile":
            counters.append(
                geneprofile.GeneCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                geneprofile.GeneCounterWithIntrons(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream, options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            # options.extension_exons_absolute_distance_tostartsite,
            # options.extension_introns_absolute_distance_tostartsite,
            # Tim, 31st Aug 2013: a possible future feature if 5' bias is
            # of interest (it would need another class). Deriving one from
            # this class is not difficult, but it is not implemented yet.
            # It differs slightly from the TSS profile already implemented
            # because introns would be skipped.
            counters.append(
                geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter, options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    options.extension_upstream, options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                geneprofile.TSSCounter(range_counter,
                                       options.extension_outward,
                                       options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                geneprofile.RegionCounter(range_counter,
                                          options.resolution_upstream,
                                          options.resolution_cds,
                                          options.resolution_downstream,
                                          options.extension_upstream,
                                          options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                geneprofile.MidpointCounter(range_counter,
                                            options.resolution_upstream,
                                            options.resolution_downstream,
                                            options.extension_upstream,
                                            options.extension_downstream))

        # method that splits the first and last exons out separately;
        # requires a representative transcript for each gene and a GTF
        # sorted by gene position
        elif method == "separateexonprofile":
            counters.append(
                geneprofile.SeparateExonCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream))

        elif method == "separateexonprofilewithintrons":
            counters.append(
                geneprofile.SeparateExonWithIntronCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.transcript_normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                IOTools.open_file(
                    E.get_output_file(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(
            IOTools.open_file(options.input_filename_counts),
            sep='\t', header=0, index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')
        # build counter based on reference counter
        counter = geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        feature_names = geneprofile.countFromGTF(counters, gtf_iterator)

    # determine the set of normalizations to apply to the output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = [
            "none", "area", "counts", "background"
        ]

    for method, counter in zip(options.methods, counters):
        profiles = []
        for norm in options.profile_normalizations:
            # build matrix, apply normalization
            profile = counter.getProfile(
                normalize=norm,
                background_region_bins=options.background_region_bins)
            profiles.append(profile)

        for x in range(1, len(profiles)):
            assert profiles[0].shape == profiles[x].shape

        # build a single matrix of all profiles for output
        matrix = numpy.concatenate(profiles)
        matrix.shape = len(profiles), len(profiles[0])
        matrix = matrix.transpose()
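        # the transposed matrix has one row per bin and one column per
        # normalization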

        with IOTools.open_file(
                E.get_output_file(counter.name) + ".matrix.tsv.gz",
                "w") as outfile:
            outfile.write("bin\tregion\tregion_bin\t%s\n" %
                          "\t".join(options.profile_normalizations))
            fields = []
            bins = []
            for field, nbins in zip(counter.fields, counter.nbins):
                fields.extend([field] * nbins)
                bins.extend(list(range(nbins)))

            for row, (field, nbin, values) in enumerate(
                    zip(fields, bins, matrix)):
                outfile.write("%i\t%s\t%i\t%s\n" %
                              (row, field, nbin,
                               "\t".join([str(x) for x in values])))

        with IOTools.open_file(
                E.get_output_file(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile", "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile", "intervalprofile",
                          "separateexonprofile",
                          "separateexonprofilewithintrons"):

                plt.figure()
                plt.subplots_adjust(wspace=0.05)
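                # one panel per segment on a shared y-scale; the fixed
                # 6-row layout assumes at most six segments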
                max_scale = max([max(x) for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(6, 1, x + 1)
                    plt.plot(list(range(len(counts))), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                figname = counter.name + ".full"

                fn = E.get_output_file(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(list(range(len(points))), points)

                # place tick labels at segment midpoints and draw a
                # separator at the end of each segment
                offset, midpoints = 0, []
                for width in cuts:
                    midpoints.append(offset + width // 2)
                    offset += width
                    plt.axvline(offset, color="r", ls="--")

                plt.xticks(midpoints, counter.fields)

                figname = counter.name + ".detail"

                fn = E.get_output_file(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                fn = E.get_output_file(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                fn = E.get_output_file(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

    # write footer and output benchmark information.
    E.stop()
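
The .matrix.tsv.gz files written above are plain tab-separated tables, so they can be inspected without rerunning the counting step. A minimal sketch for reading one back and plotting, assuming a file named "geneprofile.matrix.tsv.gz" and the un-normalized ("none") column; both names are illustrative, not taken from the script:

import gzip

import matplotlib
matplotlib.use("Agg")  # headless backend, as in the script above
import matplotlib.pyplot as plt
import pandas

# hypothetical output file name; substitute the one produced by your run
with gzip.open("geneprofile.matrix.tsv.gz", "rt") as inf:
    df = pandas.read_csv(inf, sep="\t")

# columns are: bin, region, region_bin, then one column per normalization
for region, group in df.groupby("region", sort=False):
    plt.plot(group["bin"], group["none"], label=region)
plt.legend()
plt.savefig("geneprofile.overview.png")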
Example #7
def plot_depth_profile_plot(dataframe, section, map_key2label={}, **kwargs):

    # pass the label map through instead of discarding it
    ax = DepthProfilePlot()(dataframe, map_sample2label=map_key2label)
    plt.savefig(E.get_output_file(section))
    plt.close()
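
A hypothetical call of the helper above; the dataframe layout and sample names are assumptions for illustration, since DepthProfilePlot's expected input is not shown here:

import pandas

df = pandas.DataFrame({"position": list(range(100)),
                       "sampleA": list(range(100)),
                       "sampleB": list(range(100, 0, -1))})
plot_depth_profile_plot(df, section="depth-profile",
                        map_key2label={"sampleA": "tumour"})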
Example #8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--exclusive-overlap",
        dest="exclusive",
        action="store_true",
        help="Intervals reported will be merged across the "
        "positive set and do not overlap any interval in any of the "
        "other sets [default=%default].")

    parser.add_option("-p",
                      "--pattern-identifier",
                      dest="pattern_id",
                      type="string",
                      help="pattern to convert a filename "
                      "to an id [default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("merged-combinations", "unmerged-combinations"),
                      help="method to perform [default=%default]")

    parser.set_defaults(
        pattern_id="(.*).bed.gz",
        exclusive=False,
        method="merged-combinations",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    tags, bedfiles = [], []
    for infile in args:
        bedfiles.append(pysam.Tabixfile(infile, "r"))
        tags.append(re.search(options.pattern_id, infile).groups()[0])

    indices = list(range(len(bedfiles)))
    is_exclusive = options.exclusive

    if options.method == "merged-combinations":

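        # without exclusivity a single-file "combination" is just the file
        # itself, so combinations presumably start at size two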
        if is_exclusive:
            start = 1
        else:
            start = 2

        options.stdout.write("combination\twithout\tcounts\n")

        for ncombinants in range(start, len(bedfiles) + 1):
            for combination in itertools.combinations(indices, ncombinants):
                other = [x for x in indices if x not in combination]
                tag = ":".join([tags[x] for x in combination])
                E.debug("combination %s started" % tag)
                E.debug("other: %s" % ":".join([tags[x] for x in other]))

                other_bed = [bedfiles[x] for x in other]
                outf = IOTools.open_file(E.get_output_file(tag),
                                         "w",
                                         create_dir=True)
                c = E.Counter()
                for contig, start, end in combineMergedIntervals(
                    [bedfiles[x] for x in combination]):
                    c.found += 1
                    if is_exclusive and isContainedInOne(
                            contig, start, end, other_bed):
                        c.removed += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%i\t%i\n" % (contig, start, end))

                outf.close()
                E.info("combination %s finished: %s" % (tag, c))

                options.stdout.write("%s\t%s\t%i\n" %
                                     (":".join([tags[x] for x in combination]),
                                      ":".join([tags[x]
                                                for x in other]), c.output))

    elif options.method == "unmerged-combinations":
        options.stdout.write("track\tcombination\twithout\tcounts\n")

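        # intersect each track in turn with every subset of the remaining
        # tracks; intervals of the foreground track are reported unmerged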
        for foreground in indices:

            background = [x for x in indices if x != foreground]
            for ncombinants in range(0, len(background) + 1):
                for combination in itertools.combinations(
                        background, ncombinants):
                    other = [x for x in background if x not in combination]
                    combination_bed = [bedfiles[x] for x in combination]
                    other_bed = [bedfiles[x] for x in other]
                    tag = ":".join([tags[foreground]] +
                                   [tags[x] for x in combination])

                    E.debug("fg=%i, combination=%s, other=%s" %
                            (foreground, combination, other))
                    E.debug("combination %s started" % tag)
                    E.debug("other: %s" % ":".join([tags[x] for x in other]))

                    outf = IOTools.open_file(E.get_output_file(tag),
                                             "w",
                                             create_dir=True)
                    c = E.Counter()
                    for bed in combineUnmergedIntervals(
                            bedfiles[foreground], combination_bed):
                        c.found += 1
                        if is_exclusive and isContainedInOne(
                                bed.contig, bed.start, bed.end, other_bed):
                            c.removed += 1
                            continue
                        c.output += 1
                        outf.write("%s\n" % str(bed))

                    outf.close()
                    E.info("combination %s finished: %s" % (tag, c))

                    options.stdout.write(
                        "%s\t%s\t%s\t%i\n" %
                        (tags[foreground],
                         ":".join([tags[x] for x in combination]),
                         ":".join([tags[x] for x in other]),
                         c.output))

    E.stop()
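
The helpers combineMergedIntervals, combineUnmergedIntervals and isContainedInOne are defined elsewhere in the module. A minimal sketch of two of them, written against the pysam tabix API and making assumptions about their exact semantics:

import collections
import pysam

def isContainedInOne(contig, start, end, tabixfiles):
    # sketch: does the interval overlap anything in any of the
    # tabix-indexed bed files?
    for tabixfile in tabixfiles:
        try:
            if any(True for _ in tabixfile.fetch(contig, start, end)):
                return True
        except ValueError:
            # contig not present in this index
            continue
    return False

def combineMergedIntervals(bedfiles):
    # sketch: pool all intervals from the given bed files and yield
    # merged (contig, start, end) intervals
    per_contig = collections.defaultdict(list)
    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            for bed in bedfile.fetch(contig, parser=pysam.asBed()):
                per_contig[contig].append((bed.start, bed.end))
    for contig, intervals in sorted(per_contig.items()):
        intervals.sort()
        start, end = intervals[0]
        for next_start, next_end in intervals[1:]:
            if next_start > end:
                yield contig, start, end
                start, end = next_start, next_end
            else:
                end = max(end, next_end)
        yield contig, start, end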