Exemplo n.º 1
0
    def _determine_best_motif_in_cluster(self, clusters, pwm, sample_fa, bg_fa, imgdir=None):
        """Select one representative motif per cluster and write it to ``pwm``.

        For every ``(cluster_motif, single_motifs)`` pair all candidates are
        written to a temporary file and scored with ``self._roc_metrics``.
        The candidate with the highest value in the second mapping returned
        by ``_roc_metrics`` (``mncp``) is kept and renamed to
        ``GimmeMotifs_<n>``, where ``n`` is the 1-based cluster number.

        Parameters
        ----------
        clusters : iterable of (motif, list of motifs)
            Cluster motif together with the single motifs it was built from.
        pwm : str
            Name of the output file; the chosen motifs are written to it.
        sample_fa, bg_fa : str
            Passed through to ``self._roc_metrics``.
        imgdir : str, optional
            If given, a PNG of every chosen motif is created in this directory.

        Returns
        -------
        (dict, dict)
            ``num_cluster`` maps ``"<new id>_<consensus>"`` to the number of
            single motifs in the cluster; ``best_id`` maps the new id to the
            part of the old id before the first underscore.
        """
        num_cluster = {}
        best_id = {}
        # Context managers make sure the output and the temporary files are
        # closed even when scoring a cluster raises.
        with open(pwm, "w") as out:
            for i, (clus, singles) in enumerate(clusters):
                motifs = [clus] + singles
                # Text mode: str is written, which a binary handle rejects on
                # Python 3.
                with NamedTemporaryFile(mode="w", dir=mytmpdir()) as tmp, \
                        NamedTemporaryFile(dir=mytmpdir()) as tmp2:
                    for m in motifs:
                        tmp.write("%s\n" % m.to_pwm())
                    tmp.flush()
                    auc, mncp = self._roc_metrics(tmp.name, sample_fa, bg_fa, tmp2.name)

                # key= works on Python 2 and 3 (cmp= was removed in Python 3);
                # the sort is stable, so ties resolve as before. Sort once and
                # reuse the result for logging and for picking the winner.
                ranked = sorted(motifs, key=lambda motif: mncp[motif.id])
                for m in ranked:
                    self.logger.debug("sorted: %s %s %s" % (str(m), mncp[m.id], auc[m.id]))

                self.logger.debug("end list")
                best_motif = ranked[-1]
                old_id = best_motif.id
                best_motif.id = "GimmeMotifs_%d" % (i + 1)
                best_id[best_motif.id] = old_id.split("_")[0]
                num_cluster["%s_%s" % (best_motif.id, best_motif.to_consensus())] = len(singles)
                if imgdir:
                    best_motif.to_img(os.path.join(imgdir, best_motif.id), format="PNG")
                out.write("%s\n" % best_motif.to_pwm())
        return num_cluster, best_id
Exemplo n.º 2
0
def match_plot(plotdata, outfile):
    """Plot a list of motifs with their database match and p-value.

    Every entry of ``plotdata`` becomes one row of the figure: the logos of
    the motif and of its database match are stacked on the left, a text
    summary (ids and p-value) is shown on the right.

    :param plotdata: list of (motif, dbmotif, pval)
    :param outfile: file name handed to ``plt.savefig``
    """
    fig_h = 2
    fig_w = 7

    nrows = len(plotdata)
    ncols = 2
    fig = plt.figure(figsize=(fig_w, nrows * fig_h))

    for i, (motif, dbmotif, pval) in enumerate(plotdata):
        text = "Motif: %s\nBest match: %s\np-value: %0.2e" % (motif.id, dbmotif.id, pval)

        grid = ImageGrid(fig, (nrows, ncols, i * 2 + 1), nrows_ncols=(2, 1), axes_pad=0)

        for j in range(2):
            axes_off(grid[j])

        # The logo files are only needed until imshow has received the pixel
        # data, so each one is closed (and thereby deleted) right away
        # instead of waiting for garbage collection.
        with NamedTemporaryFile(dir=mytmpdir(), suffix=".png") as tmp:
            motif.to_img(tmp.name, format="PNG", height=6)
            grid[0].imshow(plt.imread(tmp.name), interpolation="none")
        with NamedTemporaryFile(dir=mytmpdir(), suffix=".png") as tmp:
            dbmotif.to_img(tmp.name, format="PNG")
            grid[1].imshow(plt.imread(tmp.name), interpolation="none")

        ax = plt.subplot(nrows, ncols, i * 2 + 2)
        axes_off(ax)

        ax.text(0, 0.5, text, horizontalalignment="left", verticalalignment="center")

    plt.savefig(outfile, dpi=300, bbox_inches="tight")
    plt.close(fig)
Exemplo n.º 3
0
def _run_tool(job_name, t, fastafile, params):
    """Run tool ``t`` on ``fastafile`` and return ``(job_name, result)``.

    Any exception raised while running the tool is turned into an empty
    result whose third element carries the error message.
    """
    try:
        return job_name, t.run(fastafile, ".", params, mytmpdir())
    except Exception as err:
        failure = "{} failed to run: {}".format(job_name, err)
        return job_name, ([], "", failure)
Exemplo n.º 4
0
def _run_tool(job_name, t, fastafile, params):
    """Execute one tool job; failures are reported in the result, not raised.

    Returns ``(job_name, result)``. On error the result is
    ``([], "", <message>)``.
    """
    try:
        outcome = t.run(fastafile, ".", params, mytmpdir())
    except Exception as exc:
        message = "{} failed to run: {}".format(job_name, exc)
        outcome = ([], "", message)
    return (job_name, outcome)
Exemplo n.º 5
0
def _run_tool(job_name, t, fastafile, params):
    """Run a single motif prediction job (used for parallel prediction).

    Returns ``(job_name, result)``; when the tool raises, the result is an
    empty motif list, an empty string and the error message.
    """
    try:
        return job_name, t.run(fastafile, params, mytmpdir())
    except Exception as err:
        failure = "{} failed to run: {}".format(job_name, err)
        return job_name, ([], "", failure)
Exemplo n.º 6
0
def _run_tool(job_name, t, fastafile, params):
    """Parallel motif prediction worker.

    Runs ``t`` on ``fastafile`` and pairs the outcome with ``job_name``.
    Exceptions are caught and reported as the third element of an otherwise
    empty result.
    """
    try:
        outcome = t.run(fastafile, params, mytmpdir())
    except Exception as exc:
        message = "{} failed to run: {}".format(job_name, exc)
        outcome = ([], "", message)
    return (job_name, outcome)
Exemplo n.º 7
0
    def __init__(self, matchfile, genome="hg19", number=None, size=None):
        """Initialize the Fasta with sequences chosen by ``matched_gc_bedfile``.

        A temporary BED file is filled by ``matched_gc_bedfile``, converted
        to a temporary FASTA file through ``Genome.track2fasta`` and loaded
        with ``Fasta.__init__``. Both temporary files are removed again,
        also when one of the steps fails.

        Parameters
        ----------
        matchfile : str
            Passed to ``matched_gc_bedfile``.
        genome : str, optional
            Genome name, used for both the BED creation and the conversion.
        number : int, optional
            Passed to ``matched_gc_bedfile``.
        size : int, optional
            Passed to ``matched_gc_bedfile`` as ``size``.
        """
        # The file objects are discarded immediately; only the generated
        # names are kept and the helpers below create the actual files.
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

        try:
            # Create bed-file with coordinates of random sequences
            matched_gc_bedfile(tmpbed, matchfile, genome, number, size=size)

            # Convert track to fasta
            Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files, whatever happened above
            for fname in (tmpbed, tmpfasta):
                if os.path.exists(fname):
                    os.remove(fname)
Exemplo n.º 8
0
    def __init__(self, genome, size=None, n=None):
        """Initialize the Fasta with random genomic sequences.

        A temporary BED file is filled by ``create_random_genomic_bedfile``,
        converted to a temporary FASTA file through ``Genome.track2fasta``
        (stranded) and loaded with ``Fasta.__init__``. Both temporary files
        are removed again, also when one of the steps fails.

        Parameters
        ----------
        genome : str
            Genome name.
        size : int
            Converted with ``int()`` and passed to
            ``create_random_genomic_bedfile``.
            NOTE(review): the default ``None`` makes ``int(size)`` raise a
            TypeError, so a value is effectively required.
        n : int, optional
            Passed to ``create_random_genomic_bedfile``.
        """
        size = int(size)

        # The file objects are discarded immediately; only the generated
        # names are kept and the helpers below create the actual files.
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

        try:
            # Create bed-file with coordinates of random sequences
            create_random_genomic_bedfile(tmpbed, genome, size, n)

            # Convert track to fasta
            Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta, stranded=True)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files, whatever happened above
            for fname in (tmpbed, tmpfasta):
                if os.path.exists(fname):
                    os.remove(fname)
Exemplo n.º 9
0
    def _determine_best_motif_in_cluster(self,
                                         clusters,
                                         pwm,
                                         sample_fa,
                                         bg_fa,
                                         imgdir=None):
        """Select one representative motif per cluster and write it to ``pwm``.

        The cluster motif itself is used unless the cluster has more than one
        single motif. In that case all candidates (cluster motif plus
        singles) are written to a temporary file, scored with
        ``self._roc_metrics`` and the candidate with the highest value in the
        second returned mapping (``mncp``) is chosen. The chosen motif is
        renamed to ``GimmeMotifs_<n>`` (1-based cluster number).

        Parameters
        ----------
        clusters : iterable of (motif, list of motifs)
            Cluster motif together with the single motifs it was built from.
        pwm : str
            Name of the output file; the chosen motifs are written to it.
        sample_fa, bg_fa : str
            Passed through to ``self._roc_metrics``.
        imgdir : str, optional
            If given, a PNG of every chosen motif is created in this directory.

        Returns
        -------
        (dict, dict)
            ``num_cluster`` maps ``"<new id>_<consensus>"`` to the number of
            single motifs in the cluster; ``best_id`` maps the new id to the
            part of the old id before the first underscore.
        """
        num_cluster = {}
        best_id = {}
        # Context managers make sure the output and the temporary files are
        # closed even when scoring a cluster raises.
        with open(pwm, "w") as out:
            for i, (clus, singles) in enumerate(clusters):
                best_motif = clus
                if len(singles) > 1:
                    motifs = [clus] + singles
                    # Text mode: str is written, which a binary handle
                    # rejects on Python 3.
                    with NamedTemporaryFile(mode="w", dir=mytmpdir()) as tmp, \
                            NamedTemporaryFile(dir=mytmpdir()) as tmp2:
                        for m in motifs:
                            tmp.write("%s\n" % m.to_pwm())
                        tmp.flush()
                        auc, mncp = self._roc_metrics(tmp.name, sample_fa,
                                                      bg_fa, tmp2.name)

                    # key= works on Python 2 and 3 (cmp= was removed in
                    # Python 3); the sort is stable, so ties resolve as
                    # before. Sort once, reuse for logging and selection.
                    ranked = sorted(motifs, key=lambda motif: mncp[motif.id])
                    for m in ranked:
                        self.logger.debug("sorted: %s %s %s", str(m),
                                          mncp[m.id], auc[m.id])

                    self.logger.debug("end list")

                    best_motif = ranked[-1]
                old_id = best_motif.id
                best_motif.id = "GimmeMotifs_%d" % (i + 1)
                best_id[best_motif.id] = old_id.split("_")[0]
                num_cluster["%s_%s" % (best_motif.id,
                                       best_motif.to_consensus())] = len(singles)
                if imgdir:
                    best_motif.to_img(os.path.join(imgdir, best_motif.id),
                                      format="PNG")
                out.write("%s\n" % best_motif.to_pwm())
        return num_cluster, best_id
Exemplo n.º 10
0
def match_plot(plotdata, outfile):
    """Plot a list of motifs with their database match and p-value.

    Every entry of ``plotdata`` becomes one row of the figure: the logos of
    the motif and of its database match are stacked on the left, a text
    summary (ids and p-value) is shown on the right.

    :param plotdata: list of (motif, dbmotif, pval)
    :param outfile: file name handed to ``plt.savefig``
    """
    import os  # local import: only needed to clean up the temporary logos

    fig_h = 2
    fig_w = 7

    nrows = len(plotdata)
    ncols = 2
    fig = plt.figure(figsize=(fig_w, nrows * fig_h))

    for i, (motif, dbmotif, pval) in enumerate(plotdata):
        text = "Motif: %s\nBest match: %s\np-value: %0.2e" % (
            motif.id,
            dbmotif.id,
            pval,
        )

        grid = ImageGrid(fig, (nrows, ncols, i * 2 + 1),
                         nrows_ncols=(2, 1),
                         axes_pad=0)

        for j in range(2):
            axes_off(grid[j])

        for j, logo_motif in enumerate((motif, dbmotif)):
            tmp = NamedTemporaryFile(dir=mytmpdir(),
                                     suffix=".png",
                                     delete=False)
            # Only the unique name is needed; release the handle so that
            # plot_logo can write the file by name.
            tmp.close()
            try:
                logo_motif.plot_logo(fname=tmp.name, title=False)
                grid[j].imshow(plt.imread(tmp.name), interpolation="none")
            finally:
                # delete=False means nobody else removes the file; the image
                # data has been read by now, so it can go.
                if os.path.exists(tmp.name):
                    os.remove(tmp.name)

        ax = plt.subplot(nrows, ncols, i * 2 + 2)
        axes_off(ax)

        ax.text(0,
                0.5,
                text,
                horizontalalignment="left",
                verticalalignment="center")

    plt.savefig(outfile, dpi=300, bbox_inches="tight")
    plt.close(fig)
Exemplo n.º 11
0
    def __init__(self, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, n=None):
        """Initialize the Fasta with random genomic sequences.

        A temporary BED file is filled by ``create_random_genomic_bedfile``,
        converted to a temporary FASTA file with ``track2fasta`` (strand
        aware) and loaded with ``Fasta.__init__``. Both temporary files are
        removed again, also when one of the steps fails.

        Parameters
        ----------
        index : str, optional
            Genome index directory.
        length : int
            Converted with ``int()`` and passed to
            ``create_random_genomic_bedfile``.
            NOTE(review): the default ``None`` makes ``int(length)`` raise a
            TypeError, so a value is effectively required.
        n : int, optional
            Passed to ``create_random_genomic_bedfile``.
        """
        length = int(length)

        # The file objects are discarded immediately; only the generated
        # names are kept and the helpers below create the actual files.
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

        try:
            # Create bed-file with coordinates of random sequences
            create_random_genomic_bedfile(tmpbed, index, length, n)

            # Convert track to fasta
            track2fasta(index, tmpbed, tmpfasta, use_strand=True)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files, whatever happened above
            for fname in (tmpbed, tmpfasta):
                if os.path.exists(fname):
                    os.remove(fname)
Exemplo n.º 12
0
    def to_img(self, fname, fmt="EPS", add_left=0, seqlogo=None, height=6):
        """Create a sequence logo with seqlogo and save it to a file.

        The motif is turned into ``N = 1000`` sequences whose per-position
        nucleotide counts follow the frequencies in ``self.pwm``; these are
        written to a temporary file that is handed to the seqlogo executable.

        Parameters
        ----------
        fname : str
            Output file name. A trailing ``.<fmt>`` extension is stripped
            before the name is passed to seqlogo.
        fmt : str, optional
            Output format (case-insensitive): EPS, GIF, PDF or PNG. For any
            other value a message is written to stderr and nothing is
            created.
        add_left : int, optional
            Number of positions to add on the left side of the motif.
        seqlogo : str, optional
            Location of the seqlogo executable; defaults to ``self.seqlogo``.
        height : float, optional
            Passed to seqlogo as ``-h``.

        Raises
        ------
        ValueError
            If no seqlogo executable is given or configured.
        """
        if not seqlogo:
            seqlogo = self.seqlogo
        if not seqlogo:
            raise ValueError("seqlogo not specified or configured")

        #TODO: split to_align function

        VALID_FORMATS = ["EPS", "GIF", "PDF", "PNG"]
        N = 1000
        fmt = fmt.upper()
        if fmt not in VALID_FORMATS:
            sys.stderr.write("Invalid motif format\n")
            return

        if fname[-4:].upper() == (".%s" % fmt):
            fname = fname[:-4]

        if add_left == 0:
            seqs = ["" for _ in range(N)]
        else:
            seqs = []
            for nuc in ["A", "C", "T", "G"]:
                seqs += [nuc * add_left for _ in range(N // 4)]

        for row in self.pwm:
            # Cumulative number of sequences that get A, A+C, A+C+G.
            # Rounding keeps float noise (e.g. 0.07 * 1000) from shifting a
            # boundary by one sequence.
            bounds = []
            total = 0
            for freq in row[:3]:
                total += freq * N
                bounds.append(int(round(total)))
            # Strict "<": a nucleotide with cumulative count k gets exactly k
            # sequences (the old "<=" always gave "A" one extra, even at
            # frequency 0). Whatever is left is "T", so weights that do not
            # add up to 1 are absorbed there, as before.
            for i in range(N):
                if i < bounds[0]:
                    seqs[i] += "A"
                elif i < bounds[1]:
                    seqs[i] += "C"
                elif i < bounds[2]:
                    seqs[i] += "G"
                else:
                    seqs[i] += "T"

        # The context manager closes (and removes) the alignment file once
        # seqlogo has finished.
        with NamedTemporaryFile(mode="w", dir=mytmpdir()) as f:
            f.writelines("%s\n" % seq for seq in seqs)
            f.flush()
            # Argument list instead of a shell string: file names that
            # contain spaces or shell metacharacters are passed unchanged.
            cmd = [
                seqlogo,
                "-f", f.name,
                "-F", fmt,
                "-c",
                "-a",
                "-h", str(height),
                "-w", str(len(self) + add_left),
                "-o", fname,
                "-b",
                "-n",
                "-Y",
            ]
            sp.call(cmd)
Exemplo n.º 13
0
    def to_img(self, fname, format="EPS", add_left=0, seqlogo=None, height=6):
        """Create a sequence logo with seqlogo and save it to a file.

        The motif is turned into ``N = 1000`` sequences whose per-position
        nucleotide counts follow the frequencies in ``self.pwm``; these are
        written to a temporary file that is handed to the seqlogo executable.

        Parameters
        ----------
        fname : str
            Output file name. A trailing ``.<format>`` extension is stripped
            before the name is passed to seqlogo.
        format : str, optional
            Output format (case-insensitive): EPS, GIF, PDF or PNG. For any
            other value a message is written to stderr and nothing is
            created.
        add_left : int, optional
            Number of positions to add on the left side of the motif.
        seqlogo : str, optional
            Location of the seqlogo executable; defaults to ``self.seqlogo``.
        height : float, optional
            Passed to seqlogo as ``-h``.

        Raises
        ------
        ValueError
            If no seqlogo executable is given or configured.
        """
        if not seqlogo:
            seqlogo = self.seqlogo
        if not seqlogo:
            raise ValueError("seqlogo not specified or configured")

        #TODO: split to_align function

        VALID_FORMATS = ["EPS", "GIF", "PDF", "PNG"]
        N = 1000
        format = format.upper()
        if format not in VALID_FORMATS:
            sys.stderr.write("Invalid motif format\n")
            return

        if fname[-4:].upper() == (".%s" % format):
            fname = fname[:-4]

        if add_left == 0:
            seqs = ["" for _ in range(N)]
        else:
            seqs = []
            for nuc in ["A", "C", "T", "G"]:
                # Floor division: range() needs an int on Python 3 as well
                seqs += [nuc * add_left for _ in range(N // 4)]

        for row in self.pwm:
            # Cumulative number of sequences that get A, A+C, A+C+G.
            # Rounding keeps float noise (e.g. 0.07 * 1000) from shifting a
            # boundary by one sequence.
            bounds = []
            total = 0
            for freq in row[:3]:
                total += freq * N
                bounds.append(int(round(total)))
            # Strict "<": a nucleotide with cumulative count k gets exactly k
            # sequences (the old "<=" always gave "A" one extra, even at
            # frequency 0). Whatever is left is "T", so weights that do not
            # add up to 1 are absorbed there, as before.
            for i in range(N):
                if i < bounds[0]:
                    seqs[i] += "A"
                elif i < bounds[1]:
                    seqs[i] += "C"
                elif i < bounds[2]:
                    seqs[i] += "G"
                else:
                    seqs[i] += "T"

        # Text mode so that str can be written on Python 2 and 3; the context
        # manager closes (and removes) the file once seqlogo has finished.
        with NamedTemporaryFile(mode="w", dir=mytmpdir()) as f:
            f.writelines("%s\n" % seq for seq in seqs)
            f.flush()
            # Argument list instead of a shell string: file names that
            # contain spaces or shell metacharacters are passed unchanged.
            cmd = [
                seqlogo,
                "-f", f.name,
                "-F", format,
                "-c",
                "-a",
                "-h", str(height),
                "-w", str(len(self) + add_left),
                "-o", fname,
                "-b",
                "-n",
                "-Y",
            ]
            sp.call(cmd)
Exemplo n.º 14
0
    def __init__(self, matchfile, genome="hg19", number=None):
        """Initialize the Fasta with sequences chosen by ``matched_gc_bedfile``.

        A temporary BED file is filled by ``matched_gc_bedfile``, converted
        to a temporary FASTA file with ``track2fasta`` using the index
        directory of ``genome`` and loaded with ``Fasta.__init__``. Both
        temporary files are removed again, also when one of the steps fails.

        Parameters
        ----------
        matchfile : str
            Passed to ``matched_gc_bedfile``.
        genome : str, optional
            Genome name; its index is looked up below the configured index
            directory.
        number : int, optional
            Passed to ``matched_gc_bedfile``.
        """
        config = MotifConfig()
        index = os.path.join(config.get_index_dir(), genome)

        # The file objects are discarded immediately; only the generated
        # names are kept and the helpers below create the actual files.
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

        try:
            # Create bed-file with coordinates of random sequences
            matched_gc_bedfile(tmpbed, matchfile, genome, number)

            # Convert track to fasta
            track2fasta(index, tmpbed, tmpfasta)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files, whatever happened above
            for fname in (tmpbed, tmpfasta):
                if os.path.exists(fname):
                    os.remove(fname)
Exemplo n.º 15
0
def pp_predict_motifs(fastafile, outfile, analysis="small", organism="hg18", single=False, background="", tools={}, job_server="", ncpus=8, logger=None, max_time=None, fg_file=None, bg_file=None):
    """Submit motif prediction jobs for all selected tools and wait for them.

    Every tool class found in ``tool_classes`` that is switched on in
    ``tools`` is submitted to the job server; tools with ``use_width`` are
    submitted once per motif width. Finished jobs report to a
    ``PredictionResult`` through its ``add_motifs`` callback. The function
    then waits until all jobs have reported, ``max_time`` seconds have passed
    or the user interrupts; in the last two cases the job server is destroyed.

    Parameters
    ----------
    fastafile : str
        Input FASTA file handed to every tool.
    outfile : str
        Passed to ``PredictionResult``.
    analysis : str, optional
        One of "xs", "small", "medium", "large", "xl"; determines the range
        of motif widths. "xs" is passed to the tools as "small".
    organism, single, background
        Stored in the parameter dictionary handed to every tool.
    tools : dict, optional
        Maps tool name to a true value for tools that should run. When empty
        the configured default tools are used. The default dict is never
        modified here.
    job_server : optional
        Existing job server; a ``pp.Server`` with ``ncpus`` workers is
        created when not given.
    ncpus : int, optional
        Number of workers for a newly created job server.
    logger : logging.Logger, optional
    max_time : float, optional
        Maximum waiting time in seconds; no limit when not set.
    fg_file, bg_file
        Passed to ``PredictionResult``.
    """
    from time import sleep  # local import: only used by the wait loop below

    config = MotifConfig()

    if not tools:
        # get_default_params is a method and has to be called before the
        # "tools" entry can be looked up.
        default_tools = config.get_default_params()["tools"]
        tools = dict((name, 1) for name in default_tools.split(","))

    if not logger:
        logger = logging.getLogger('prediction.pp_predict_motifs')

    wmin = 5
    step = 1
    if analysis in ["large", "xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs": 5, "small": 8, "medium": 10, "large": 14, "xl": 20}
    wmax = analysis_max[analysis]

    # wmax has already been taken from "xs"; only the name given to the tools
    # is changed.
    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    if not job_server:
        job_server = pp.Server(ncpus, secret='pumpkinrisotto')

    jobs = {}

    result = PredictionResult(outfile, logger=logger, fg_file=fg_file, bg_file=bg_file, job_server=job_server)

    # Dynamically load all tools: one instance of every MotifProgram subclass
    toolio = [
        member()
        for name, member in inspect.getmembers(
            tool_classes,
            lambda x: inspect.isclass(x) and issubclass(x, tool_classes.MotifProgram))
        if name != 'MotifProgram'
    ]

    # TODO:
    # Add warnings for running time: Weeder, GADEM

    ### Add all jobs to the job_server ###
    params = {'analysis': analysis, 'background': background, "single": single, "organism": organism}

    def submit(tool, job_name):
        """Submit one run of ``tool`` and register it under ``job_name``."""
        jobs[job_name] = job_server.submit(
            tool.run,
            (fastafile, ".", params, mytmpdir()),
            (tool_classes.MotifProgram,),
            ("gimmemotifs.config",),
            result.add_motifs,
            (job_name,))

    for t in toolio:
        if not tools.get(t.name):
            logger.debug("Skipping %s", t.name)
            continue

        if t.use_width:
            for i in range(wmin, wmax + 1, step):
                logger.debug("Starting %s job, width %s", t.name, i)
                params['width'] = i
                submit(t, "%s_width_%s" % (t.name, i))
        else:
            logger.debug("Starting %s job", t.name)
            submit(t, t.name)

    ### Wait until all jobs are finished or the time runs out ###
    start_time = time()
    try:
        # Run until all jobs are finished. Sleeping between the checks keeps
        # this loop from burning a full CPU core while the tools are running.
        while len(result.finished) < len(jobs) and (not max_time or time() - start_time < max_time):
            sleep(0.1)
        if len(result.finished) < len(jobs):
            logger.info("Maximum allowed running time reached, destroying remaining jobs")
            job_server.destroy()
            result.get_remaining_stats()
    ### Or the user gets impatient... ###
    except KeyboardInterrupt:
        # Destroy all running jobs
        logger.info("Caught interrupt, destroying all running jobs")
        job_server.destroy()
        result.get_remaining_stats()
Exemplo n.º 16
0
    def run_full_analysis(self, inputfile, user_params=None):
        """Run the full analysis: from input file to motifs and report.

        The steps are: validate the input (FASTA or BED) and the requested
        backgrounds, prepare prediction/validation files, create the
        backgrounds, predict motifs, write statistics and ranks, select the
        significant motifs, cluster them, pick the best motif per cluster and
        create ROC plots, location plots and the reports.

        Parameters
        ----------
        inputfile : str
            Input file. It is treated as FASTA when ``Fasta(inputfile)``
            succeeds, otherwise as BED.
        user_params : dict, optional
            Overrides for the default parameters obtained from
            ``self.config.get_default_params()``.

        Returns
        -------
        The value of ``self.motif_report`` when the analysis completes, or
        ``None`` when no significant motifs are found.

        Notes
        -----
        The process is terminated with ``sys.exit`` when the genome index is
        missing, no valid background remains, the input type is unknown or no
        motifs are predicted.
        """
        self.logger.info("starting full motif analysis")
        self.logger.debug("Using temporary directory {0}".format(mytmpdir()))

        if user_params is None:
            user_params = {}
        params = self.config.get_default_params()
        params.update(user_params)

        # Both modules provide pp_predict_motifs and PredictionResult; which
        # one is used depends on the "torque" parameter.
        if params["torque"]:
            from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
            self.logger.debug("Using torque")
        else:
            from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
            self.logger.debug("Using multiprocessing")

        self.params = params
        #self.weird = params["weird_option"]

        # Comma-separated list of background types
        background = [x.strip() for x in params["background"].split(",")]

        self.logger.debug("Parameters:")
        for param, value in params.items():
            self.logger.debug("  %s: %s", param, value)

        # Checking input
        self.input_type = "BED"
        # If we can load it as fasta then it is a fasta, yeh?
        try:
            Fasta(inputfile)
            self.logger.debug("Inputfile is a FASTA file")
            self.input_type = "FASTA"
        except Exception:
            # Leave it to BED
            pass

        index_msg = ("No index found for genome {}! "
                     "Has GimmeMotifs been configured correctly and is the "
                     "genome indexed?").format(params["genome"])
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])

        if self.input_type == "FASTA":
            # For FASTA input the genome index is only required when the
            # "genomic" background is requested.
            for bg in background:
                if not bg in FA_VALID_BGS:
                    self.logger.info(
                        "Input type is FASTA, can't use background type '%s'",
                        bg)
                if bg == "genomic":
                    if not os.path.exists(index_dir):
                        self.logger.error(index_msg)
                        sys.exit(1)
            background = [bg for bg in background if bg in FA_VALID_BGS]

        elif self.input_type == "BED":
            # Does the index_dir exist?  #bed-specific
            if not os.path.exists(index_dir):
                self.logger.error(index_msg)
                sys.exit(1)

            # is it a valid bed-file etc.
            self._check_input(inputfile)  # bed-specific

            # Check for valid background
            for bg in background:
                if not bg in BED_VALID_BGS:
                    self.logger.info(
                        "Input type is BED, can't use background type '%s'",
                        bg)
            background = [bg for bg in background if bg in BED_VALID_BGS]

        if len(background) == 0:
            self.logger.error("No valid backgrounds specified!")
            sys.exit(1)

        self.max_time = None
        max_time = None
        # Maximum time?
        # The parameter is given in hours and stored in seconds.
        if params["max_time"]:
            try:
                max_time = float(params["max_time"])
            except Exception:
                self.logger.debug(
                    "Could not parse max_time value, setting to no limit")
                self.max_time = None

            # NOTE(review): after a failed parse max_time is still None here;
            # "None > 0" is False on Python 2 but raises TypeError on
            # Python 3 - confirm the targeted interpreter.
            if max_time > 0:
                self.logger.debug(
                    "Time limit for motif prediction: %0.2f hours" % max_time)
                max_time = 3600 * max_time
                self.max_time = max_time
                self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
            else:
                self.logger.debug(
                    "Invalid time limit for motif prediction, setting to no limit"
                )
                self.max_time = None
        else:
            self.logger.debug("No time limit for motif prediction")

        if "random" in background:
            self.markov_model = params["markov_model"]

        # Create the necessary files for motif prediction and validation
        if self.input_type == "BED":
            self.prepare_input_bed(inputfile, params["genome"],
                                   params["width"], params["fraction"],
                                   params["abs_max"], params["use_strand"])

            # Create file for location plots
            index_dir = os.path.join(self.config.get_index_dir(),
                                     params["genome"])
            lwidth = int(params["lwidth"])
            width = int(params["width"])
            # NOTE(review): "/" is integer division on Python 2 only; on
            # Python 3 extend becomes a float - confirm what track2fasta
            # expects.
            extend = (lwidth - width) / 2
            genome_index.track2fasta(index_dir,
                                     self.validation_bed,
                                     self.location_fa,
                                     extend_up=extend,
                                     extend_down=extend,
                                     use_strand=params["use_strand"],
                                     ignore_missing=True)

        elif self.input_type == "FASTA":
            self.prepare_input_fa(inputfile, params["width"],
                                  params["fraction"], params["abs_max"])

            # File for location plots
            self.location_fa = self.validation_fa
            fa = Fasta(self.location_fa)
            seqs = fa.seqs
            # The length of the first sequence is used as width for the
            # location plots.
            lwidth = len(seqs[0])
            all_same_width = not (False
                                  in [len(seq) == lwidth for seq in seqs])
            if not all_same_width:
                self.logger.warn(
                    "PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!"
                )

        else:
            self.logger.error("Unknown input type, shouldn't happen")
            sys.exit(1)

        # Map every available tool name to True/False: is it listed in
        # params["tools"]?
        # NOTE(review): the membership test uses the unstripped name x while
        # the key is x.strip() - looks unintended when the list contains
        # spaces; confirm.
        tools = dict([(x.strip(), x
                       in [y.strip() for y in params["tools"].split(",")])
                      for x in params["available_tools"].split(",")])

        self.create_background(background, params["genome"], params["width"])

        # Predict the motifs
        analysis = params["analysis"]
        """ Predict motifs, input is a FASTA-file"""
        self.logger.info("starting motif prediction (%s)", analysis)
        self.logger.info("tools: %s",
                         ", ".join([x for x in tools.keys() if tools[x]]))

        # The background with the lowest BG_RANK is used for the significance
        # calculations (Python 2 style sorted() with a cmp function).
        bg_file = self.bg_file["fa"][sorted(
            background, lambda x, y: cmp(BG_RANK[x], BG_RANK[y]))[0]]
        self.logger.debug("Using bg_file %s for significance" % bg_file)
        result = pp_predict_motifs(self.prediction_fa,
                                   self.predicted_pfm,
                                   analysis,
                                   params["genome"],
                                   params["use_strand"],
                                   self.prediction_bg,
                                   tools,
                                   self.job_server(),
                                   logger=self.logger,
                                   max_time=self.max_time,
                                   fg_file=self.validation_fa,
                                   bg_file=bg_file)

        motifs = result.motifs
        self.logger.info("predicted %s motifs", len(motifs))
        self.logger.debug("written to %s", self.predicted_pfm)

        if len(motifs) == 0:
            self.logger.info("no motifs found")
            sys.exit()

        # Write stats output to file
        f = open(self.stats_file, "w")
        # Column names are taken from the first stats entry (indexing
        # .values() requires Python 2).
        stat_keys = result.stats.values()[0].keys()
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))

        self.logger.debug(result.stats)

        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write(
                    "%s\t%s\n" %
                    (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
            else:
                self.logger.error(
                    "No stats for motif {0}, skipping this motif!".format(
                        motif.id))
                # NOTE(review): removing from the list that is being iterated
                # makes the loop skip the element that follows - confirm.
                motifs.remove(motif)
        f.close()

        self.motifs_with_stats = motifs

        # Ranks file: for every metric the best value per tool and the rank
        # of each tool. The tool name is the part of the motif id before the
        # first underscore.
        f = open(self.ranks_file, "w")
        tools = dict((m.id.split("_")[0], 1) for m in motifs).keys()
        f.write("Metric\tType\t%s\n" % ("\t".join(tools)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" %
                                   (motif.id, motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = best_motif.keys()
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            # Reorder values and ranks to the column order of the header
            ind = [names.index(x) for x in tools]

            f.write("%s\t%s\t%s\n" %
                    (stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" %
                    (stat, "rank", "\t".join([str(rank[i]) for i in ind])))
        f.close()

        #self.logger.debug("RANK: %s" % stat)
        #self.logger.debug("\t".join([str(x) for x in names]))
        #self.logger.debug("\t".join([str(x) for x in vals]))
        #self.logger.debug("\t".join([str(x) for x in rank]))

        # Determine significant motifs
        # A motif is significant when maxenr >= 3, roc_auc >= 0.55 and
        # enr_fdr >= 2.
        nsig = 0
        f = open(self.significant_pfm, "w")
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55 and stats[
                    'enr_fdr'] >= 2:
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
        f.close()
        self.logger.info("%s motifs are significant", nsig)
        self.logger.debug("written to %s", self.significant_pfm)

        if nsig == 0:
            self.logger.info("no significant motifs found")
            return

        # ROC metrics of significant motifs
        for bg in background:
            self._roc_metrics(self.significant_pfm, self.validation_fa,
                              self.bg_file["fa"][bg], self.bg_file["roc"][bg])

        # Cluster significant motifs
        clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm,
                                        self.outdir,
                                        params["cluster_threshold"])

        # Determine best motif in cluster

        num_cluster, best_id = self._determine_best_motif_in_cluster(
            clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

        ### Enable parallel and modular evaluation of results
        # Scan (multiple) files with motifs
        # Define callback functions once scanning is finished:
        #    - ROC plot
        #     - Statistics
        #    - Location plots (histogram)
        #     -

        # Stars
        # Statistics for the final motifs are collected by a second
        # PredictionResult.
        tmp = NamedTemporaryFile(dir=mytmpdir()).name
        p = PredictionResult(tmp,
                             logger=self.logger,
                             job_server=self.server,
                             fg_file=self.validation_fa,
                             bg_file=bg_file,
                             do_counter=False)
        p.add_motifs(
            ("clustering", (read_motifs(open(self.final_pwm)), "", "")))
        # Poll until there is a stats entry for every motif
        while len(p.stats.keys()) < len(p.motifs):
            sleep(5)

        #print "p.stats"
        #print p.stats
        #print "num_cluster"
        #print num_cluster
        for mid, num in num_cluster.items():
            p.stats[mid]["numcluster"] = num

        # Three threshold values per statistic, passed to star() below
        all_stats = {
            "mncp": [2, 5, 8],
            "roc_auc": [0.6, 0.75, 0.9],
            "maxenr": [10, 20, 30],
            "enr_fdr": [4, 8, 12],
            "fraction": [0.4, 0.6, 0.8],
            "ks_sig": [4, 7, 10],
            "numcluster": [3, 6, 9],
        }

        self.logger.info("creating report")

        # ROC plots
        for bg in background:
            self.create_roc_plots(self.final_pwm, self.validation_fa,
                                  self.bg_file["fa"][bg], bg)

        # Location plots
        self.logger.debug("Creating localization plots")
        motifs = read_motifs(open(self.final_pwm), fmt="pwm")
        for motif in motifs:
            m = "%s_%s" % (motif.id, motif.to_consensus())
            s = p.stats[m]
            outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
            motif_localization(self.location_fa,
                               motif,
                               lwidth,
                               outfile,
                               cutoff=s["cutoff_fdr"])

            # Mean of the per-statistic star() values, rounded half up
            s["stars"] = int(
                mean([star(s[x], all_stats[x])
                      for x in all_stats.keys()]) + 0.5)
            self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

        # Calculate enrichment of final, clustered motifs
        self.calculate_cluster_enrichment(self.final_pwm, background)

        # Create report
        self.print_params()
        self._calc_report_values(self.final_pwm, background)
        self._create_report(self.final_pwm,
                            background,
                            stats=p.stats,
                            best_id=best_id)
        self._create_text_report(self.final_pwm, background)

        self.logger.info("finished")
        self.logger.info("output dir: %s", os.path.split(self.motif_report)[0])
        self.logger.info("report: %s", os.path.split(self.motif_report)[-1])
        #self.logger.info("Open %s in your browser to see your results." % (self.motif_report))

        if not (params["keep_intermediate"]):

            self.logger.debug(
                "Deleting intermediate files. Please specifify the -k option if you want to keep these files."
            )
            shutil.rmtree(self.tmpdir)

        self.logger.debug("Done")

        return self.motif_report
Exemplo n.º 17
0
    def to_img(self, fname, fmt="PNG", add_left=0, seqlogo=None, height=6):
        """Create a sequence logo using seqlogo.

        Create a sequence logo and save it to a file. Valid formats are: PNG,
        EPS, GIF and PDF.

        The motif is expanded into a pseudo-alignment of 1000 sequences in
        which the number of A/C/G/T at every position is proportional to the
        motif frequencies. This alignment is written to a temporary file that
        is handed to the seqlogo executable.

        Parameters
        ----------
        fname : str
            Output filename. A trailing extension equal to ``fmt`` is removed
            before the name is passed to seqlogo.
        fmt : str , optional
            Output format (case-insensitive). Valid formats are PNG, EPS, GIF
            and PDF. For any other value a message is written to stderr and
            nothing is created.
        add_left : int , optional
            Pad motif with empty positions on the left side.
        seqlogo : str
            Location of the seqlogo executable. By default the seqlogo version
            that is included with GimmeMotifs is used.
        height : float
            Height of the image

        Raises
        ------
        ValueError
            If no seqlogo executable is given and none is configured.
        """
        if not seqlogo:
            seqlogo = self.seqlogo
        if not seqlogo:
            raise ValueError("seqlogo not specified or configured")

        #TODO: split to_align function

        VALID_FORMATS = ["EPS", "GIF", "PDF", "PNG"]
        N = 1000
        fmt = fmt.upper()
        if fmt not in VALID_FORMATS:
            sys.stderr.write("Invalid motif format\n")
            return

        if fname[-4:].upper() == (".%s" % fmt):
            fname = fname[:-4]

        if add_left == 0:
            seqs = ["" for _ in range(N)]
        else:
            # Equal numbers of every nucleotide: the padding columns carry no
            # information.
            seqs = []
            for nuc in ["A", "C", "T", "G"]:
                seqs += [nuc * add_left for _ in range(N // 4)]

        for row in self.pwm:
            # Cumulative sequence counts that mark where A, C and G end.
            bounds = [row[0] * N]
            for i in range(1, 4):
                bounds.append(bounds[i - 1] + row[i] * N)
            # Rows that do not add up to exactly 1 are absorbed by the last
            # nucleotide: every remaining sequence receives a T.
            for i in range(N):
                # Strict comparison: sequence indices start at 0, so a
                # nucleotide with count k covers exactly k sequences.
                if i < bounds[0]:
                    seqs[i] += "A"
                elif i < bounds[1]:
                    seqs[i] += "C"
                elif i < bounds[2]:
                    seqs[i] += "G"
                else:
                    seqs[i] += "T"

        # The context manager removes the temporary alignment as soon as
        # seqlogo has finished.
        with NamedTemporaryFile(mode="w", dir=mytmpdir()) as f:
            for seq in seqs:
                f.write("%s\n" % seq)
            f.flush()
            # Argument list instead of a shell string: file names containing
            # spaces or shell metacharacters are passed through unchanged.
            cmd = [
                seqlogo,
                "-f", f.name,
                "-F", fmt,
                "-c",
                "-a",
                "-h", str(height),
                "-w", str(len(self) + add_left),
                "-o", fname,
                "-b",
                "-n",
                "-Y",
            ]
            sp.call(cmd)
Exemplo n.º 18
0
    def run_full_analysis(self, inputfile, user_params=None):
        """Run the full analysis: from input file to motifs and reports.

        The pipeline prepares the input, creates the backgrounds, predicts
        motifs, filters them for significance, clusters them and creates the
        ROC plots, location plots and the html and text reports.

        Parameters
        ----------
        inputfile : str
            Input file. It is treated as FASTA if it can be loaded with
            ``Fasta``, otherwise it is treated as BED.
        user_params : dict, optional
            Parameters that override the defaults returned by
            ``self.config.get_default_params()``.

        Notes
        -----
        The process is terminated with ``sys.exit`` when the genome index is
        missing (BED input), when no valid background is left, when no motifs
        are predicted or when no motif passes the significance filter.
        """
        self.logger.info("Starting full motif analysis")
        self.logger.info("Using temporary directory {0}".format(mytmpdir()))

        if user_params is None:
            user_params = {}
        params = self.config.get_default_params()
        params.update(user_params)

        if params["torque"]:
            from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
            self.logger.info("Using torque")
        else:
            from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
            self.logger.info("Using multiprocessing")

        self.params = params
        #self.weird = params["weird_option"]

        background = [x.strip() for x in params["background"].split(",")]

        self.logger.info("Parameters:")
        for param, value in params.items():
            self.logger.info("  %s: %s" % (param, value))

        # Checking input
        self.input_type = "BED"
        # If we can load it as fasta then it is a fasta, yeh?
        try:
            Fasta(inputfile)
        except Exception:
            # Leave it to BED
            pass
        else:
            self.logger.info("Inputfile is a FASTA file")
            self.input_type = "FASTA"

        if self.input_type == "FASTA":
            for bg in background:
                if bg not in FA_VALID_BGS:
                    self.logger.info("Input type is FASTA, can't use background type '%s'" % bg)
            background = [bg for bg in background if bg in FA_VALID_BGS]

        elif self.input_type == "BED":
            # Does the index_dir exist?  #bed-specific
            index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
            if not os.path.exists(index_dir):
                self.logger.error("No index found for genome %s! Has GimmeMotifs been configured correctly and is the genome indexed?" % params["genome"])
                sys.exit(1)

            # is it a valid bed-file etc.
            self._check_input(inputfile)    # bed-specific

            # Check for valid background
            for bg in background:
                if bg not in BED_VALID_BGS:
                    self.logger.info("Input type is BED, can't use background type '%s'" % bg)
            background = [bg for bg in background if bg in BED_VALID_BGS]

        if not background:
            self.logger.error("No valid backgrounds specified!")
            sys.exit(1)

        # Maximum time? The parameter is given in hours, self.max_time is
        # stored in seconds (None means no limit).
        self.max_time = None
        if params["max_time"]:
            try:
                max_time = float(params["max_time"])
            except (TypeError, ValueError):
                self.logger.info("Could not parse max_time value, setting to no limit")
            else:
                if max_time > 0:
                    self.logger.info("Time limit for motif prediction: %0.2f hours" % max_time)
                    self.max_time = 3600 * max_time
                    self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
                else:
                    self.logger.info("Invalid time limit for motif prediction, setting to no limit")
        else:
            self.logger.info("No time limit for motif prediction")

        if "random" in background:
            self.markov_model = params["markov_model"]

        # Create the necessary files for motif prediction and validation
        if self.input_type == "BED":
            self.prepare_input_bed(inputfile, params["genome"], params["width"], params["fraction"], params["abs_max"], params["use_strand"])

            # Create file for location plots
            index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
            lwidth = int(params["lwidth"])
            width = int(params["width"])
            # Floor division keeps the extension an integer on Python 2 and 3.
            extend = (lwidth - width) // 2
            genome_index.track2fasta(index_dir, self.validation_bed, self.location_fa, extend_up=extend, extend_down=extend, use_strand=params["use_strand"], ignore_missing=True)

        elif self.input_type == "FASTA":
            self.prepare_input_fa(inputfile, params["width"], params["fraction"], params["abs_max"])

            # File for location plots
            self.location_fa = self.validation_fa
            fa = Fasta(self.location_fa)
            seqs = fa.seqs
            lwidth = len(seqs[0])
            if not all(len(seq) == lwidth for seq in seqs):
                self.logger.warning("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")

        else:
            self.logger.error("Unknown input type, shouldn't happen")
            sys.exit(1)

        # Map every available tool to a flag that tells whether it was
        # selected. Names are stripped on both sides of the comparison, so
        # whitespace after the commas does not disable a tool.
        selected_tools = set(t.strip() for t in params["tools"].split(","))
        tools = dict(
            (t.strip(), t.strip() in selected_tools)
            for t in params["available_tools"].split(",")
        )

        self.create_background(background, params["genome"], params["width"])

        # Predict the motifs, input is a FASTA-file
        analysis = params["analysis"]
        self.logger.info("Starting motif prediction (%s) using %s" %
            (analysis, ", ".join([x for x in tools.keys() if tools[x]])))

        # The background with the lowest BG_RANK is used for significance.
        bg_file = self.bg_file["fa"][min(background, key=lambda x: BG_RANK[x])]
        self.logger.info("Using bg_file %s for significance" % bg_file)
        result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm, analysis, params["genome"], params["use_strand"], self.prediction_bg, tools, self.job_server(), logger=self.logger, max_time=self.max_time, fg_file=self.validation_fa, bg_file=bg_file)

        motifs = result.motifs
        self.logger.info("Predicted %s motifs, written to %s" % (len(motifs), self.predicted_pfm))

        if len(motifs) == 0:
            self.logger.info("No motifs found. Done.")
            sys.exit()

        # Write stats output to file
        self.logger.debug("Motif statistics: %s", result.stats)
        stat_keys = list(list(result.stats.values())[0].keys())
        with open(self.stats_file, "w") as f:
            f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))
            # Iterate over a copy: motifs without statistics are removed from
            # the list itself, which would otherwise skip the next motif.
            for motif in list(motifs):
                stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
                if stats:
                    f.write("%s\t%s\n" % (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
                else:
                    self.logger.error("No stats for motif {0}, skipping this motif!".format(motif.id))
                    motifs.remove(motif)

        self.motifs_with_stats = motifs

        # Per metric, write the best value and its rank for every tool. The
        # tool name is the part of the motif id before the first underscore.
        tool_names = list(dict((m.id.split("_")[0], 1) for m in motifs).keys())
        with open(self.ranks_file, "w") as f:
            f.write("Metric\tType\t%s\n" % ("\t".join(tool_names)))
            for stat in ["mncp", "roc_auc", "maxenr"]:
                best_motif = {}
                for motif in self.motifs_with_stats:
                    val = result.stats["%s_%s" % (motif.id, motif.to_consensus())][stat]
                    name = motif.id.split("_")[0]
                    if val > best_motif.setdefault(name, 0):
                        best_motif[name] = val
                names = list(best_motif.keys())
                vals = [best_motif[name] for name in names]
                rank = rankdata(vals)
                ind = [names.index(x) for x in tool_names]

                f.write("%s\t%s\t%s\n" % (stat, "value", "\t".join([str(vals[i]) for i in ind])))
                f.write("%s\t%s\t%s\n" % (stat, "rank", "\t".join([str(rank[i]) for i in ind])))

        # Determine significant motifs
        nsig = 0
        with open(self.significant_pfm, "w") as f:
            for motif in motifs:
                stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
                if stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55 and stats['enr_fdr'] >= 2:
                    f.write("%s\n" % motif.to_pfm())
                    nsig += 1
        self.logger.info("%s motifs are significant, written to %s" % (nsig, self.significant_pfm))

        if nsig == 0:
            self.logger.info("No significant motifs found. Done.")
            sys.exit()

        # ROC metrics of significant motifs
        for bg in background:
            self._roc_metrics(self.significant_pfm, self.validation_fa, self.bg_file["fa"][bg], self.bg_file["roc"][bg])

        # Cluster significant motifs
        clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm, self.outdir, params["cluster_threshold"])

        # Determine best motif in cluster
        num_cluster, best_id = self._determine_best_motif_in_cluster(clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

        ### Enable parallel and modular evaluation of results
        # Scan (multiple) files with motifs
        # Define callback functions once scanning is finished:
        #    - ROC plot
        #     - Statistics
        #    - Location plots (histogram)
        #     -

        # Stars
        tmp = NamedTemporaryFile(dir=mytmpdir()).name
        p = PredictionResult(tmp, logger=self.logger, job_server=self.server, fg_file=self.validation_fa, bg_file=bg_file)
        p.add_motifs(("Clustering", (pwmfile_to_motifs(self.final_pwm), "", "")))
        # Wait until statistics are available for all clustered motifs.
        while len(p.stats.keys()) < len(p.motifs):
            sleep(5)

        for mid, num in num_cluster.items():
            p.stats[mid]["numcluster"] = num

        # Per statistic, the three values that are passed to star() together
        # with the value of the motif.
        all_stats = {
            "mncp": [2, 5, 8],
            "roc_auc": [0.6, 0.75, 0.9],
            "maxenr": [10, 20, 30],
            "enr_fdr": [4, 8, 12],
            "fraction": [0.4, 0.6, 0.8],
            "ks_sig": [4, 7, 10],
            "numcluster": [3, 6, 9],
        }

        # ROC plots
        for bg in background:
            self.create_roc_plots(self.final_pwm, self.validation_fa, self.bg_file["fa"][bg], bg)

        # Location plots
        self.logger.info("Creating localization plots")
        motifs = pwmfile_to_motifs(self.final_pwm)
        for motif in motifs:
            m = "%s_%s" % (motif.id, motif.to_consensus())
            s = p.stats[m]
            outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
            motif_localization(self.location_fa, motif, lwidth, outfile, cutoff=s["cutoff_fdr"])

            # Mean star score over all statistics, rounded half up.
            s["stars"] = int(mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
            self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

        # Calculate enrichment of final, clustered motifs
        self.calculate_cluster_enrichment(self.final_pwm, background)

        # Create report
        self.print_params()
        self._calc_report_values(self.final_pwm, background)
        self._create_report(self.final_pwm, background, stats=p.stats, best_id=best_id)
        self._create_text_report(self.final_pwm, background)
        self.logger.info("Open %s in your browser to see your results." % (self.motif_report))

        if not params["keep_intermediate"]:
            self.logger.info("Deleting intermediate files. Please specifify the -k option if you want to keep these files.")
            shutil.rmtree(self.tmpdir)

        self.logger.info("Done")
Exemplo n.º 19
0
def gimme_motifs(inputfile, outdir, params=None, filter_significant=True, cluster=True, create_report=True):
    """De novo motif prediction based on an ensemble of different tools.

    Parameters
    ----------
    inputfile : str
        Filename of input. Can be either BED, narrowPeak or FASTA.

    outdir : str
        Name of output directory. If None, a directory named
        ``gimmemotifs_<dd_mm_YYYY>`` (current date) is used.

    params : dict, optional
        Optional parameters.

    filter_significant : bool, optional
        Filter motifs for significance using the validation set.

    cluster : bool, optional
        Cluster similar predicted (and significant) motifs.

    create_report : bool, optional
        Create output reports (both .txt and .html).

    Returns
    -------
    motifs : list
        List of predicted motifs. The list is empty when no motifs are
        predicted or when none of the predicted motifs is significant.

    Examples
    --------

    >>> from gimmemotifs.denovo import gimme_motifs
    >>> gimme_motifs("input.fa", "motifs.out")
    """
    if outdir is None:
        outdir = "gimmemotifs_{}".format(datetime.date.today().strftime("%d_%m_%Y"))

    # Create output directories
    tmpdir = os.path.join(outdir, "intermediate")
    for d in [outdir, tmpdir]:
        if not os.path.exists(d):
            os.mkdir(d)

    # setup logfile
    logger = logging.getLogger("gimme")
    # Log to file
    logfile = os.path.join(outdir, "gimmemotifs.log")
    fh = logging.FileHandler(logfile, "w")
    fh.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(file_formatter)
    logger.addHandler(fh)
    logger = logging.getLogger("gimme.denovo.gimme_motifs")

    # Initialize parameters
    params = parse_denovo_params(params)

    # Check the input files
    input_type, background = check_denovo_input(inputfile, params)

    logger.info("starting full motif analysis")
    logger.debug("Using temporary directory %s", mytmpdir())

    # Create the necessary files for motif prediction and validation
    if input_type == "bed":
        prepare_denovo_input_bed(inputfile, params, tmpdir)
    elif input_type == "narrowpeak":
        prepare_denovo_input_narrowpeak(inputfile, params, tmpdir)
    elif input_type == "fasta":
        prepare_denovo_input_fa(inputfile, params, tmpdir)
    else:
        logger.error("Unknown input file.")
        sys.exit(1)

    # Create the background FASTA files
    background = create_backgrounds(
            tmpdir,
            background,
            params.get("genome", None),
            params["width"],
            params.get("custom_background", None)
            )

    # Predict de novo motifs
    result = predict_motifs(
            os.path.join(tmpdir, "prediction.fa"),
            os.path.join(tmpdir, "prediction.bg.fa"),
            os.path.join(tmpdir, "all_motifs.pfm"),
            params=params,
            stats_fg=os.path.join(tmpdir, 'validation.fa'),
            stats_bg=background,
            )

    if len(result.motifs) == 0:
        logger.info("finished")
        return []

    # Write statistics; the "{}" placeholder in the name is left for
    # write_stats to fill in.
    stats_file = os.path.join(tmpdir, "stats.{}.txt")
    write_stats(result.stats, stats_file)

    # The background with the lowest BG_RANK is used for significance.
    bg = min(background, key=lambda x: BG_RANK[x])
    if filter_significant:
        pwmfile = os.path.join(tmpdir, "significant_motifs.pfm")
        motifs = filter_significant_motifs(pwmfile, result, bg)
        if len(motifs) == 0:
            logger.info("no significant motifs")
            # Return an empty list, like the "no motifs predicted" exit
            # above, so that callers always receive a list.
            return []
    else:
        logger.info("not filtering for significance")
        motifs = result.motifs
        pwmfile = os.path.join(tmpdir, "all_motifs.pfm")

    if cluster:
        clustered_pfm = os.path.join(tmpdir, "clustered_motifs.pfm")
        clusters = cluster_motifs_with_report(
                    pwmfile,
                    clustered_pfm,
                    outdir,
                    0.95,
                    title=inputfile)

        # Determine best motif in cluster
        best_motifs = best_motif_in_cluster(
                pwmfile,
                clustered_pfm,
                clusters,
                os.path.join(tmpdir, 'validation.fa'),
                background,
                result.stats)

        final_motifs, stats = rename_motifs(best_motifs, result.stats)
    else:
        logger.info("not clustering")
        rank = rank_motifs(result.stats)
        sorted_motifs = sorted(motifs, key=lambda x: rank[str(x)], reverse=True)
        final_motifs, stats = rename_motifs(sorted_motifs, result.stats)

    final_pwm = os.path.join(outdir, "motifs.pwm")
    with open(final_pwm, "w") as f:
        for m in final_motifs:
            f.write("{}\n".format(m.to_pwm()))

    if create_report:
        # Background name -> background FASTA file in the intermediate dir.
        bg = {b: os.path.join(tmpdir, "bg.{}.fa".format(b)) for b in background}

        create_denovo_motif_report(
                inputfile,
                final_pwm,
                os.path.join(tmpdir, "validation.fa"),
                bg,
                os.path.join(tmpdir, "localization.fa"),
                outdir,
                params,
                stats,
                )

    with open(os.path.join(outdir, "params.txt"), "w") as f:
        for k, v in params.items():
            f.write("{}\t{}\n".format(k, v))

    if not params.get("keep_intermediate"):
        logger.debug(
            "Deleting intermediate files. "
            "Please specifify the -k option if you want to keep these files.")
        shutil.rmtree(tmpdir)

    logger.info("finished")
    logger.info("output dir: %s", outdir)
    # Only point to the report when it was actually requested above.
    if create_report:
        logger.info("report: %s", os.path.join(outdir, "motif_report.html"))

    return final_motifs
Exemplo n.º 20
0
def gimme_motifs(
    inputfile,
    outdir,
    params=None,
    filter_significant=True,
    cluster=True,
    create_report=True,
):
    """De novo motif prediction based on an ensemble of different tools.

    Parameters
    ----------
    inputfile : str
        Filename of input. Can be either BED, narrowPeak or FASTA.

    outdir : str
        Name of output directory. If None, a directory named
        ``gimmemotifs_<dd_mm_YYYY>`` (current date) is used.

    params : dict, optional
        Optional parameters.

    filter_significant : bool, optional
        Filter motifs for significance using the validation set.

    cluster : bool, optional
        Cluster similar predicted (and significant) motifs.

    create_report : bool, optional
        Create output reports (both .txt and .html).

    Returns
    -------
    motifs : list
        List of predicted motifs. The list is empty when no motifs are
        predicted or when none of the predicted motifs is significant.

    Examples
    --------

    >>> from gimmemotifs.denovo import gimme_motifs
    >>> gimme_motifs("input.fa", "motifs.out")
    """
    if outdir is None:
        outdir = "gimmemotifs_{}".format(
            datetime.date.today().strftime("%d_%m_%Y"))

    # Create output directories
    tmpdir = os.path.join(outdir, "intermediate")
    for d in [outdir, tmpdir]:
        if not os.path.exists(d):
            os.mkdir(d)

    # Log to file
    logger = logging.getLogger("gimme")
    logfile = os.path.join(outdir, "gimmemotifs.log")
    fh = logging.FileHandler(logfile, "w")
    fh.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(file_formatter)
    logger.addHandler(fh)
    logger = logging.getLogger("gimme.denovo")

    # Initialize parameters
    params = parse_denovo_params(params)

    # Check the input files
    input_type, background = check_denovo_input(inputfile, params)

    logger.info("starting full motif analysis")
    logger.debug("Using temporary directory %s", mytmpdir())

    params["size"] = int(params["size"])
    if params["size"] > 0:
        logger.info(
            "using size of {}, set size to 0 to use original region size".
            format(params["size"]))
    else:
        logger.info("using original size")

    # Create the necessary files for motif prediction and validation
    if input_type == "bed":
        logger.info("preparing input from BED")
        prepare_denovo_input_bed(inputfile, params, tmpdir)
    elif input_type == "narrowpeak":
        logger.info("preparing input from narrowPeak")
        prepare_denovo_input_narrowpeak(inputfile, params, tmpdir)
    elif input_type == "fasta":
        logger.info("preparing input from FASTA")
        prepare_denovo_input_fa(inputfile, params, tmpdir)
    else:
        logger.error("unknown input file format!")
        sys.exit(1)

    # Create the background FASTA files
    background = create_backgrounds(
        tmpdir,
        background,
        params.get("genome", None),
        params["size"],
        params.get("custom_background", None),
    )

    # Predict de novo motifs
    result = predict_motifs(
        os.path.join(tmpdir, "prediction.fa"),
        os.path.join(tmpdir, "prediction.bg.fa"),
        os.path.join(tmpdir, "all_motifs.pfm"),
        params=params,
        stats_fg=os.path.join(tmpdir, "validation.fa"),
        stats_bg=background,
    )

    if len(result.motifs) == 0:
        logger.info("finished")
        return []

    # Write statistics; the "{}" placeholder in the name is left for
    # write_stats to fill in.
    stats_file = os.path.join(tmpdir, "stats.{}.txt")
    write_stats(result.stats, stats_file)

    # The background with the lowest BG_RANK is used for significance.
    bg = min(background, key=lambda x: BG_RANK[x])
    if filter_significant:
        pfmfile = os.path.join(tmpdir, "significant_motifs.pfm")
        motifs = filter_significant_motifs(pfmfile, result, bg)
        if len(motifs) == 0:
            logger.info("no significant motifs")
            # Return an empty list, like the "no motifs predicted" exit
            # above, so that callers always receive a list.
            return []
    else:
        logger.info("not filtering for significance")
        motifs = result.motifs
        pfmfile = os.path.join(tmpdir, "all_motifs.pfm")

    if cluster:
        clustered_pfm = os.path.join(tmpdir, "clustered_motifs.pfm")
        clusters = cluster_motifs_with_report(
            pfmfile,
            clustered_pfm,
            outdir,
            0.95,
            title=inputfile,
        )

        # Determine best motif in cluster
        best_motifs = best_motif_in_cluster(
            pfmfile,
            clustered_pfm,
            clusters,
            os.path.join(tmpdir, "validation.fa"),
            background,
            params["genome"],
            result.stats,
        )

        final_motifs, stats = rename_motifs(best_motifs, result.stats)
    else:
        logger.info("not clustering")
        rank = rank_motifs(result.stats)
        sorted_motifs = sorted(motifs,
                               key=lambda x: rank[str(x)],
                               reverse=True)
        final_motifs, stats = rename_motifs(sorted_motifs, result.stats)

    final_pfm = os.path.join(outdir, "gimme.denovo.pfm")
    with open(final_pfm, "w") as f:
        for m in final_motifs:
            f.write("{}\n".format(m.to_pwm()))

    if create_report:
        # Background name -> background FASTA file in the intermediate dir.
        bg = {b: os.path.join(tmpdir, "bg.{}.fa".format(b))
              for b in background}

        create_denovo_motif_report(
            inputfile,
            final_pfm,
            os.path.join(tmpdir, "validation.fa"),
            bg,
            os.path.join(tmpdir, "localization.fa"),
            outdir,
            params,
            stats,
        )

    with open(os.path.join(outdir, "params.txt"), "w") as f:
        for k, v in params.items():
            f.write("{}\t{}\n".format(k, v))

    if not params.get("keep_intermediate"):
        logger.debug(
            "Deleting intermediate files. "
            "Please specifify the -k option if you want to keep these files.")
        shutil.rmtree(tmpdir)

    logger.info("finished")
    logger.info("output dir: %s", outdir)
    # Only point to the report when it was actually requested above.
    if create_report:
        logger.info("de novo report: %s",
                    os.path.join(outdir, "gimme.denovo.html"))

    return final_motifs
Exemplo n.º 21
0
def diff_plot(
    motifs,
    pwms,
    names,
    freq,
    counts,
    bgfreq,
    bgcounts,
    outfile,
    mindiff=0,
    minenr=3,
    minfreq=0.01,
):
    """Plot motif frequency and enrichment for several samples.

    Motifs are filtered on enrichment, frequency, difference between samples
    and counts. For the remaining motifs a figure with three columns is
    saved: the motif logos, a heatmap of the log2 enrichment over background
    and a heatmap of the frequencies (samples plus background). The values
    of the remaining motifs are also written to stderr.

    Parameters
    ----------
    motifs : list
        Motif identifiers, one per row of `freq`. Used as row labels and as
        keys of `pwms`.
    pwms : dict
        Maps a motif identifier to an object with a ``plot_logo`` method.
    names : list
        Sample names, one per column of `freq`.
    freq : array-like
        Motif frequency per motif (rows) and sample (columns).
    counts : array-like
        Motif counts, indexed in the same way as `freq`.
    bgfreq : array-like
        Background frequency per motif.
    bgcounts : array-like
        Not used.
    outfile : str
        Name of the output image.
    mindiff : float, optional
        A motif is kept if the difference between its highest and lowest
        log2 enrichment is larger than this value.
    minenr : float, optional
        A motif is kept if its log2 enrichment is larger than this value in
        at least one sample.
    minfreq : float, optional
        A motif is kept if its frequency is larger than this value in at
        least one sample.

    Returns
    -------
    None
        If no motif passes the filters, a message is written to stderr and
        no figure is created.
    """
    w_ratio = np.array([14, len(names), len(names) + 1])
    plot_order = [0, 1, 2]

    nbar = 5

    freq = np.array(freq)
    counts = np.array(counts)
    # Column vector, so that it broadcasts over the samples of every motif.
    bgfreq = np.array([[x] for x in bgfreq])

    enr = np.log2(np.divide(freq, bgfreq))
    filt = np.ones(len(enr), dtype="bool")
    filters = [
        np.sum(enr > minenr, 1) > 0,
        np.sum(freq > minfreq, 1) > 0,
        (np.max(enr, 1) - np.min(enr, 1)) > mindiff,
        np.sum(counts > 2, 1) > 0,
    ]
    # A motif has to pass all filters.
    for f in filters:
        filt = np.logical_and(filt, f)

    motifs = np.array(motifs)[filt]
    freq = freq[filt]
    bgfreq = bgfreq[filt]
    enr = enr[filt]

    for m, f, b, e in zip(motifs, freq, bgfreq, enr):
        sys.stderr.write("{0}\t{1}\t{2}\t{3}\n".format(
            m, "\t".join(str(x) for x in e), "\t".join(str(x) for x in f),
            b[0]))

    if len(freq) == 0:
        sys.stderr.write("No enriched and/or differential motifs found.\n")
        return
    elif len(freq) >= 3:
        # Order the rows by hierarchical clustering of the frequencies.
        z = hier.linkage(freq, method="complete", metric="correlation")
        ind = hier.leaves_list(z)
    else:
        ind = np.arange(len(freq))

    fig = plt.figure(figsize=((5 + 0.75 * len(names)) * 3,
                              (0.3 * len(motifs) + 1.5) * 3))

    # The first nbar (thin) rows hold the colorbars, followed by one row per
    # motif and three extra rows at the bottom.
    gs = GridSpec(
        len(motifs) + 3 + nbar,
        3,
        height_ratios=[1] * nbar + [3] * (len(motifs) + 3),
        width_ratios=w_ratio[plot_order],
    )

    # Colormaps
    c1 = mpl.cm.RdBu
    c2 = mpl.cm.Blues

    # Frequency plot #

    # Create axis
    ax = plt.subplot(gs[nbar:-3, plot_order[2]])

    # Plot frequencies
    vmin = 0
    vmax = 0.3

    pfreq = np.hstack((freq, bgfreq))
    ax.pcolormesh(pfreq[ind], cmap=c2, vmin=vmin, vmax=vmax)

    sm = plt.cm.ScalarMappable(cmap=c2, norm=Normalize(vmin=vmin, vmax=vmax))

    # Show percentages
    for y, row in enumerate(pfreq[ind]):
        for x, val in enumerate(row):
            # Text takes the color of the opposite end of the colormap, to
            # contrast with the cell it is drawn in.
            v = vmax
            if val >= (vmin + ((vmax - vmin) / 2)):
                v = vmin
            plt.text(
                x + 0.5,
                y + 0.5,
                "{:.1%}".format(val),
                ha="center",
                va="center",
                color=sm.to_rgba(v),
            )

    # Hide most labels
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)
    plt.setp(ax.get_yticklabels(), visible=False)

    # Set the X labels
    ticks = np.arange(len(names) + 1) + 0.5
    # list() allows names to be any sequence, not only a list.
    plt.xticks(ticks, list(names) + ["background"], rotation=30, ha="right")

    ax.set_ylim(0, len(motifs))

    # Title
    plt.title("Frequency")

    # Colorbar
    # pylint: disable=protected-access
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[2]])
    cb = fig.colorbar(sm, cax=cax, ticks=[0, 0.3], orientation="horizontal")
    cb.ax.set_xticklabels(["0%", "30%"])

    # Enrichment plot
    ax = plt.subplot(gs[nbar:-3, plot_order[1]])
    vmin = -10
    vmax = 10
    ax.pcolormesh(enr[ind], cmap=c1, vmin=vmin, vmax=vmax)
    for y, row in enumerate(enr[ind]):
        for x, val in enumerate(row):
            # White text in the outer eighths of the color range.
            col = "black"
            if val >= (vmin + ((vmax - vmin) / 8.0 * 7)):
                col = "white"
            elif val <= (vmin + ((vmax - vmin) / 8.0)):
                col = "white"
            plt.text(
                x + 0.5,
                y + 0.5,
                "{:.1f}".format(val),
                ha="center",
                va="center",
                color=col,
            )

    ticks = np.arange(len(names)) + 0.5
    plt.xticks(ticks, names, rotation=30, ha="right")
    ticks = np.arange(len(motifs)) + 0.5
    plt.yticks(ticks, motifs[ind])
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)

    ax.set_ylim(0, len(motifs))

    # Title
    plt.title("Enrichment (log2)")

    # Colorbar
    sm = plt.cm.ScalarMappable(cmap=c1, norm=Normalize(vmin=vmin, vmax=vmax))
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[1]])
    cb = fig.colorbar(sm,
                      cax=cax,
                      ticks=[vmin, 0, vmax],
                      orientation="horizontal")
    cb.ax.set_xticklabels([vmin, 0, vmax])

    # Motif logos

    # Heatmap rows are drawn bottom-up, grid rows top-down: reverse the order
    # so that every logo lines up with its heatmap row.
    for i, motif in enumerate(motifs[ind][::-1]):
        ax = plt.subplot(gs[i + nbar, plot_order[0]])
        axes_off(ax)
        # The logo image is only needed until it has been read; the context
        # manager removes the temporary file right away.
        with NamedTemporaryFile(dir=mytmpdir(), suffix=".png") as tmp:
            pwms[motif].plot_logo(fname=tmp.name, title=False)
            ax.imshow(plt.imread(tmp.name), interpolation="none")

    plt.savefig(outfile, dpi=300, bbox_inches="tight")
    plt.close(fig)
Exemplo n.º 22
0
def diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile, mindiff=0, minenr=3, minfreq=0.01):
    """Plot motif frequency and enrichment heatmaps next to the motif logos.

    Enrichment is computed as ``log2(freq / bgfreq)``. Motifs are filtered
    before plotting; a motif is kept only if all of the following hold:

    * its enrichment exceeds `minenr` in at least one column,
    * its frequency exceeds `minfreq` in at least one column,
    * the spread of its enrichment (max - min over the columns) exceeds
      `mindiff`,
    * its count exceeds 2 in at least one column.

    The number of motifs left after each filter is printed to stdout and the
    surviving rows are written to stderr. When three or more motifs remain,
    rows are ordered by complete-linkage hierarchical clustering of the
    frequencies (correlation metric). The figure has three columns: motif
    logos, the enrichment heatmap and the frequency heatmap (which gets an
    extra "background" column).

    Parameters
    ----------
    motifs : sequence
        Motif identifiers; used as row labels and as keys into `pwms`.
    pwms : mapping
        Maps a motif identifier to an object providing ``to_img``.
    names : sequence of str
        Column labels for `freq`.
    freq : array-like
        Two-dimensional, one row per motif; columns are labelled by `names`.
    counts : array-like
        Same layout as `freq`; only used for filtering.
    bgfreq : sequence of float
        One background frequency per motif.
    bgcounts : sequence
        Not used by this function; kept for interface compatibility.
    outfile : str
        File the figure is saved to.
    mindiff : float , optional
        Minimum spread of the enrichment across columns.
    minenr : float , optional
        Minimum log2 enrichment.
    minfreq : float , optional
        Minimum frequency.

    Returns
    -------
    None
        If no motif passes the filters a message is written to stderr and no
        figure is created.
    """
    w_ratio = np.array([14, len(names), len(names) + 1])
    plot_order = [0, 1, 2]

    # Number of thin grid rows reserved at the top for the colorbars
    nbar = 5

    freq = np.array(freq)
    counts = np.array(counts)
    # Column vector, so that it broadcasts over the columns of freq
    bgfreq = np.array([[x] for x in bgfreq])

    enr = np.log2(np.divide(freq, bgfreq))

    filt = np.ones(len(enr), dtype="bool")
    filters = [
        np.sum(enr > minenr, 1) > 0,
        np.sum(freq > minfreq, 1) > 0,
        (np.max(enr, 1) - np.min(enr, 1)) > mindiff,
        np.sum(counts > 2, 1) > 0,
    ]
    for f in filters:
        filt = np.logical_and(filt, f)
        # Single-argument call: produces the same text on Python 2 and 3
        print("Filter:  {0}".format(sum(filt)))

    motifs = np.array(motifs)[filt]
    freq = freq[filt]
    bgfreq = bgfreq[filt]
    enr = enr[filt]

    for m, f, b, e in zip(motifs, freq, bgfreq, enr):
        sys.stderr.write("{0}\t{1}\t{2}\t{3}\n".format(m, f, b, e))

    if len(freq) == 0:
        sys.stderr.write("No enriched and/or differential motifs found.\n")
        return
    elif len(freq) >= 3:
        z = hier.linkage(freq, method="complete", metric="correlation")
        ind = hier.leaves_list(z)
    else:
        # Too few rows to cluster: keep the input order
        ind = np.arange(len(freq))

    fig = plt.figure(figsize=(
        (5 + 0.75 * len(names)) * 3,
        (0.3 * len(motifs) + 1.5) * 3
    ))

    # Make sure the figure is released even if plotting or saving fails
    try:
        gs = GridSpec(
            len(motifs) + 3 + nbar, 3,
            height_ratios=[1] * nbar + [3] * (len(motifs) + 3),
            width_ratios=w_ratio[plot_order],
        )

        # Colormaps
        c1 = mpl.cm.RdBu
        c2 = mpl.cm.Blues

        ### Frequency plot ###

        # Create axis
        ax = plt.subplot(gs[nbar:-3, plot_order[2]])

        # Plot frequencies, with the background as the last column
        vmin = 0
        vmax = 0.3

        pfreq = np.hstack((freq, bgfreq))
        ax.pcolormesh(pfreq[ind], cmap=c2, vmin=vmin, vmax=vmax)

        sm = plt.cm.ScalarMappable(cmap=c2, norm=mpl.colors.Normalize(vmin=vmin, vmax=vmax))

        # Show percentages; the text takes the colour of the opposite end of
        # the colormap so that it stays readable on the cell
        for y, row in enumerate(pfreq[ind]):
            for x, val in enumerate(row):
                v = vmax
                if val >= (vmin + ((vmax - vmin) / 2)):
                    v = vmin
                plt.text(x + 0.5, y + 0.5, "{:.1%}".format(val), ha='center', va='center', color=sm.to_rgba(v))

        # Hide most labels
        plt.setp(ax.get_xticklines(), visible=False)
        plt.setp(ax.get_yticklines(), visible=False)
        plt.setp(ax.get_yticklabels(), visible=False)

        # Set the X labels; list() so that any sequence of names is accepted
        ticks = np.arange(len(names) + 1) + 0.5
        plt.xticks(ticks, list(names) + ["background"], rotation=30, ha="right")

        ax.set_ylim(0, len(motifs))

        # Title
        plt.title('Frequency')

        # Colorbar
        sm._A = []
        cax = plt.subplot(gs[0, plot_order[2]])
        cb = fig.colorbar(sm, cax=cax, ticks=[0, 0.3], orientation='horizontal')
        cb.ax.set_xticklabels(["0%", "30%"])

        #### Enrichment plot
        ax = plt.subplot(gs[nbar:-3, plot_order[1]])
        vmin = -10
        vmax = 10
        ax.pcolormesh(enr[ind], cmap=c1, vmin=vmin, vmax=vmax)
        for y, row in enumerate(enr[ind]):
            for x, val in enumerate(row):
                # White text on the outer eighths of the colour range,
                # black text everywhere else
                col = "black"
                if val >= (vmin + ((vmax - vmin) / 8.0 * 7)):
                    col = "white"
                elif val <= (vmin + ((vmax - vmin) / 8.0)):
                    col = "white"
                plt.text(x + 0.5, y + 0.5, "{:.1f}".format(val), ha='center', va='center', color=col)

        ticks = np.arange(len(names)) + 0.5
        plt.xticks(ticks, names, rotation=30, ha="right")
        ticks = np.arange(len(motifs)) + 0.5
        plt.yticks(ticks, motifs[ind])
        plt.setp(ax.get_xticklines(), visible=False)
        plt.setp(ax.get_yticklines(), visible=False)

        ax.set_ylim(0, len(motifs))

        # Title
        plt.title('Enrichment (log2)')

        # Colorbar
        sm = plt.cm.ScalarMappable(cmap=c1, norm=mpl.colors.Normalize(vmin=vmin, vmax=vmax))
        sm._A = []
        cax = plt.subplot(gs[0, plot_order[1]])
        cb = fig.colorbar(sm, cax=cax, ticks=[vmin, 0, vmax], orientation='horizontal')
        cb.ax.set_xticklabels([vmin, 0, vmax])

        #### Motif logos

        # Heatmap row 0 is drawn at the bottom, the grid is filled from the
        # top, hence the reversed order
        for i, motif in enumerate(motifs[ind][::-1]):
            ax = plt.subplot(gs[i + nbar, plot_order[0]])
            axes_off(ax)
            # The temporary image is removed as soon as it has been read
            with NamedTemporaryFile(dir=mytmpdir(), suffix=".png") as tmp:
                # NOTE(review): the keyword is `format` here; confirm that it
                # matches the to_img signature of the objects in `pwms`.
                pwms[motif].to_img(tmp.name, format="PNG", height=6)
                ax.imshow(plt.imread(tmp.name), interpolation="none")

        plt.savefig(outfile, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)
Exemplo n.º 23
0
    def to_img(self, fname, fmt="PNG", add_left=0, seqlogo=None, height=6):
        """Create a sequence logo using seqlogo.

        Create a sequence logo and save it to a file. Valid formats are: PNG,
        EPS, GIF and PDF.

        The motif is converted to a set of 1000 sequences in which every
        nucleotide occurs, per position, in proportion to its weight in
        ``self.pwm``; these sequences are written to a temporary file that
        is passed to seqlogo.

        Parameters
        ----------
        fname : str
            Output filename. A trailing extension that equals the format
            (e.g. ".png") is stripped before the name is passed to seqlogo.
        fmt : str , optional
            Output format (case-insensitive). Valid formats are PNG, EPS, GIF
            and PDF. For any other value a message is written to stderr and
            nothing is created.
        add_left : int , optional
            Pad motif with empty positions on the left side.
        seqlogo : str
            Location of the seqlogo executable. By default the seqlogo version
            that is included with GimmeMotifs is used.
        height : float
            Height of the image

        Raises
        ------
        ValueError
            If no seqlogo executable is given and none is configured.
        """
        if not seqlogo:
            seqlogo = self.seqlogo
        if not seqlogo:
            raise ValueError("seqlogo not specified or configured")

        #TODO: split to_align function

        VALID_FORMATS = ["EPS", "GIF", "PDF", "PNG"]
        N = 1000
        fmt = fmt.upper()
        if fmt not in VALID_FORMATS:
            sys.stderr.write("Invalid motif format\n")
            return

        if fname[-4:].upper() == (".%s" % fmt):
            fname = fname[:-4]

        if add_left == 0:
            seqs = ["" for _ in range(N)]
        else:
            # Equal amounts of every nucleotide, so that the padded columns
            # carry no information
            seqs = []
            for nuc in ["A", "C", "T", "G"]:
                seqs += [nuc * add_left for _ in range(N // 4)]

        for row in self.pwm:
            # Cumulative number of sequences that get A, C, G and T
            vals = [row[0] * N]
            for i in range(1, 4):
                vals.append(vals[i - 1] + row[i] * N)
            # The weights may not add up to exactly 1; the last boundary is
            # pinned to N so that every sequence receives a nucleotide
            vals[3] = N
            # Strict comparison: sequence i belongs to the first nucleotide
            # whose cumulative count is larger than i. A nucleotide with a
            # weight of 0 is therefore never emitted.
            for i in range(N):
                if i < vals[0]:
                    seqs[i] += "A"
                elif i < vals[1]:
                    seqs[i] += "C"
                elif i < vals[2]:
                    seqs[i] += "G"
                else:
                    seqs[i] += "T"

        # The temporary sequence file only has to exist while seqlogo runs
        with NamedTemporaryFile(mode="w", dir=mytmpdir()) as f:
            f.writelines("%s\n" % seq for seq in seqs)
            f.flush()
            makelogo = "{0} -f {1} -F {2} -c -a -h {3} -w {4} -o {5} -b -n -Y"
            cmd = makelogo.format(seqlogo, f.name, fmt, height,
                                  len(self) + add_left, fname)
            # NOTE(review): the command is run through the shell and the file
            # names are not quoted, so names containing spaces or shell
            # metacharacters will break it. Kept as a string because seqlogo
            # may be configured as a complete command line -- confirm before
            # switching to an argument list.
            sp.call(cmd, shell=True)