Example #1
def check_threshold(outdir, genome, scoring="count"):
    # gimme_motifs config, to get defaults
    config = MotifConfig()
    
    threshold_file = None
    if scoring == "count":
        # Motif scanning threshold
        threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
        if not os.path.exists(threshold_file):
            # Random sequences from genome
            index_dir = os.path.join(config.get_index_dir(), genome)
            bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
            if not os.path.exists(bg_file):
                m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
                m.writefasta(bg_file)
    
            pwmfile = config.get_default_params().get("motif_db")
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
            
            cmd = "gimme threshold {} {} {} > {}".format(
                    pwmfile,
                    bg_file,
                    FDR,
                    threshold_file)
            sp.call(cmd, shell=True)
        return threshold_file
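A minimal usage sketch (hypothetical arguments; assumes the module-level BG_LENGTH, BG_NUMBER and FDR constants are defined and the gimme CLI is installed):

# Hypothetical usage; "results" and "hg19" are placeholders.
threshold_file = check_threshold("results", "hg19", scoring="count")
print(threshold_file)  # results/threshold.hg19.txt (None for other scoring values)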
Example #2
def scan_it_moods(infile,
                  motifs,
                  cutoff,
                  bgfile,
                  nreport=1,
                  scan_rc=True,
                  pvalue=None,
                  count=False):
    tmpdir = mkdtemp()
    matrices = []
    pseudocount = 1e-3
    # sys.stderr.write("bgfile: {}\n".format(bgfile))
    bg = MOODS.tools.bg_from_sequence_dna("".join(Fasta(bgfile).seqs), 1)

    for motif in motifs:
        pfmname = os.path.join(tmpdir, "{}.pfm".format(motif.id))
        with open(pfmname, "w") as f:
            matrix = np.array(motif.pwm).transpose()
            for line in [" ".join([str(x) for x in row]) for row in matrix]:
                f.write("{}\n".format(line))

        matrices.append(MOODS.parsers.pfm_log_odds(pfmname, bg, pseudocount))

    thresholds = []
    if pvalue is not None:
        thresholds = [
            MOODS.tools.threshold_from_p(m, bg, float(pvalue))
            for m in matrices
        ]
        # sys.stderr.write("{}\n".format(thresholds))
    else:
        thresholds = [calc_threshold_moods(m, float(cutoff)) for m in matrices]

    scanner = MOODS.scan.Scanner(7)
    scanner.set_motifs(matrices, bg, thresholds)

    config = MotifConfig()
    ncpus = int(config.get_default_params()["ncpus"])
    fa = Fasta(infile)
    chunk = 500
    if len(fa) // chunk < ncpus:
        # Integer division; keep at least one sequence per chunk
        chunk = max(1, len(fa) // (ncpus + 1))

    jobs = []
    func = scan_fa_with_motif_moods
    if count:
        func = scan_fa_with_motif_moods_count

    pool = mp.Pool()
    for i in range(0, len(fa), chunk):
        jobs.append(
            pool.apply_async(
                func,
                (fa[i:i + chunk], motifs, matrices, bg, thresholds, nreport,
                 scan_rc),
            ))

    for job in jobs:
        for ret in job.get():
            yield ret
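A minimal usage sketch (hypothetical file names; scan_it_moods is a generator, so matches stream in as the worker jobs finish):

# Hypothetical usage; file names are placeholders.
motifs = read_motifs(open("motifs.pwm"))
for match in scan_it_moods("peaks.fa", motifs, cutoff=0.9,
                           bgfile="background.fa", pvalue=1e-4):
    print(match)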
Example #3
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    threshold = check_threshold(data_dir, genome, scoring)
    
    config = MotifConfig()
    
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)
    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    scores = []
    if scoring == "count":
        for row in s.count(regions, cutoff=threshold):
            scores.append(row)
    else:
        for row in s.best_score(regions):
            scores.append(row)
   
    motif_names = [m.id for m in read_motifs(open(pwmfile))]
    return pd.DataFrame(scores, index=df.index, columns=motif_names)
Example #4
def scan(infile, motifs, cutoff, nreport=1, it=False):
    # Get configuration defaults
    config = MotifConfig()
    # Cutoff for motif scanning, only used if a cutoff is not supplied
    default_cutoff = config.get_default_params()['scan_cutoff']
    # Number of CPUs to use
    ncpus = int(config.get_default_params()['ncpus'])
    
    cutoffs = parse_cutoff(motifs, cutoff, default_cutoff) 
    
    job_server = pp.Server(secret="beetrootsoup")
    if job_server.get_ncpus() > ncpus:
        job_server.set_ncpus(ncpus)
    
    total_result = {}
    jobs = []
    fa = Fasta(infile)
    for motif in motifs:
        # Initialize once per motif, not once per chunk
        total_result[motif] = {}
        for i in range(0, len(fa), CHUNK):
            jobs.append(job_server.submit(
                scan_fa_with_motif,
                (fa[i:i + CHUNK],
                 motif,
                 cutoffs[motif.id],
                 nreport,
                 ),
                (), ()))
    motifkey = dict([(m.id, m) for m in motifs])
    for job in jobs:
        motif, result = job()
        
        total_result[motifkey[motif.id]].update(result)
    
    return total_result
Example #6
def get_genome(genomebuild, fastadir, indexdir=None):

    config = MotifConfig()
    if not indexdir:
        indexdir = config.get_index_dir()

    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(indexdir, genomebuild)

    
    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download annotation
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    download_annotation(genomebuild, gene_file)
    
    # Download genome FASTA file
    download_genome(genomebuild, genome_dir)

    sys.stderr.write("Creating index\n")
    g = GenomeIndex()
    g = g.create_index(genome_dir, index_dir)
    create_bedtools_fa(index_dir, genome_dir)
Example #7
def pfmfile_location(infile):
    config = MotifConfig()

    if infile is None:
        infile = config.get_default_params().get("motif_db", None)
        if infile is None:
            raise ValueError(
                "No motif file was given and no default "
                "database specified in the config file."
            )

    if isinstance(infile, six.string_types):
        if not os.path.exists(infile):
            motif_dir = config.get_motif_dir()
            checkfile = os.path.join(motif_dir, infile)
            if os.path.exists(checkfile):
                infile = checkfile
            else:
                for ext in [".pfm", ".pwm"]:
                    if os.path.exists(checkfile + ext):
                        infile = checkfile + ext
                        break
            if not os.path.exists(infile):
                raise ValueError("Motif file {} not found".format(infile))

    return infile
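A minimal usage sketch; passing None falls back to the database configured as motif_db, and a bare name is resolved against the configured motif directory (trying the .pfm and .pwm extensions):

# Resolve the default motif database to a full path.
pfm_path = pfmfile_location(None)
# Resolve a bare database name ("my_motifs" is a hypothetical example).
pfm_path = pfmfile_location("my_motifs")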
Example #8
def scan(infile, motifs, cutoff, nreport=1, it=False):
    # Get configuration defaults
    config = MotifConfig()
    # Cutoff for motif scanning, only used if a cutoff is not supplied
    default_cutoff = config.get_default_params()['scan_cutoff']
    # Number of CPUs to use
    ncpus = int(config.get_default_params()['ncpus'])
    
    cutoffs = parse_cutoff(motifs, cutoff, default_cutoff) 
    
    total_result = {}
    jobs = []
    fa = Fasta(infile)
    # `pool` was defined at module level in the original source; create one
    # here so the snippet is self-contained.
    pool = mp.Pool(processes=ncpus)
    for motif in motifs:
        # Initialize once per motif, not once per chunk
        total_result[motif] = {}
        for i in range(0, len(fa), CHUNK):
            jobs.append(pool.apply_async(
                scan_fa_with_motif,
                (fa[i:i + CHUNK],
                 motif,
                 cutoffs[motif.id],
                 nreport,
                 )))
    motifkey = dict([(m.id, m) for m in motifs])
    for job in jobs:
        motif, result = job.get()
        
        total_result[motifkey[motif.id]].update(result)
   
    return total_result
Example #9
def scan_it(infile, motifs, cutoff, nreport=1, rc=True):
    # Get configuration defaults
    config = MotifConfig()
    # Cutoff for motif scanning, only used if a cutoff is not supplied
    default_cutoff = config.get_default_params()['scan_cutoff']
    # Number of CPUs to use
    ncpus = int(config.get_default_params()['ncpus'])
    
    cutoffs = parse_cutoff(motifs, cutoff, default_cutoff) 
    
    jobs = []
    fa = Fasta(infile)
    motifkey = dict([(m.id, m) for m in motifs])
    # `pool` was defined at module level in the original source; create one
    # here so the snippet is self-contained.
    pool = mp.Pool(processes=ncpus)
    
    for motif in motifs:
        for i in range(0, len(fa), CHUNK):
            jobs.append(pool.apply_async(
                                          scan_fa_with_motif,
                                          (fa[i:i + CHUNK],
                                          motif,
                                          cutoffs[motif.id],
                                          nreport,
                                          rc,
                                          )))
    
        while len(jobs) > 10:
            job = jobs.pop(0) 
            motif, result = job.get()
            yield motifkey[motif.id], result

    for job in jobs:
        motif, result = job.get()
        yield motifkey[motif.id], result
Example #10
def scan_it_moods(infile, motifs, cutoff, bgfile, nreport=1, scan_rc=True, pvalue=None, count=False):
    tmpdir = mkdtemp()
    matrices = []
    pseudocount = 1e-3
    #sys.stderr.write("bgfile: {}\n".format(bgfile))
    bg = MOODS.tools.bg_from_sequence_dna("".join(Fasta(bgfile).seqs), 1)

    for motif in motifs:
        pfmname = os.path.join(tmpdir, "{}.pfm".format(motif.id))
        with open(pfmname, "w") as f:
            matrix = np.array(motif.pwm).transpose()
            for line in [" ".join([str(x) for x in row]) for row in matrix]:
                f.write("{}\n".format(line))

        matrices.append(MOODS.parsers.pfm_log_odds(pfmname, bg, pseudocount))

    thresholds = []
    if pvalue is not None:
        thresholds = [MOODS.tools.threshold_from_p(m, bg, float(pvalue)) for m in matrices]
        #sys.stderr.write("{}\n".format(thresholds))
    else:
        thresholds = [calc_threshold_moods(m, float(cutoff)) for m in matrices]

    scanner = MOODS.scan.Scanner(7)
    scanner.set_motifs(matrices, bg, thresholds)

    config = MotifConfig()
    ncpus = int(config.get_default_params()['ncpus'])
    fa = Fasta(infile)
    chunk = 500
    if len(fa) // chunk < ncpus:
        # Integer division; keep at least one sequence per chunk
        chunk = max(1, len(fa) // (ncpus + 1))

    jobs = []
    func = scan_fa_with_motif_moods
    if count:
        func = scan_fa_with_motif_moods_count

    # `pool` was defined at module level in the original source; create one
    # here so the snippet is self-contained.
    pool = mp.Pool(processes=ncpus)
    for i in range(0, len(fa), chunk):
        jobs.append(pool.apply_async(
            func,
            (fa[i:i + chunk],
             motifs,
             matrices,
             bg,
             thresholds,
             nreport,
             scan_rc,
             )))

    for job in jobs:
        for ret in job.get():
            yield ret
Example #11
def default_motifs():
    """Return list of Motif instances from default motif database."""
    config = MotifConfig()
    d = config.get_motif_dir()
    m = config.get_default_params()['motif_db']

    if not d or not m:
        raise ValueError("default motif database not configured")

    fname = os.path.join(d, m)
    with open(fname) as f:
        motifs = read_motifs(f)

    return motifs
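A minimal usage sketch, assuming a default motif database is configured:

motifs = default_motifs()
print(len(motifs), motifs[0].id)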
Example #13
def scan_to_table(input_table,
                  genome,
                  data_dir,
                  scoring,
                  pwmfile=None,
                  ncpus=None):
    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)
    nregions = len(regions)

    scores = []
    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR, genome=genome)
        logger.info("creating count table")
        for row in s.count(regions):
            scores.append(row)
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        for row in s.best_score(regions):
            scores.append(row)
        logger.info("done")

    motif_names = [m.id for m in read_motifs(open(pwmfile))]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
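A minimal usage sketch (hypothetical file and genome names; FPR is a module-level constant in the source this snippet comes from):

counts = scan_to_table("regions.txt", "hg38", "data", "count", ncpus=4)
counts.to_csv("motif.counts.txt", sep="\t")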
Example #14
class MotifProgram:
    from gimmemotifs.config import MotifConfig
    config = MotifConfig()

    def __init__(self):
        pass

    def bin(self):
        return self.config.bin(self.name)

    def dir(self):
        return self.config.dir(self.name)

    def is_configured(self):
        return self.config.is_configured(self.name)

    def is_installed(self):
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, savedir, params=None):
        if params is None:
            params = {}

        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        try:
            return self._run_program(self.bin(), fastafile, savedir, params)
        except KeyboardInterrupt:
            return ([], "Killed", "Killed")
Example #15
    def __init__(self, scale=True, ncpus=None):
        """Predict motif activities using Support Vector Regression.

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled
            before classification.

        ncpus : int, optional
            Number of threads. Default is the number specified in the config.

        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            SVR weights.
        """

        self.act_description = "activity values: SVR weights"

        if ncpus is None:
            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
        self.ncpus = ncpus
        self.scale = scale
        self.act_ = None
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
        self.ptype = "regression"
Example #16
    def get_all_scores(self,
                       motifs,
                       dbmotifs,
                       match,
                       metric,
                       combine,
                       pval=False,
                       parallel=True,
                       trim=None,
                       ncpus=None):

        # trim motifs first, if specified
        if trim:
            for m in motifs:
                m.trim(trim)
            for m in dbmotifs:
                m.trim(trim)

        # hash of result scores
        scores = {}

        if parallel:
            # Divide the job into big chunks to keep parallel overhead to a
            # minimum; number of chunks = number of processors available
            if ncpus is None:
                ncpus = int(MotifConfig().get_default_params()["ncpus"])

            pool = Pool(processes=ncpus, maxtasksperchild=1000)

            batch_len = len(dbmotifs) // ncpus
            if batch_len <= 0:
                batch_len = 1
            jobs = []
            for i in range(0, len(dbmotifs), batch_len):
                # submit jobs to the job server

                p = pool.apply_async(_get_all_scores,
                                     args=(self, motifs,
                                           dbmotifs[i:i + batch_len], match,
                                           metric, combine, pval))
                jobs.append(p)

            pool.close()
            for job in jobs:
                # Get the job result
                result = job.get()
                # and update the result score
                for m1, v in result.items():
                    for m2, s in v.items():
                        if m1 not in scores:
                            scores[m1] = {}
                        scores[m1][m2] = s

            pool.join()
        else:
            # Do the whole thing at once if we don't want parallel
            scores = _get_all_scores(self, motifs, dbmotifs, match, metric,
                                     combine, pval)

        return scores
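A minimal usage sketch, assuming this method belongs to a comparer class such as the MotifComparer used in example 36; the match/metric/combine values mirror the compare_motifs call there:

# motifs and dbmotifs are lists of Motif instances (placeholders here).
mc = MotifComparer()
scores = mc.get_all_scores(motifs, dbmotifs, match="total",
                           metric="wic", combine="mean", pval=True)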
Example #17
def create_roc_plots(pfmfile, fgfa, background, outdir, genome):
    """Make ROC plots for all motifs."""
    motifs = read_motifs(pfmfile, fmt="pwm", as_dict=True)
    ncpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=ncpus)
    jobs = {}
    for bg, fname in background.items():
        for m_id, m in motifs.items():

            k = "{}_{}".format(str(m), bg)
            jobs[k] = pool.apply_async(get_roc_values,
                                       (motifs[m_id], fgfa, fname, genome))
    imgdir = os.path.join(outdir, "images")
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)

    roc_img_file = os.path.join(outdir, "images", "{}_roc.{}.png")

    for motif in motifs.values():
        for bg in background:
            k = "{}_{}".format(str(motif), bg)
            error, x, y = jobs[k].get()
            if error:
                logger.error("Error in thread: %s", error)
                logger.error("Motif: %s", motif)
                sys.exit(1)
            roc_plot(roc_img_file.format(motif.id, bg), x, y)
Example #18
    def __init__(self, name=None):
        self.config = MotifConfig()
        self.server = None

        if not name:
            name = "%s_%s" % (self.NAME, datetime.today().strftime("%d_%m_%Y"))
        self.name = name

        # create a directory for all the intermediate and output files
        self._setup_output_dir(name)

        # setup logging
        self._setup_logging()
        self.logger.info("%s version %s", self.NAME, GM_VERSION)
        self.logger.info("output dir: %s", self.outdir)

        # setup the names of the intermediate and output files
        self._setup_filenames()
Example #19
    def __init__(self, scale=True, permute=False, ncpus=None):
        """Predict motif activities using lightning CDClassifier 

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled 
            before classification
        
        ncpus : int, optional
            Number of threads. Default is the number specified in the config.
       
        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            fitted coefficients

        sig_ : DataFrame, shape (n_motifs,)
            boolean values, if coefficients are higher/lower than
            the 1%t from random permutation
        """

        self.act_description = ("activity values: coefficients from "
                                "fitted model")

        #self.cdc = CDClassifier(random_state=args.seed)
        self.cdc = CDClassifier()

        self.parameters = {
            "penalty": ["l1/l2"],
            "loss": ["squared_hinge"],
            "multiclass": [True],
            "max_iter": [20],
            "alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)],
            "C": [0.001, 0.01, 0.1, 0.5, 1.0],
            "tol": [1e-3]
        }

        self.kfolds = 10

        if ncpus is None:
            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

        self.clf = GridSearchCV(self.cdc,
                                self.parameters,
                                cv=self.kfolds,
                                n_jobs=ncpus)

        self.scale = scale
        self.permute = permute

        self.act_ = None
        self.sig_ = None
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
        self.ptype = "classification"
Example #20
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    All regions are resized to the same width, split into a prediction
    and a validation set, and converted to FASTA.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters.

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")
    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile, pred_bedfile, val_bedfile)
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    config = MotifConfig()

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        genome.track2fasta(
            infile,
            infile.replace(".bed", ".fa"),
        )

    # Create file for location plots
    lwidth = int(params["lwidth"])
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
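A minimal usage sketch; the dictionary keys mirror the parameters the function actually reads (values are placeholders):

params = {"width": 200, "fraction": 0.2, "abs_max": 1000,
          "lwidth": 500, "genome": "hg38", "use_strand": False}
prepare_denovo_input_bed("peaks.bed", params, "outdir")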
Example #21
    def __init__(self, ncpus=None):
        self.config = MotifConfig()
        self.threshold = None
        self.genome = None

        if ncpus is None:
            self.ncpus = int(MotifConfig().get_default_params()["ncpus"])
        else:
            self.ncpus = ncpus

        if self.ncpus > 1:
            try:
                ctx = mp.get_context('spawn')
                self.pool = ctx.Pool(processes=self.ncpus)
            except AttributeError:
                self.pool = mp.Pool(processes=self.ncpus)

        self.use_cache = False
        if self.config.get_default_params().get("use_cache", False):
            self._init_cache()
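This initializer backs the Scanner used in examples 3 and 13; a typical session (sketch with placeholder names) sets motifs and a genome and then scans:

s = Scanner(ncpus=4)
s.set_motifs("motifs.pwm")
s.set_genome("hg38")
for row in s.count(["chr1:100-300"], cutoff=0.95):
    print(row)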
Example #22
    def __init__(self, matchfile, genome="hg19", number=None):
        config = MotifConfig()
        index = os.path.join(config.get_index_dir(), genome)

        # Create temporary files
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name
        
        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number)
        
        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)

        # Delete the temporary files
        os.remove(tmpbed)
        os.remove(tmpfasta)
Example #23
def _write_report(outdir, ids, tree, clusters):
    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    with open(os.path.join(outdir, "cluster_report.html"), "w") as f:
        f.write(result)

    f = open(os.path.join(outdir, "cluster_key.txt"), "w")
    for motif_id in ids:
        f.write("%s\t%s\n" % (motif_id[0], ",".join([x["alt"] for x in motif_id[2]])))
    f.close()

    f = open(os.path.join(outdir, "clustered_motifs.pwm"), "w")
    if len(clusters) == 1 and len(clusters[0][1]) == 1:
        f.write("%s\n" % clusters[0][0].to_pwm())
    else:
        for motif in tree.get_clustered_motifs():
            f.write("%s\n" % motif.to_pwm())
    f.close()
Example #24
    def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None):
        """Predict motif activities using Lasso MultiTask regression

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled
            before classification

        kfolds : integer, optional, default 4
            number of kfolds for parameter search

        alpha_stepsize : float, optional, default 1.0
            stepsize for use in alpha gridsearch

        ncpus : int, optional
            Number of threads. Default is the number specified in the config.

        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            fitted motif activities

        sig_ : DataFrame, shape (n_motifs,)
            boolean values indicating whether coefficients are higher or lower
            than the 1% significance threshold from random permutations
        """

        self.kfolds = kfolds
        self.act_description = "activity values: coefficients from " "fitted model"

        self.scale = scale
        if ncpus is None:
            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
        self.ncpus = ncpus

        # initialize attributes
        self.act_ = None
        self.sig_ = None

        mtk = MultiTaskLasso()
        parameters = {
            "alpha": [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]
        }
        self.clf = GridSearchCV(mtk,
                                parameters,
                                cv=kfolds,
                                n_jobs=self.ncpus,
                                scoring="r2")
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
        self.ptype = "regression"
Example #25
def scan_it(infile, motifs, cutoff, nreport=1, rc=True):
    # Get configuration defaults
    config = MotifConfig()
    # Cutoff for motif scanning, only used if a cutoff is not supplied
    default_cutoff = config.get_default_params()['scan_cutoff']
    # Number of CPUs to use
    ncpus = int(config.get_default_params()['ncpus'])
    
    cutoffs = parse_cutoff(motifs, cutoff, default_cutoff) 
    
    job_server = pp.Server(secret="beetrootsoup")
    pp.SHOW_EXPECTED_EXCEPTIONS # True
    if job_server.get_ncpus() > ncpus:
        job_server.set_ncpus(ncpus)
    
    jobs = []
    fa = Fasta(infile)
    motifkey = dict([(m.id, m) for m in motifs])
    
    for motif in motifs:
        for i in range(0, len(fa), CHUNK):
            jobs.append(job_server.submit(
                                          scan_fa_with_motif,
                                          (fa[i:i + CHUNK],
                                          motif,
                                          cutoffs[motif.id],
                                          nreport,
                                          rc,
                                          ),
                                          (),()))
    
        while len(jobs) > 10:
            job = jobs.pop(0) 
            motif, result = job()
            yield motifkey[motif.id], result

    for job in jobs:
        motif, result = job()
        yield motifkey[motif.id], result
Example #26
    def __init__(self, ncpus=None):
        self.config = MotifConfig()
        self.threshold = None
        self.genome = None
        self.background = None
        self.meanstd = {}
        self.gc_bins = [(0, 1)]

        if ncpus is None:
            self.ncpus = int(MotifConfig().get_default_params()["ncpus"])
        else:
            self.ncpus = ncpus

        if self.ncpus > 1:
            # try:
            #    ctx = mp.get_context('spawn')
            #    self.pool = ctx.Pool(processes=self.ncpus)
            # except AttributeError:
            self.pool = mp.Pool(processes=self.ncpus)

        self.use_cache = False
        if self.config.get_default_params().get("use_cache", False):
            self._init_cache()
Example #27
def pwmfile_location(infile):
    config = MotifConfig()

    if infile is None:
        infile = config.get_default_params().get("motif_db", None)
        if infile is None:
            raise ValueError("No motif file was given and no default "
                    "database specified in the config file.")

    if isinstance(infile, six.string_types):
        if not os.path.exists(infile):
            motif_dir = config.get_motif_dir()
            checkfile = os.path.join(motif_dir, infile)
            if os.path.exists(checkfile):
                infile = checkfile
            else:
                for ext in ['.pfm', '.pwm']:
                    if os.path.exists(checkfile + ext):
                        infile = checkfile + ext
                        break
            if not os.path.exists(infile):
                raise ValueError("Motif file {} not found".format(infile))

    return infile
Example #28
def check_threshold(outdir, genome, scoring="count"):
    # gimme_motifs config, to get defaults
    config = MotifConfig()

    threshold_file = None
    if scoring == "count":
        # Motif scanning threshold
        threshold_file = os.path.join(outdir,
                                      "threshold.{}.txt".format(genome))
        if not os.path.exists(threshold_file):
            # Random sequences from genome
            index_dir = os.path.join(config.get_index_dir(), genome)
            bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
            if not os.path.exists(bg_file):
                m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
                m.writefasta(bg_file)

            pwmfile = config.get_default_params().get("motif_db")
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

            cmd = "gimme threshold {} {} {} > {}".format(
                pwmfile, bg_file, FDR, threshold_file)
            sp.call(cmd, shell=True)
        return threshold_file
Example #29
def maelstrom_html_report(outdir, infile, pwmfile=None, threshold=2):
    df = pd.read_table(infile, index_col=0)
    df = df[np.any(abs(df) >= threshold, axis=1)]
    M = max(abs(df.min().min()), df.max().max())
    m = -M

    if pwmfile:
        with open(pwmfile) as f:
            motifs = read_motifs(f)
    else:
        motifs = default_motifs()

    df.index.name = None
    cols = df.columns
    m2f = dict([(m.id,",".join(m.factors)) for m in motifs])

    df["factors"] = [m2f.get(m, "") for m in df.index]
    f = df["factors"].str.len() > 30
    df["factors"] = '<div title="' + df["factors"] + '">' + df["factors"].str.slice(0,30)
    df.loc[f, "factors"] += '(...)'
    df['factors'] += '</div>'

    df["logo"] = ['<img src="logos/{}.png" height=40/>'.format(x) for x in list(df.index)]

    if not os.path.exists(outdir + "/logos"):
        os.makedirs(outdir + "/logos")
    for motif in motifs:
        if motif.id in df.index:
            motif.to_img(outdir + "/logos/{}.png".format(motif.id), fmt="PNG")

    template_dir = MotifConfig().get_template_dir()
    js = open(os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8").read()
    css = open(os.path.join(template_dir, "sortable/sortable-theme-slick.css"), encoding="utf-8").read()
    cm = sns.diverging_palette(240, 10, as_cmap=True)
    df = df[["factors", "logo"] + list(cols)]
    with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")

        styled = (
            df.style
            .apply(background_gradient, low=0.7, high=0.7, m=m, M=M, subset=cols)
            .set_precision(3)
            .set_table_attributes("data-sortable")
            .render()
            .replace("data-sortable", 'class="sortable-theme-slick" data-sortable')
        )
        f.write(styled)

        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
Example #31
    def __init__(self, scale=True, cv=3, ncpus=None):
        """Predict motif activities using lightning CDRegressor 

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled 
            before classification
       
        cv : int, optional, default 3
            Cross-validation k-fold parameter.
        
        ncpus : int, optional
            Number of threads. Default is the number specified in the config.

        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            fitted coefficients

        sig_ : DataFrame, shape (n_motifs,)
            boolean values, if coefficients are higher/lower than
            the 1%t from random permutation
        """

        self.act_description = ("activity values: coefficients from "
                                "fitted model")

        if ncpus is None:
            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
        self.ncpus = ncpus
        self.kfolds = cv
        self.scale = scale

        self.act_ = None
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
        self.ptype = "regression"
Example #32
    def __init__(self, ncpus=None):
        """Predict motif activities using a random forest classifier

        Parameters
        ----------
        ncpus : int, optional
            Number of threads. Default is the number specified in the config.

        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            feature importances from the model

        """
        self.act_ = None
        if ncpus is None:
            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
        self.ncpus = ncpus
        self.act_description = ("activity values: feature importances "
                                "from fitted Random Forest model")
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
        self.ptype = "classification"
Example #33
class MotifProgram(object):
    config = MotifConfig()
    local_bin = None

    def __init__(self):
        pass

    def bin(self):
        if self.local_bin:
            return self.local_bin
        else:
            return self.config.bin(self.name)

    def dir(self):
        return self.config.dir(self.name)

    def is_configured(self):
        return self.config.is_configured(self.name)

    def is_installed(self):
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, savedir, params=None, tmp=None):

        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        self.tmpdir = mkdtemp(prefix="{0}.".format(self.name), dir=tmp)

        try:
            return self._run_program(self.bin(), fastafile, savedir, params)
        except KeyboardInterrupt:
            return ([], "Killed", "Killed")
Example #34
def location(args):
    """
    Creates histrogram of motif location.

    Parameters
    ----------
    args : argparse object
        Command line arguments.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        f = Fasta(fastafile)
        lwidth = len(f.items()[0][1])
        f = None

    jobs = []
    motifs = pwmfile_to_motifs(pwmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")
    
    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000) 
    for motif in motifs:
        if motif.id in ids:
            outfile = os.path.join("%s_histogram" % motif.id)
            jobs.append(
                    pool.apply_async(
                        motif_localization, 
                        (fastafile,motif,lwidth,outfile, args.cutoff)
                        ))
    
    for job in jobs:
        job.get()
Example #35
def visualize_maelstrom(outdir, sig_cutoff=3, pwmfile=None):

    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is None:
            raise ValueError("no pwmfile given and no default database specified")
        pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    
    mapfile = pwmfile.replace(".pwm", ".motif2factors.txt")
    if os.path.exists(mapfile):
    
        m2f = pd.read_csv(mapfile, sep="\t", names=["motif","factors"], index_col=0) 
        m2f["factors"] = m2f["factors"].str[:50]
    else:
        motifs = [m.id for m in read_motifs(open(pwmfile))]
        m2f = pd.DataFrame({"factors": motifs}, index=motifs)

    sig_fname = os.path.join(outdir, "final.out.csv")
    df_sig = pd.read_table(sig_fname, index_col=0)
    f = np.any(df_sig >= sig_cutoff, axis=1)
    vis = df_sig[f]
    if vis.shape[0] == 0:
        sys.stderr.write("No motifs reach the threshold, skipping visualization.\n")
        return
    
    # cluster rows
    row_linkage = hierarchy.linkage(
        distance.pdist(vis, metric="euclidean"), 
        method='complete')
    idx = hierarchy.leaves_list(row_linkage)
    
    vis = safe_join(vis, m2f).set_index("factors")

    # size of figure, computed before the figure is created
    size = [2 + vis.shape[1] * 0.4, 1.8 + vis.shape[0] * 0.3]
    plt.figure(figsize=size)
    
    cg = sns.heatmap(vis.iloc[idx], cmap="viridis", 
                        yticklabels=True, 
                       cbar_kws={"orientation":"horizontal"})
    _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0)
    plt.title("Motif Relevance")
    plt.tight_layout()
    plt.savefig(os.path.join(outdir, "motif.relevance.png"), dpi=300) 
   
    freq_fname = os.path.join(outdir, "motif.freq.txt")
    if os.path.exists(freq_fname):
        df_freq = pd.read_table(freq_fname, index_col=0) 
        df_freq = df_freq.T
        vis_freq = df_freq.loc[vis.iloc[idx].index]
        vis_freq = safe_join(vis_freq, m2f).set_index("factors")
        plt.figure(figsize=size)
        cg = sns.heatmap(vis_freq, cmap="viridis", 
                         yticklabels=True, vmin=0, vmax=0.2,
                           cbar_kws={"orientation":"horizontal"})
        #idx = cg.dendrogram_row.reordered_ind
        _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0)
        plt.title("Motif Frequency")
        plt.tight_layout()
        plt.savefig(os.path.join(outdir, "motif.frequency.png"), dpi=300) 
        
        plt.figure(figsize=size)
        
        min_freq = vis_freq.min(1)
        min_freq[min_freq < 0.01] = 0.01

        cg = sns.heatmap(np.log2(vis_freq.apply(lambda x: x / min_freq, 0)),
                         yticklabels=True, vmin=-5, vmax=5,
                         cbar_kws={"orientation": "horizontal"})
        _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0)
        plt.title("Motif Enrichment")
        plt.tight_layout()
        plt.savefig(os.path.join(outdir, "motif.enrichment.png"), dpi=300) 
Example #36
class GimmeMotifs(object):
    NAME = "gimme_motifs"
    SCAN_THRESHOLD = "0.9"

    def __init__(self, name=None):
        self.config = MotifConfig()
        self.server = None

        if not name:
            name = "%s_%s" % (self.NAME, datetime.today().strftime("%d_%m_%Y"))
        self.name = name

        # create a directory for all the intermediate and output files
        self._setup_output_dir(name)

        # setup logging
        self._setup_logging()
        self.logger.info("%s version %s", self.NAME, GM_VERSION)
        self.logger.info("output dir: %s", self.outdir)

        # setup the names of the intermediate and output files
        self._setup_filenames()

    def job_server(self):
        try:
            self.server.submit(job_server_ok)
        except Exception:
            self.server = self._get_job_server()
        return self.server

    def _setup_output_dir(self, name):

        if os.path.exists(name):
            sys.stderr.write(
                "Output directory {} already exists!\n".format(name))
            sys.stderr.write(
                "Resuming a previous run is not yet implemented. Please specify a different name,\n"
            )
            sys.stderr.write(
                "or delete this directory if you really want to overwrite it\n"
            )
            #sys.exit(1)
        else:
            try:
                os.makedirs(name)
            except OSError:
                sys.stderr.write(
                    "Can't create output directory {}!\n".format(name))
                #sys.exit(1)

        self.outdir = name
        self.tmpdir = os.path.join(self.outdir, "intermediate_results")
        self.imgdir = os.path.join(self.outdir, "images")
        try:
            os.mkdir(self.tmpdir)
            os.mkdir(self.imgdir)
        except OSError:
            pass
        star_img = os.path.join(self.config.get_template_dir(), "star.png")
        shutil.copyfile(star_img, os.path.join(self.imgdir, "star.png"))

    def _setup_logging(self):
        self.logger = logging.getLogger('motif_analysis')
        self.logger.setLevel(logging.DEBUG)
        self.logger.propagate = 0

        # nice format
        file_formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        screen_formatter = logging.Formatter(
            "%(asctime)s - %(levelname)s - %(message)s")

        # Log to file
        logfile = os.path.join(self.name, "%s.log" % self.NAME)
        fh = logging.FileHandler(logfile, "w")
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(file_formatter)
        self.logger.addHandler(fh)

        # Log to screen
        sh = logging.StreamHandler(sys.stdout)
        sh.setLevel(logging.INFO)
        sh.setFormatter(screen_formatter)
        self.logger.addHandler(sh)

        self.logger.debug("Logging started")
        self.logger.info("log: %s", logfile)

    def _setup_filenames(self):
        basename = os.path.split(self.name)[-1]
        self.basename = basename

        self.logger.debug("basename: {}".format(basename))
        # Um yes, there is a smarter way, I'm sure! ;)
        self.input_bed = os.path.join(self.tmpdir,
                                      "%s_peakinputfile.bed" % basename)

        self.prediction_bed = os.path.join(self.tmpdir,
                                           "%s_prediction.bed" % basename)
        self.prediction_fa = os.path.join(self.tmpdir,
                                          "%s_prediction.fa" % basename)
        self.prediction_bg = os.path.join(
            self.tmpdir, "%s_prediction_background.fa" % basename)

        self.validation_bed = os.path.join(self.tmpdir,
                                           "%s_validation.bed" % basename)
        self.validation_fa = os.path.join(self.tmpdir,
                                          "%s_validation.fa" % basename)
        self.validation_gff = os.path.join(self.tmpdir,
                                           "%s_validation.gff" % basename)

        self.predicted_pfm = os.path.join(self.tmpdir,
                                          "%s_all_motifs.pfm" % basename)

        self.significant_pfm = os.path.join(
            self.tmpdir, "%s_significant_motifs.pfm" % basename)

        self.location_fa = os.path.join(self.tmpdir,
                                        "%s_validation_500.fa" % basename)
        self.location_pfile = os.path.join(
            self.tmpdir, "%s_localization_pvalue.txt" % basename)
        self.stats_file = os.path.join(self.tmpdir, "%s_stats.txt" % basename)
        self.ranks_file = os.path.join(self.tmpdir, "%s_ranks.txt" % basename)

        #self.cluster_dir = os.path.join(self.outdir, "cluster_report")
        self.validation_cluster_gff = os.path.join(
            self.tmpdir, "%s_validation_clustered.gff" % basename)
        self.cluster_pwm = os.path.join(self.tmpdir,
                                        "%s_clustered_motifs.pwm" % basename)
        self.final_pwm = os.path.join(self.outdir, "%s_motifs.pwm" % basename)
        self.cluster_report = os.path.join(self.outdir,
                                           "%s_cluster_report.html" % basename)
        self.motif_report = os.path.join(self.outdir,
                                         "%s_motif_report.html" % basename)
        self.text_report = os.path.join(self.outdir,
                                        "%s_motif_report.tsv" % basename)
        self.params_file = os.path.join(self.outdir,
                                        "%s_params.txt" % basename)

        # Data structures to hold the background file locations
        ftypes = {
            "bed": ".bed",
            "fa": ".fa",
            "gff": ".gff",
            "enrichment": "_enrichment.txt",
            "roc": "_significant_motifs_roc_metrics.txt",
            "cluster_gff": "_clustered.gff",
            "cluster_enrichment": "_enrichment_clustered.txt",
            "cluster_roc": "_roc_metrics_clustered.txt"
        }

        self.bg_file = dict([(t, {}) for t in ftypes.keys()])

        for bg in (FA_VALID_BGS + BED_VALID_BGS):
            for ftype, extension in ftypes.items():
                self.bg_file[ftype][bg] = os.path.join(
                    self.tmpdir, "%s_bg_%s%s" % (basename, bg, extension))

    def _is_parallel_enabled(self):
        return True

    def _get_job_server(self):
        return pool

    def _check_input(self, fname):
        """ Check if the inputfile is a valid bed-file """
        if not os.path.exists(fname):
            self.logger.error("Inputfile %s does not exist!", fname)
            sys.exit(1)

        for i, line in enumerate(open(fname)):
            if line.startswith("#") or line.startswith(
                    "track") or line.startswith("browser"):
                # comment or BED specific stuff
                pass
            else:
                vals = line.strip().split("\t")
                if len(vals) < 3:
                    self.logger.error(
                        "Expecting tab-separated values (chromosome<tab>start<tab>end) on line %s of file %s",
                        i + 1, fname)
                    sys.exit(1)
                try:
                    start, end = int(vals[1]), int(vals[2])
                except ValueError:
                    self.logger.error(
                        "No valid integer coordinates on line %s of file %s",
                        i + 1, fname)
                    sys.exit(1)
                if len(vals) > 3:
                    try:
                        float(vals[3])
                    except ValueError:
                        pass
                        #self.logger.warn("No numerical value in column 4 on line %s of file %s, ignoring..." % (i + 1, file))

    def prepare_input_bed(self,
                          inputfile,
                          organism="hg18",
                          width=200,
                          fraction=0.2,
                          abs_max=1000,
                          use_strand=False):
        """ Create all the bed- and fasta-files necessary for motif prediction and validation """
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)
        use_strand = bool(use_strand)

        self.logger.info("preparing input (BED)")

        # Set all peaks to specific width
        self.logger.debug("Creating inputfile %s, width %s", self.input_bed,
                          width)

        write_equalwidth_bedfile(inputfile, width, self.input_bed)

        # Split input_bed in prediction and validation set
        self.logger.debug(
            "Splitting %s into prediction set (%s) and validation set (%s)",
            self.input_bed, self.prediction_bed, self.validation_bed)
        self.prediction_num, self.validation_num = divide_file(
            self.input_bed, self.prediction_bed, self.validation_bed, fraction,
            abs_max)

        # Make fasta files
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.debug("Creating %s", self.prediction_fa)

        genome_index.track2fasta(index_dir,
                                 self.prediction_bed,
                                 self.prediction_fa,
                                 use_strand=use_strand,
                                 ignore_missing=True)
        self.logger.debug("Creating %s", self.validation_fa)
        genome_index.track2fasta(index_dir,
                                 self.validation_bed,
                                 self.validation_fa,
                                 use_strand=use_strand,
                                 ignore_missing=True)

    def prepare_input_fa(self,
                         inputfile,
                         width=200,
                         fraction=0.2,
                         abs_max=1000):
        """ Create all the bed- and fasta-files necessary for motif prediction and validation """
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)

        self.logger.info("preparing input (FASTA)")

        # Split inputfile in prediction and validation set
        self.logger.debug(
            "Splitting %s into prediction set (%s) and validation set (%s)",
            self.inputfile, self.prediction_fa, self.validation_fa)

        self.prediction_num, self.validation_num = divide_fa_file(
            self.inputfile, self.prediction_fa, self.validation_fa, fraction,
            abs_max)

    def _create_background(self,
                           bg_type,
                           bedfile,
                           fafile,
                           outfile,
                           organism="hg18",
                           width=200,
                           nr_times=10):
        fg = Fasta(fafile)
        if bg_type == "random":
            if int(self.markov_model) >= 6:
                self.logger.warning(
                    "Are you sure about the Markov model? It seems too high!")
            else:
                order = {
                    "1": "1st",
                    "2": "2nd",
                    "3": "3rd",
                    "4": "4th",
                    "5": "5th"
                }[str(self.markov_model)]
                self.logger.debug(
                    "Creating random background (%s order Markov)" % order)

            m = MarkovFasta(fg, k=int(self.markov_model), n=nr_times * len(fg))
            m.writefasta(outfile)
            self.logger.debug("Random background: %s", outfile)
            # return the number of random sequences created
            return len(m)
        elif bg_type == "genomic":
            self.logger.debug("Creating genomic background")
            index_dir = os.path.join(self.config.get_index_dir(), organism)
            f = RandomGenomicFasta(index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            return len(f)
        elif bg_type == "gc":
            self.logger.debug("Creating GC matched background")

            f = MatchedGcFasta(fafile, organism, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("GC matched background: %s", outfile)
            return len(f)
        elif bg_type == "promoter":
            gene_file = os.path.join(self.config.get_gene_dir(),
                                     "%s.bed" % organism)
            index_dir = os.path.join(self.config.get_index_dir(), organism)

            self.logger.info(
                "Creating random promoter background (%s, using genes in %s)",
                organism, gene_file)
            f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("Random promoter background: %s", outfile)
            return len(f)
        elif bg_type == "user":
            bg_file = self.params["user_background"]
            if not os.path.exists(bg_file):
                self.logger.error(
                    "User-specified background file %s does not exist!",
                    bg_file)
                sys.exit(1)
            else:
                self.logger.info(
                    "Copying user-specified background file %s to %s.",
                    bg_file, outfile)
                fa = Fasta(bg_file)
                median_len = median([len(seq) for seq in fa.seqs])
                if median_len < width * 0.95 or median_len > width * 1.05:
                    self.logger.warning(
                        "The user-specified background file %s contains sequences with a median length of %s, while GimmeMotifs predicts motifs in sequences of length %s. This will influence the statistics! It is recommended to use background sequences of the same length.",
                        bg_file, median_len, width)
                fa.writefasta(outfile)
                return len(fa)


#    def filter_motifs(self, motif_ids, enrichmentfile, e_cutoff, p_cutoff):
#        filt_motifs = []
#        for line in open(enrichmentfile).readlines():
#            if not line.startswith("#"):
#                vals = line.strip().split("\t")
#                if vals[0] in motif_ids:
#                    p,e = float(vals[2]), float(vals[5])
#                    if p <= p_cutoff and e >= e_cutoff:
#                        filt_motifs.append(vals[0])
#        return filt_motifs

    def calculate_enrichment(self, motif_file, fg, bg):
        """ fg: [sample_fa, sample_gff] bg: [[bg1_fa, bg1_gff, bg1_enrichment], [bg2_fa, bg2_gff, bg2_enrichment], .. etc] """

        self.logger.debug("Scanning background sequences with motifs")

        # define filenames
        fnames = [(fg[0], fg[1])] + [x[:2] for x in bg]
        # scan and save as gff
        for infile, outfile in fnames:
            with open(outfile, "w") as f:
                for line in command_scan(infile,
                                         motif_file,
                                         nreport=1,
                                         cutoff=self.SCAN_THRESHOLD,
                                         bed=False,
                                         scan_rc=True):
                    f.write(line + "\n")

        self.logger.debug("Calculating enrichment")
        enrichment_cmd = gff_enrichment
        num_sample = len(Fasta(fg[0]).items())
        for fasta_file, gff_file, out_file in bg:
            num_bg = len(Fasta(fasta_file).items())
            enrichment_cmd(fg[1], gff_file, num_sample, num_bg, out_file)
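
#    Hedged sketch of the enrichment step in isolation, assuming the
#    gff_enrichment signature used above (fg_gff, bg_gff, num_fg, num_bg,
#    outfile); file names are hypothetical.
#
#        num_fg = len(Fasta("sample.fa").items())
#        num_bg = len(Fasta("bg.fa").items())
#        gff_enrichment("sample.gff", "bg.gff", num_fg, num_bg,
#                       "enrichment.txt")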

    def create_background(self, background=None, organism="hg18", width=200):
        if background is None:
            background = ["random"]

        nr_sequences = {}

        # Create background for motif prediction
        if "gc" in background:
            self._create_background("gc",
                                    self.validation_bed,
                                    self.validation_fa,
                                    self.prediction_bg,
                                    organism=organism,
                                    width=width)
        else:
            self._create_background(background[0],
                                    self.validation_bed,
                                    self.validation_fa,
                                    self.prediction_bg,
                                    organism=organism,
                                    width=width)

        # Get background fasta files
        for bg in background:
            nr_sequences[bg] = self._create_background(bg,
                                                       self.validation_bed,
                                                       self.validation_fa,
                                                       self.bg_file["fa"][bg],
                                                       organism=organism,
                                                       width=width)

    def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
        self.logger.info("clustering significant motifs.")

        trim_ic = 0.2
        clusters = []
        motifs = read_motifs(open(pfm_file), fmt="pwm")
        if len(motifs) == 1:
            clusters = [[motifs[0], motifs]]
        else:
            tree = cluster_motifs(pfm_file,
                                  "total",
                                  "wic",
                                  "mean",
                                  True,
                                  threshold=float(threshold),
                                  include_bg=True,
                                  progress=False)
            clusters = tree.getResult()

        ids = []
        mc = MotifComparer()

        for cluster, members in clusters:
            cluster.trim(trim_ic)
            cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id),
                           format="PNG")
            ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])
            if len(members) > 1:
                scores = {}
                for motif in members:
                    scores[motif] = mc.compare_motifs(cluster,
                                                      motif,
                                                      "total",
                                                      "wic",
                                                      "mean",
                                                      pval=True)
                add_pos = sorted(scores.values(),
                                 cmp=lambda x, y: cmp(x[1], y[1]))[0][1]
                for motif in members:
                    score, pos, strand = scores[motif]
                    add = pos - add_pos

                    if strand in [1, "+"]:
                        pass
                    else:
                        #print "RC %s" % motif.id
                        rc = motif.rc()
                        rc.id = motif.id
                        motif = rc
                    #print "%s\t%s" % (motif.id, add)
                    motif.to_img(os.path.join(
                        self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                                 format="PNG",
                                 add_left=add)
            ids[-1][2] = [
                dict([("src", "images/%s.png" % motif.id.replace(" ", "_")),
                      ("alt", motif.id.replace(" ", "_"))])
                for motif in members
            ]

        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("cluster_template.jinja.html")
        result = template.render(expname=self.basename,
                                 motifs=ids,
                                 inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"),
                                 version=GM_VERSION)

        f = open(self.cluster_report, "w")
        f.write(result.encode('utf-8'))
        f.close()

        f = open(cluster_pwm, "w")
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
        f.close()

        self.logger.debug("Clustering done. See the result in %s",
                          self.cluster_report)
        return clusters

    def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
        motifs = dict([(m.id, m)
                       for m in read_motifs(open(pwm_file), fmt="pwm")])

        jobs = {}
        for id, m in motifs.items():
            jobs[id] = self.job_server().apply_async(get_roc_values, (
                motifs[id],
                fg_fasta,
                bg_fasta,
            ))

        roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")

        for id in motifs.keys():
            error, x, y = jobs[id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)

            roc_plot(roc_img_file % (id, name), x, y)
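
#    Illustrative sketch only: get_roc_values is assumed to return an error
#    flag plus the x/y coordinates (FPR/TPR) plotted above. Equivalent curves
#    can be computed from raw motif scores with scikit-learn; fg_scores and
#    bg_scores are hypothetical arrays of per-sequence motif scores.
#
#        import numpy as np
#        from sklearn.metrics import roc_curve
#
#        y_true = np.array([1] * len(fg_scores) + [0] * len(bg_scores))
#        y_score = np.concatenate([fg_scores, bg_scores])
#        fpr, tpr, _ = roc_curve(y_true, y_score)
#        roc_plot("motif_roc.png", fpr, tpr)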

    def calculate_cluster_enrichment(self, pwm, background):
        fg = [self.validation_fa, self.validation_cluster_gff]
        bg = [[
            self.bg_file["fa"][bg_id], self.bg_file["gff"][bg_id],
            self.bg_file["cluster_enrichment"][bg_id]
        ] for bg_id in background]
        self.calculate_enrichment(pwm, fg, bg)

    def _roc_metrics(self, pwm, sample_fa, bg_fa, roc_file):
        motifs = dict([(m.id, m) for m in read_motifs(open(pwm), fmt="pwm")])

        jobs = {}
        for id, m in motifs.items():
            jobs[id] = self.job_server().apply_async(get_scores, (
                motifs[id],
                sample_fa,
                bg_fa,
            ))

        all_auc = {}
        all_mncp = {}
        f = open(roc_file, "w")
        f.write("Motif\tROC AUC\tMNCP\tMax f-measure\tSens @ max f-measure\n")
        for id in motifs.keys():
            error, auc, mncp, max_f, y = jobs[id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)
            f.write("%s\t%s\t%s\t%s\t%s\n" % (id, auc, mncp, max_f, y))
            all_auc[id] = auc
            all_mncp[id] = mncp

        f.close()

        return all_auc, all_mncp

    def _calc_report_values(self, pwm, background):
        self.logger.debug("Calculating final statistics for report")
        self.p = dict([(b, {}) for b in background])
        self.e = dict([(b, {}) for b in background])

        e_files = dict([(bg, self.bg_file["cluster_enrichment"][bg])
                        for bg in background])

        for bg in self.p.keys():
            for line in open(e_files[bg]).readlines():
                if not (line.startswith("#") or line.startswith("Motif\tSig")):
                    vals = line.strip().split("\t")
                    self.p[bg][vals[0]] = float(vals[2])
                    self.e[bg][vals[0]] = float(vals[5])

        self.auc = dict([(b, {}) for b in background])
        self.mncp = dict([(b, {}) for b in background])

        rocs = dict([(bg, [self.bg_file["fa"][bg], self.bg_file["roc"][bg]])
                     for bg in background])

        for bg in self.auc.keys():
            bg_fasta_file, roc_file = rocs[bg]
            self.auc[bg], self.mncp[bg] = self._roc_metrics(
                pwm, self.validation_fa, bg_fasta_file, roc_file)

        motifs = read_motifs(open(pwm), fmt="pwm")
        self.closest_match = self.determine_closest_match(motifs)

    def _create_text_report(self, pwm, background):
        self.logger.debug("Creating text report")
        motifs = read_motifs(open(pwm), fmt="pwm")

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        f = open(self.text_report, "w")
        header = "ID\tconsensus\tBest match db\tp-value best match\t" + "\t".join(
            "Enrichment (%s)\tp-value (%s)\tROC AUC (%s)\tMNCP (%s)" %
            (b, b, b, b) for b in background)
        #print header
        f.write("%s\n" % header)
        for motif in sorted(motifs,
                            cmp=lambda x, y: cmp(self.mncp[sort_key][y.id],
                                                 self.mncp[sort_key][x.id])):
            vals = [
                motif.id,
                motif.to_consensus(), self.closest_match[motif.id][0].id,
                self.closest_match[motif.id][1]
            ]
            for bg in background:
                vals += [
                    self.e[bg][motif.id], self.p[bg][motif.id],
                    self.auc[bg][motif.id], self.mncp[bg][motif.id]
                ]
            f.write("%s\n" % "\t".join([str(x) for x in vals]))
            #print "%s\n" % "\t".join([str(x) for x in vals])
        f.close()

    def print_params(self):
        f = open(self.params_file, "w")
        for param, value in self.params.items():
            f.write("%s\t%s\n" % (param, value))
        f.close()

    def _create_report(self, pwm, background, stats=None, best_id=None):
        if stats is None:
            stats = {}
        if best_id is None:
            best_id = {}

        self.logger.debug("Creating graphical report")

        class ReportMotif:
            pass

        motifs = read_motifs(open(pwm), fmt="pwm")
        for m, match in self.closest_match.items():
            match[0].to_img(os.path.join(self.imgdir, "%s.png" % match[0].id),
                            format="PNG")

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        roc_img_file = "%s_%s_roc"
        report_motifs = []
        sorted_motifs = sorted(motifs,
                               cmp=lambda x, y: cmp(self.mncp[sort_key][y.id],
                                                    self.mncp[sort_key][x.id]))

        for motif in sorted_motifs:
            rm = ReportMotif()
            rm.id = motif.id
            rm.id_href = {"href": "#%s" % motif.id}
            rm.id_name = {"name": motif.id}
            rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}

            rm.best = best_id[motif.id]

            rm.consensus = motif.to_consensus()
            rm.stars = stats["%s_%s" %
                             (motif.id, motif.to_consensus())]["stars"]

            rm.bg = {}
            for bg in background:
                rm.bg[bg] = {}
                rm.bg[bg]["e"] = "%0.2f" % self.e[bg].setdefault(motif.id, 0.0)
                rm.bg[bg]["p"] = "%0.2f" % self.p[bg].setdefault(motif.id, 1.0)
                rm.bg[bg]["auc"] = "%0.3f" % self.auc[bg][motif.id]
                rm.bg[bg]["mncp"] = "%0.3f" % self.mncp[bg][motif.id]
                rm.bg[bg]["roc_img"] = {
                    "src":
                    "images/" + os.path.basename(roc_img_file %
                                                 (motif.id, bg)) + ".png"
                }
                rm.bg[bg]["roc_img_link"] = {
                    "href":
                    "images/" + os.path.basename(roc_img_file %
                                                 (motif.id, bg)) + ".png"
                }

            rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
            rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}
            rm.match_img = {
                "src": "images/%s.png" % self.closest_match[motif.id][0].id
            }
            rm.match_id = self.closest_match[motif.id][0].id
            rm.match_pval = "%0.2e" % self.closest_match[motif.id][1]

            report_motifs.append(rm)

        total_report = self.motif_report

        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("report_template.jinja.html")
        result = template.render(expname=self.basename,
                                 motifs=report_motifs,
                                 inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"),
                                 version=GM_VERSION)

        f = open(total_report, "w")
        f.write(result.encode('utf-8'))
        f.close()

    def determine_closest_match(self, motifs):
        self.logger.debug("Determining closest matching motifs in database")
        motif_db = self.config.get_default_params()["motif_db"]
        db = os.path.join(self.config.get_motif_dir(), motif_db)
        db_motifs = []
        if db.endswith("pwm") or db.endswith("pfm"):
            db_motifs = read_motifs(open(db), fmt="pwm")
        elif db.endswith("transfac"):
            db_motifs = read_motifs(db, fmt="transfac")

        closest_match = {}
        mc = MotifComparer()
        db_motif_lookup = dict([(m.id, m) for m in db_motifs])
        match = mc.get_closest_match(motifs,
                                     db_motifs,
                                     "partial",
                                     "wic",
                                     "mean",
                                     parallel=False)
        for motif in motifs:
            # Calculate p-value
            pval, pos, orient = mc.compare_motifs(
                motif,
                db_motif_lookup[match[motif.id][0]],
                "partial",
                "wic",
                "mean",
                pval=True)
            closest_match[motif.id] = [
                db_motif_lookup[match[motif.id][0]], pval
            ]
        return closest_match
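
#    Hedged usage sketch, following the MotifComparer calls above (variable
#    names as in the method body):
#
#        mc = MotifComparer()
#        match = mc.get_closest_match(motifs, db_motifs, "partial", "wic",
#                                     "mean", parallel=False)
#        best_db_id = match[motifs[0].id][0]
#        pval, pos, orient = mc.compare_motifs(
#            motifs[0], db_motif_lookup[best_db_id],
#            "partial", "wic", "mean", pval=True)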

    def _determine_best_motif_in_cluster(self,
                                         clusters,
                                         pwm,
                                         sample_fa,
                                         bg_fa,
                                         imgdir=None):
        num_cluster = {}
        best_id = {}
        out = open(pwm, "w")
        for i, (clus, singles) in enumerate(clusters):
            best_motif = clus
            if len(singles) > 1:
                motifs = [clus] + singles
                tmp = NamedTemporaryFile(dir=mytmpdir())
                tmp2 = NamedTemporaryFile(dir=mytmpdir())
                for m in motifs:
                    tmp.write("%s\n" % m.to_pwm())
                tmp.flush()
                auc, mncp = self._roc_metrics(tmp.name, sample_fa, bg_fa,
                                              tmp2.name)
                bla = sorted(motifs,
                             cmp=lambda x, y: cmp(mncp[x.id], mncp[y.id]))
                for m in bla:
                    self.logger.debug("sorted: %s %s %s", str(m), mncp[m.id],
                                      auc[m.id])

                self.logger.debug("end list")

                best_motif = sorted(
                    motifs, cmp=lambda x, y: cmp(mncp[x.id], mncp[y.id]))[-1]
                tmp.close()
                tmp2.close()
            old_id = best_motif.id
            best_motif.id = "GimmeMotifs_%d" % (i + 1)
            best_id[best_motif.id] = old_id.split("_")[0]
            num_cluster["%s_%s" % (best_motif.id,
                                   best_motif.to_consensus())] = len(singles)
            if imgdir:
                best_motif.to_img(os.path.join(imgdir, best_motif.id),
                                  format="PNG")
            out.write("%s\n" % best_motif.to_pwm())
        out.close()
        return num_cluster, best_id

    def run_full_analysis(self, inputfile, user_params=None):
        """ Full analysis: from bed-file to motifs (including clustering, ROC-curves, location plots and html report) """
        self.logger.info("starting full motif analysis")
        self.logger.debug("Using temporary directory {0}".format(mytmpdir()))

        if user_params is None:
            user_params = {}
        params = self.config.get_default_params()
        params.update(user_params)

        if params["torque"]:
            from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
            self.logger.debug("Using torque")
        else:
            from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
            self.logger.debug("Using multiprocessing")

        self.params = params
        #self.weird = params["weird_option"]

        background = [x.strip() for x in params["background"].split(",")]

        self.logger.debug("Parameters:")
        for param, value in params.items():
            self.logger.debug("  %s: %s", param, value)

        # Checking input
        self.input_type = "BED"
        # If it can be loaded as FASTA, assume the input is a FASTA file
        try:
            Fasta(inputfile)
            self.logger.debug("Inputfile is a FASTA file")
            self.input_type = "FASTA"
        except Exception:
            # Leave it to BED
            pass

        index_msg = ("No index found for genome {}! "
                     "Has GimmeMotifs been configured correctly and is the "
                     "genome indexed?").format(params["genome"])
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])

        if self.input_type == "FASTA":
            for bg in background:
                if not bg in FA_VALID_BGS:
                    self.logger.info(
                        "Input type is FASTA, can't use background type '%s'",
                        bg)
                if bg == "genomic":
                    if not os.path.exists(index_dir):
                        self.logger.error(index_msg)
                        sys.exit(1)
            background = [bg for bg in background if bg in FA_VALID_BGS]

        elif self.input_type == "BED":
            # Does the index_dir exist?  #bed-specific
            if not os.path.exists(index_dir):
                self.logger.error(index_msg)
                sys.exit(1)

            # is it a valid bed-file etc.
            self._check_input(inputfile)  # bed-specific

            # Check for valid background
            for bg in background:
                if not bg in BED_VALID_BGS:
                    self.logger.info(
                        "Input type is BED, can't use background type '%s'",
                        bg)
            background = [bg for bg in background if bg in BED_VALID_BGS]

        if len(background) == 0:
            self.logger.error("No valid backgrounds specified!")
            sys.exit(1)

        self.max_time = None
        max_time = None
        # Maximum time?
        if params["max_time"]:
            try:
                max_time = float(params["max_time"])
            except Exception:
                self.logger.debug(
                    "Could not parse max_time value, setting to no limit")
                self.max_time = None
            else:
                if max_time > 0:
                    self.logger.debug(
                        "Time limit for motif prediction: %0.2f hours" %
                        max_time)
                    max_time = 3600 * max_time
                    self.max_time = max_time
                    self.logger.debug(
                        "Max_time in seconds %0.0f" % self.max_time)
                else:
                    self.logger.debug(
                        "Invalid time limit for motif prediction, "
                        "setting to no limit")
                    self.max_time = None
        else:
            self.logger.debug("No time limit for motif prediction")

        if "random" in background:
            self.markov_model = params["markov_model"]

        # Create the necessary files for motif prediction and validation
        if self.input_type == "BED":
            self.prepare_input_bed(inputfile, params["genome"],
                                   params["width"], params["fraction"],
                                   params["abs_max"], params["use_strand"])

            # Create file for location plots
            index_dir = os.path.join(self.config.get_index_dir(),
                                     params["genome"])
            lwidth = int(params["lwidth"])
            width = int(params["width"])
            extend = (lwidth - width) / 2
            genome_index.track2fasta(index_dir,
                                     self.validation_bed,
                                     self.location_fa,
                                     extend_up=extend,
                                     extend_down=extend,
                                     use_strand=params["use_strand"],
                                     ignore_missing=True)

        elif self.input_type == "FASTA":
            self.prepare_input_fa(inputfile, params["width"],
                                  params["fraction"], params["abs_max"])

            # File for location plots
            self.location_fa = self.validation_fa
            fa = Fasta(self.location_fa)
            seqs = fa.seqs
            lwidth = len(seqs[0])
            all_same_width = all(len(seq) == lwidth for seq in seqs)
            if not all_same_width:
                self.logger.warn(
                    "PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!"
                )

        else:
            self.logger.error("Unknown input type, shouldn't happen")
            sys.exit(1)

        tools = dict([(x.strip(), x
                       in [y.strip() for y in params["tools"].split(",")])
                      for x in params["available_tools"].split(",")])

        self.create_background(background, params["genome"], params["width"])

        # Predict the motifs (input is a FASTA file)
        analysis = params["analysis"]
        self.logger.info("starting motif prediction (%s)", analysis)
        self.logger.info("tools: %s",
                         ", ".join([x for x in tools.keys() if tools[x]]))

        bg_file = self.bg_file["fa"][sorted(
            background, lambda x, y: cmp(BG_RANK[x], BG_RANK[y]))[0]]
        self.logger.debug("Using bg_file %s for significance" % bg_file)
        result = pp_predict_motifs(self.prediction_fa,
                                   self.predicted_pfm,
                                   analysis,
                                   params["genome"],
                                   params["use_strand"],
                                   self.prediction_bg,
                                   tools,
                                   self.job_server(),
                                   logger=self.logger,
                                   max_time=self.max_time,
                                   fg_file=self.validation_fa,
                                   bg_file=bg_file)

        motifs = result.motifs
        self.logger.info("predicted %s motifs", len(motifs))
        self.logger.debug("written to %s", self.predicted_pfm)

        if len(motifs) == 0:
            self.logger.info("no motifs found")
            sys.exit()

        # Write stats output to file
        f = open(self.stats_file, "w")
        stat_keys = result.stats.values()[0].keys()
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))

        self.logger.debug(result.stats)

        for motif in motifs[:]:  # iterate over a copy; items may be removed
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write(
                    "%s\t%s\n" %
                    (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
            else:
                self.logger.error(
                    "No stats for motif {0}, skipping this motif!".format(
                        motif.id))
                motifs.remove(motif)
        f.close()

        self.motifs_with_stats = motifs

        f = open(self.ranks_file, "w")
        tools = dict((m.id.split("_")[0], 1) for m in motifs).keys()
        f.write("Metric\tType\t%s\n" % ("\t".join(tools)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" %
                                   (motif.id, motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = best_motif.keys()
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tools]

            f.write("%s\t%s\t%s\n" %
                    (stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" %
                    (stat, "rank", "\t".join([str(rank[i]) for i in ind])))
        f.close()

        #self.logger.debug("RANK: %s" % stat)
        #self.logger.debug("\t".join([str(x) for x in names]))
        #self.logger.debug("\t".join([str(x) for x in vals]))
        #self.logger.debug("\t".join([str(x) for x in rank]))

        # Determine significant motifs
        nsig = 0
        f = open(self.significant_pfm, "w")
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55 and stats[
                    'enr_fdr'] >= 2:
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
        f.close()
        self.logger.info("%s motifs are significant", nsig)
        self.logger.debug("written to %s", self.significant_pfm)

        if nsig == 0:
            self.logger.info("no significant motifs found")
            return

        # ROC metrics of significant motifs
        for bg in background:
            self._roc_metrics(self.significant_pfm, self.validation_fa,
                              self.bg_file["fa"][bg], self.bg_file["roc"][bg])

        # Cluster significant motifs
        clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm,
                                        self.outdir,
                                        params["cluster_threshold"])

        # Determine best motif in cluster

        num_cluster, best_id = self._determine_best_motif_in_cluster(
            clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

        ### Enable parallel and modular evaluation of results
        # Scan (multiple) files with motifs
        # Define callback functions once scanning is finished:
        #    - ROC plot
        #     - Statistics
        #    - Location plots (histogram)
        #     -

        # Stars
        tmp = NamedTemporaryFile(dir=mytmpdir()).name
        p = PredictionResult(tmp,
                             logger=self.logger,
                             job_server=self.server,
                             fg_file=self.validation_fa,
                             bg_file=bg_file,
                             do_counter=False)
        p.add_motifs(
            ("clustering", (read_motifs(open(self.final_pwm)), "", "")))
        while len(p.stats.keys()) < len(p.motifs):
            sleep(5)

        #print "p.stats"
        #print p.stats
        #print "num_cluster"
        #print num_cluster
        for mid, num in num_cluster.items():
            p.stats[mid]["numcluster"] = num

        all_stats = {
            "mncp": [2, 5, 8],
            "roc_auc": [0.6, 0.75, 0.9],
            "maxenr": [10, 20, 30],
            "enr_fdr": [4, 8, 12],
            "fraction": [0.4, 0.6, 0.8],
            "ks_sig": [4, 7, 10],
            "numcluster": [3, 6, 9],
        }

        self.logger.info("creating report")

        # ROC plots
        for bg in background:
            self.create_roc_plots(self.final_pwm, self.validation_fa,
                                  self.bg_file["fa"][bg], bg)

        # Location plots
        self.logger.debug("Creating localization plots")
        motifs = read_motifs(open(self.final_pwm), fmt="pwm")
        for motif in motifs:
            m = "%s_%s" % (motif.id, motif.to_consensus())
            s = p.stats[m]
            outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
            motif_localization(self.location_fa,
                               motif,
                               lwidth,
                               outfile,
                               cutoff=s["cutoff_fdr"])

            s["stars"] = int(
                mean([star(s[x], all_stats[x])
                      for x in all_stats.keys()]) + 0.5)
            self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

        # Calculate enrichment of final, clustered motifs
        self.calculate_cluster_enrichment(self.final_pwm, background)

        # Create report
        self.print_params()
        self._calc_report_values(self.final_pwm, background)
        self._create_report(self.final_pwm,
                            background,
                            stats=p.stats,
                            best_id=best_id)
        self._create_text_report(self.final_pwm, background)

        self.logger.info("finished")
        self.logger.info("output dir: %s", os.path.split(self.motif_report)[0])
        self.logger.info("report: %s", os.path.split(self.motif_report)[-1])
        #self.logger.info("Open %s in your browser to see your results." % (self.motif_report))

        if not (params["keep_intermediate"]):

            self.logger.debug(
                "Deleting intermediate files. Please specifify the -k option if you want to keep these files."
            )
            shutil.rmtree(self.tmpdir)

        self.logger.debug("Done")

        return self.motif_report
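
# A minimal usage sketch of the full pipeline. The GimmeMotifs class name and
# constructor argument are assumptions (only the run_full_analysis method is
# shown above); file names and parameters are hypothetical.
#
#     from gimmemotifs.core import GimmeMotifs  # assumed import path
#
#     gm = GimmeMotifs("my_experiment")         # hypothetical constructor
#     report = gm.run_full_analysis(
#         "peaks.bed",
#         user_params={"genome": "hg18", "background": "gc,random"})
#     print(report)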
Exemplo n.º 37
0
def pp_predict_motifs(fastafile,
                      outfile,
                      analysis="small",
                      organism="hg18",
                      single=False,
                      background="",
                      tools=None,
                      job_server=None,
                      ncpus=8,
                      max_time=None,
                      stats_fg=None,
                      stats_bg=None):
    """Parallel prediction of motifs.

    Utility function for gimmemotifs.denovo.gimme_motifs. It is usually
    better to use that function instead of calling this one directly.
    """
    if tools is None:
        tools = {}

    config = MotifConfig()

    if not tools:
        tools = dict([(x, 1)
                      for x in config.get_default_params()["tools"].split(",")])

    #logger = logging.getLogger('gimme.prediction.pp_predict_motifs')

    wmin = 5
    step = 1
    if analysis in ["large", "xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs": 5, "small": 8, "medium": 10, "large": 14, "xl": 20}
    wmax = analysis_max[analysis]

    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    if not job_server:
        n_cpus = int(config.get_default_params()["ncpus"])
        job_server = Pool(processes=n_cpus, maxtasksperchild=1000)

    jobs = {}

    result = PredictionResult(
        outfile,
        fg_file=stats_fg,
        background=stats_bg,
        job_server=job_server,
    )

    # Dynamically load all tools
    toolio = [
        x[1]() for x in inspect.getmembers(
            tool_classes, lambda x: inspect.isclass(x) and issubclass(
                x, tool_classes.MotifProgram)) if x[0] != 'MotifProgram'
    ]

    # TODO:
    # Add warnings for running time: Weeder, GADEM

    ### Add all jobs to the job_server ###
    params = {
        'analysis': analysis,
        'background': background,
        "single": single,
        "organism": organism
    }

    # Tools that don't use a specified width usually take longer
    # ie. GADEM, XXmotif, MEME
    # Start these first.
    for t in [tool for tool in toolio if not tool.use_width]:
        if t.name in tools and tools[t.name]:
            logger.debug("Starting %s job", t.name)
            job_name = t.name
            jobs[job_name] = job_server.apply_async(
                _run_tool, (job_name, t, fastafile, params),
                callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    for t in [tool for tool in toolio if tool.use_width]:
        if t.name in tools and tools[t.name]:
            for i in range(wmin, wmax + 1, step):
                logger.debug("Starting %s job, width %s", t.name, i)
                job_name = "%s_width_%s" % (t.name, i)
                my_params = params.copy()
                my_params['width'] = i
                jobs[job_name] = job_server.apply_async(
                    _run_tool, (job_name, t, fastafile, my_params),
                    callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    logger.info("all jobs submitted")
    for job in jobs.values():
        job.get()

    result.wait_for_stats()
    ### Wait until all jobs are finished or the time runs out ###
    #    start_time = time()
    #    try:
    #        # Run until all jobs are finished
    #        while len(result.finished) < len(jobs.keys()) and (not(max_time) or time() - start_time < max_time):
    #            pass
    #        if len(result.finished) < len(jobs.keys()):
    #            logger.info("Maximum allowed running time reached, destroying remaining jobs")
    #            job_server.terminate()
    #            result.submit_remaining_stats()
    #    ### Or the user gets impatient... ###
    #    except KeyboardInterrupt:
    #        # Destroy all running jobs
    #        logger.info("Caught interrupt, destroying all running jobs")
    #        job_server.terminate()
    #        result.submit_remaining_stats()
    #
    #
    #    if stats_fg and stats_bg:
    #        logger.info("waiting for motif statistics")
    #        n = 0
    #        last_len = 0
    #
    #
    #        while len(set(result.stats.keys())) < len(set([str(m) for m in result.motifs])):
    #            if n >= 30:
    #                logger.debug("waited long enough")
    #                logger.debug("motifs: %s, stats: %s", len(result.motifs), len(result.stats.keys()))
    #                for i,motif in enumerate(result.motifs):
    #                    if "{}_{}".format(motif.id, motif.to_consensus()) not in result.stats:
    #                        logger.debug("deleting %s", motif)
    #                        del result.motifs[i]
    #                break
    #            sleep(2)
    #            if len(result.stats.keys()) == last_len:
    #                n += 1
    #            else:
    #                last_len = len(result.stats.keys())
    #                n = 0
    #
    return result
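
# Hedged invocation sketch for pp_predict_motifs, following the signature
# above; file names are hypothetical and the tool names in the dictionary
# depend on which tools are configured locally.
#
#     result = pp_predict_motifs(
#         "input.fa",
#         "predicted.pfm",
#         analysis="small",
#         organism="hg18",
#         background="bg.fa",
#         tools={"MEME": 1},          # hypothetical tool selection
#         stats_fg="validation.fa",
#         stats_bg="bg.fa")
#     for motif in result.motifs:
#         print(motif.id)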
Exemplo n.º 38
0
def pp_predict_motifs(fastafile, outfile, analysis="small", organism="hg18", single=False, background="", tools=None, job_server=None, ncpus=8, max_time=-1, stats_fg=None, stats_bg=None):
    """Parallel prediction of motifs.

    Utility function for gimmemotifs.denovo.gimme_motifs. It is usually
    better to use that function instead of calling this one directly.
    """
    if tools is None:
        tools = {}

    config = MotifConfig()

    if not tools:
        tools = dict([(x,1) for x in config.get_default_params()["tools"].split(",")])
    
    #logger = logging.getLogger('gimme.prediction.pp_predict_motifs')

    wmin = 5 
    step = 1
    if analysis in ["large","xl"]:
        step = 2
        wmin = 6
    
    analysis_max = {"xs":5,"small":8, "medium":10,"large":14, "xl":20}
    wmax = analysis_max[analysis]

    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    
    if not job_server:
        n_cpus = int(config.get_default_params()["ncpus"])
        job_server = Pool(processes=n_cpus, maxtasksperchild=1000) 
    
    jobs = {}
    
    result = PredictionResult(
                outfile, 
                fg_file=stats_fg, 
                background=stats_bg,
                job_server=job_server,
                )
    
    # Dynamically load all tools
    toolio = [x[1]() for x in inspect.getmembers(
                                                tool_classes, 
                                                lambda x: 
                                                        inspect.isclass(x) and 
                                                        issubclass(x, tool_classes.MotifProgram)
                                                ) if x[0] != 'MotifProgram']
    
    # TODO:
    # Add warnings for running time: Weeder, GADEM
        
    ### Add all jobs to the job_server ###
    params = {
            'analysis': analysis, 
            'background':background, 
            "single":single, 
            "organism":organism
            }
    
    # Tools that don't use a specified width usually take longer
    # ie. GADEM, XXmotif, MEME
    # Start these first.
    for t in [tool for tool in toolio if not tool.use_width]:
        if t.name in tools and tools[t.name]:
            logger.debug("Starting %s job", t.name)
            job_name = t.name
            jobs[job_name] = job_server.apply_async(
                        _run_tool,
                        (job_name, t, fastafile, params), 
                        callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    for t in [tool for tool in toolio if tool.use_width]:
        if t.name in tools and tools[t.name]:
            for i in range(wmin, wmax + 1, step):
                logger.debug("Starting %s job, width %s", t.name, i)
                job_name = "%s_width_%s" % (t.name, i)
                my_params = params.copy()
                my_params['width'] = i
                jobs[job_name] = job_server.apply_async(
                    _run_tool,
                    (job_name, t, fastafile, my_params), 
                    callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)
    
    logger.info("all jobs submitted")
    for job in jobs.values():
        job.get()

    result.wait_for_stats()
    ### Wait until all jobs are finished or the time runs out ###
#    start_time = time()    
#    try:
#        # Run until all jobs are finished
#        while len(result.finished) < len(jobs.keys()) and (not(max_time) or time() - start_time < max_time):
#            pass
#        if len(result.finished) < len(jobs.keys()):
#            logger.info("Maximum allowed running time reached, destroying remaining jobs")
#            job_server.terminate()
#            result.submit_remaining_stats()
#    ### Or the user gets impatient... ###
#    except KeyboardInterrupt:
#        # Destroy all running jobs
#        logger.info("Caught interrupt, destroying all running jobs")
#        job_server.terminate()
#        result.submit_remaining_stats()
#        
#    
#    if stats_fg and stats_bg:
#        logger.info("waiting for motif statistics")
#        n = 0
#        last_len = 0 
#       
#    
#        while len(set(result.stats.keys())) < len(set([str(m) for m in result.motifs])):
#            if n >= 30:
#                logger.debug("waited long enough")
#                logger.debug("motifs: %s, stats: %s", len(result.motifs), len(result.stats.keys()))
#                for i,motif in enumerate(result.motifs):
#                    if "{}_{}".format(motif.id, motif.to_consensus()) not in result.stats:
#                        logger.debug("deleting %s", motif)
#                        del result.motifs[i]
#                break
#            sleep(2)
#            if len(result.stats.keys()) == last_len:
#                n += 1
#            else:
#                last_len = len(result.stats.keys())
#                n = 0
#    
    return result
Exemplo n.º 39
0
def _create_graphical_report(inputfile, pwm, background, closest_match, outdir, stats, best_id=None):
    """Create main gimme_motifs output html report."""
    if best_id is None:
        best_id = {}

    logger.debug("Creating graphical report")
    
    class ReportMotif(object):
        """Placeholder for motif stats."""
        pass

    config = MotifConfig()
    
    imgdir = os.path.join(outdir, "images")
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)
    
    motifs = read_motifs(pwm, fmt="pwm")
    
    roc_img_file = "%s_roc.%s"

    dbpwm = config.get_default_params()["motif_db"]
    pwmdir = config.get_motif_dir()

    dbmotifs = read_motifs(os.path.join(pwmdir, dbpwm), as_dict=True)
    
    report_motifs = []
    for motif in motifs:
        
        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": "#%s" % motif.id}
        rm.id_name = {"name": motif.id}
        rm.img = {"src":  os.path.join("images", "%s.png" % motif.id)}
        motif.to_img(os.path.join(outdir, "images/{}.png".format(motif.id)), fmt="PNG")
        
        # TODO: fix best ID
        rm.best = "Gimme"#best_id[motif.id]

        rm.consensus = motif.to_consensus()
        rm.stars = int(np.mean(
                [stats[str(motif)][bg].get("stars", 0) for bg in background]
                ) + 0.5)

        rm.bg = {}
        for bg in background:
            rm.bg[bg] = {}
            this_stats = stats.get(str(motif), {}).get(bg, {})
            # TODO: fix these stats
            rm.bg[bg]["e"] = "%0.2f" % this_stats.get("enr_at_fpr", 1.0)
            rm.bg[bg]["p"] = "%0.2f" % this_stats.get("phyper_at_fpr", 1.0)
            rm.bg[bg]["auc"] = "%0.3f" % this_stats.get("roc_auc", 0.5)
            rm.bg[bg]["mncp"] = "%0.3f" % this_stats.get("mncp", 1.0)
            rm.bg[bg]["roc_img"] = {"src": "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"}
            rm.bg[bg][u"roc_img_link"] = {u"href": "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"}

        rm.histogram_img = {"data":"images/%s_histogram.svg" % motif.id}
        rm.histogram_link= {"href":"images/%s_histogram.svg" % motif.id}
        
        match_id = closest_match[motif.id][0]
        dbmotifs[match_id].to_img(os.path.join(outdir, "images/{}.png".format(match_id)), fmt="PNG")
    
        rm.match_img = {"src":  "images/{}.png".format(match_id)}
        rm.match_id = closest_match[motif.id][0]
        rm.match_pval = "%0.2e" % closest_match[motif.id][1][-1]

        report_motifs.append(rm)
    
    total_report = os.path.join(outdir, "motif_report.html")

    star_img = os.path.join(config.get_template_dir(), "star.png")
    shutil.copyfile(star_img, os.path.join(outdir, "images", "star.png"))

    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("report_template.jinja.html")
    # TODO: title
    result = template.render(
                    motifs=report_motifs, 
                    inputfile=inputfile, 
                    date=datetime.today().strftime("%d/%m/%Y"), 
                    version=__version__,
                    bg_types=list(background.keys()))

    with open(total_report, "wb") as f:
        f.write(result.encode('utf-8'))
Exemplo n.º 40
0
class MotifProgram(object):
    """Motif program base class."""

    config = MotifConfig()
    local_bin = None

    def _parse_params(self, params=None, needs_background=False):
        """
        Parse parameters.

        Combine default and user-defined parameters.
        """
        prm = self.default_params.copy()
        if params is not None:
            prm.update(params)

        # Background file is essential!
        if "background" in prm:
            # Absolute path, just to be sure
            prm["background"] = os.path.abspath(prm["background"])
        elif needs_background:
            raise ValueError("Background file needed!")

        return prm

    def _read_and_label_motifs(self, outfile, stdout, stderr, fmt="meme"):
        """Read output motifs and label with program name"""
        if not os.path.exists(outfile):
            stdout += "\nMotif file {0} not found!\n".format(outfile)
            stderr += "\nMotif file {0} not found!\n".format(outfile)
            return [], stdout, stderr

        motifs = read_motifs(outfile, fmt=fmt)
        for m in motifs:
            m.id = "{0}_{1}".format(self.name, m.id)
        return motifs, stdout, stderr

    def bin(self):
        """
        Get the command used to run the tool.

        Returns
        -------
        command : str
            The tool system command.
        """
        if self.local_bin:
            return self.local_bin
        else:
            return self.config.bin(self.name)

    def dir(self):
        """
        Get the installation directory of the tool.

        Returns
        -------
        dir : str
            The tool directory.
        """
        return self.config.dir(self.name)

    def is_configured(self):
        """
        Check if the tool is configured.

        Returns
        -------
        is_configured : bool
            True if the tool is configured.
        """
        return self.config.is_configured(self.name)

    def is_installed(self):
        """
        Check if the tool is installed.

        Returns
        -------
        is_installed : bool
            True if the tool is installed.
        """
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, params=None, tmp=None):
        """
        Run the tool and predict motifs from a FASTA file.

        Parameters
        ----------
        fastafile : str
            Name of the FASTA input file.

        params : dict, optional
            Optional parameters. For some of the tools required parameters
            are passed using this dictionary.

        tmp : str, optional
            Directory to use for creation of temporary files.

        Returns
        -------
        motifs : list of Motif instances
            The predicted motifs.

        stdout : str
            Standard out of the tool.

        stderr : str
            Standard error of the tool.
        """
        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        self.tmpdir = mkdtemp(prefix="{0}.".format(self.name), dir=tmp)
        fastafile = os.path.abspath(fastafile)

        try:
            return self._run_program(self.bin(), fastafile, params)
        except KeyboardInterrupt:
            return ([], "Killed", "Killed")
Exemplo n.º 41
0
	def write_config(self):
		from gimmemotifs.config import MotifConfig
		cfg = MotifConfig(use_config="cfg/gimmemotifs.cfg.example")

		data_dir = os.path.abspath(self.install_data)
		cfg.set_template_dir(os.path.join(data_dir, 'gimmemotifs/templates'))
		cfg.set_gene_dir(os.path.join(data_dir, 'gimmemotifs/genes'))
		cfg.set_score_dir(os.path.join(data_dir, 'gimmemotifs/score_dists'))
		cfg.set_index_dir(os.path.join(data_dir, 'gimmemotifs/genome_index'))
		cfg.set_motif_dir(os.path.join(data_dir, 'gimmemotifs/motif_databases'))
		cfg.set_bg_dir(os.path.join(data_dir, 'gimmemotifs/bg'))
		
		print 
		print "Trying to locate motif programs"
		
		MOTIF_CLASSES = ["MDmodule", "Meme", "Weeder", "Gadem", "MotifSampler", "Trawler", "Improbizer", "MoAn", "BioProspector"]
		available = []
		for program in MOTIF_CLASSES:
			m = eval(program)()
			cmd = m.cmd
			bin = which(cmd)
			if bin:
				print "Found %s in %s" % (m.name, bin)
				available.append(m.name)
				dir = None
				if program == "Weeder":
					dir = bin.replace("weederTFBS.out","")
				elif program == "Meme":
					dir = bin.replace("bin/meme", "")
				elif program == "Trawler":
					dir = bin.replace("bin/trawler.pl", "")
		
				cfg.set_program(m.name, {"bin":bin, "dir":dir})
			else:
				print "Couldn't find %s" % m.name
		
		print
		print "Trying to locate seqlogo"
		bin = which("seqlogo")
		if bin:
			print "Found seqlogo in %s" % (bin)
			cfg.set_seqlogo(bin)
		else:
			print "Couldn't find seqlogo"
		print
		
		DEFAULT_PARAMS["available_tools"] = ",".join(available)
		DEFAULT_PARAMS["tools"] = ",".join(available)
		cfg.set_default_params(DEFAULT_PARAMS)
		
		# Use a user-specific configfile if any other installation scheme is used
		if os.path.abspath(self.install_data) == "/usr/share":
			config_file = "/usr/share/gimmemotifs/%s" % CONFIG_NAME
		else:
			config_file = os.path.expanduser("~/.%s" % CONFIG_NAME)
		
		if os.path.exists(config_file):
			new_config = config_file + ".tmp"
			print "INFO: Configfile %s already exists!\n      Will create %s, which contains the new config.\n      If you want to use the newly generated config you can move %s to %s, otherwise you can delete %s.\n" % (config_file, new_config, new_config, config_file, new_config)

			f =  open(new_config, "wb")
			cfg.write(f)
		else: 
			print "Writing configuration file %s" % config_file
			f =  open(config_file, "wb")
			cfg.write(f)
		
		print "Edit %s to further configure GimmeMotifs." % config_file
Exemplo n.º 42
0
def diff(args):

    infiles = args.inputfiles.split(",")
    bgfile = args.bgfile
    outfile = args.outputfile
    pwmfile = args.pwmfile
    cutoff = args.cutoff
    genome = args.genome
    minenr = float(args.minenr)
    minfreq = float(args.minfreq)

    tmpdir = mkdtemp()
    
    # Retrieve FASTA clusters from BED file
    if len(infiles) == 1 and infiles[0].endswith("bed"):
        if not args.genome:
            sys.stderr.write("Can't convert BED file without genome!\n")
            sys.exit(1)

        clusters = {}
        for line in open(infiles[0]):
            vals = line.strip().split("\t")
            clusters.setdefault(vals[4], []).append(vals[:3])
        
        infiles = []
        
        config = MotifConfig()
        index_dir = config.get_index_dir()

        for cluster,regions in clusters.items():
            sys.stderr.write("Creating FASTA file for {0}\n".format(cluster))
            inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
            outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
            with open(inbed, "w") as f:
                for vals in regions:
                    f.write("{0}\t{1}\t{2}\n".format(*vals))
            track2fasta(os.path.join(index_dir, genome), inbed, outfa)
            infiles.append(outfa)
    
    pwms = dict([(m.id, m) for m in pwmfile_to_motifs(pwmfile)])
    motifs = list(pwms.keys())
    names = [os.path.basename(os.path.splitext(f)[0]) for f in infiles]
    
    # Get background frequencies
    nbg = float(len(Fasta(bgfile).seqs))
    bgcounts = get_counts(bgfile, pwms.values(), cutoff)
    bgfreq = [(bgcounts[m] + 0.01) / nbg for m in motifs]
    
    # Get frequences in input files
    freq = {}
    counts = {}
    for fname in infiles:
        c = get_counts(fname, pwms.values(), cutoff)
        n = float(len(Fasta(fname).seqs))
        freq[fname] = [(c[m] + 0.01) / n for m in motifs]
        counts[fname] = [c[m] for m in motifs]
    
    freq = np.array([freq[fname] for fname in infiles]).transpose()
    counts = np.array([counts[fname] for fname in infiles]).transpose()
    
    #for row in freq:
    #    print freq

    diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile, minenr=minenr, minfreq=minfreq)

    shutil.rmtree(tmpdir)
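
# Hedged sketch of calling diff() directly; it is normally reached via the
# command line, so the Namespace fields are simply inferred from the attribute
# accesses above. File names are hypothetical.
#
#     from argparse import Namespace
#
#     args = Namespace(
#         inputfiles="cluster1.fa,cluster2.fa",
#         bgfile="bg.fa",
#         outputfile="diff.png",
#         pwmfile="motifs.pwm",
#         cutoff=0.9,
#         genome="hg18",
#         minenr="3",
#         minfreq="0.01")
#     diff(args)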
Exemplo n.º 43
0
class MotifComparer(object):
    """Class for motif comparison.
    
    Compare two or more motifs using a variety of metrics. The best metric to
    compare motifs is probably seqcor. The implementation of this metric is
    similar to the one used in Grau (2015), where motifs are scored according
    to the Pearson correlation of the motif scores along a sequence. In this
    case a de Bruijn sequence of k=7 is used.

    Valid metrics are:
    seqcor - Pearson correlation of motif scores along sequence.
    pcc - Pearson correlation coefficient of motif PFMs.
    ed - Euclidean distance-based similarity of motif PFMs.
    distance - Distance-based similarity of motif PFMs.
    wic - Weighted Information Content, see van Heeringen 2011.
    chisq - Chi-squared similarity of motif PFMs.
    akl - Similarity based on average Kullback-Leibler similarity, see Mahony, 2011.
    ssd - Sum of squared distances of motif PFMs.
    
    Examples
    --------
    mc = MotifComparer()
    
    # Compare two motifs
    score, pos, strand = mc.compare_motifs(m1, m2, metric="seqcor")

    # Compare a list of motifs to another list of motifs
    mc.get_all_scores(motifs, dbmotifs, match, metric, combine)

    # Get the best match for every motif in a list of reference motifs
    match = mc.get_closest_match(motifs, dbmotifs)
    """  
    def __init__(self):
        self.config = MotifConfig()
        self.metrics = ["pcc", "ed", "distance", "wic"]
        self.combine = ["mean", "sum"]
        self._load_scores()
        # Create a parallel python job server, to use for fast motif comparison
        

    def _load_scores(self):
        self.scoredist = {}
        for metric in self.metrics:
            self.scoredist[metric] = {"total": {}, "subtotal": {}}
            for match in ["total", "subtotal"]:
                for combine in ["mean"]:
                    self.scoredist[metric]["%s_%s" % (match, combine)] = {}
                    score_file = os.path.join(self.config.get_score_dir(), "%s_%s_%s_score_dist.txt" % (match, metric, combine))
                    if os.path.exists(score_file):
                        with open(score_file) as f:
                            for line in f:
                                l1, l2, m, sd = line.strip().split("\t")[:4]
                                self.scoredist[metric]["%s_%s" % (match, combine)].setdefault(int(l1), {})[int(l2)] = [float(m), float(sd)]
    
    def compare_motifs(self, m1, m2, match="total", metric="wic", combine="mean", pval=False):
        """Compare two motifs.
        
        The similarity metric can be any of seqcor, pcc, ed, distance, wic, 
        chisq, akl or ssd. If match is 'total' the similarity score is 
        calculated for the whole match, including positions that are not 
        present in both motifs. If match is 'partial' or 'subtotal', only the
        matching positions are used to calculate the score. The scores of
        individual positions are combined using either the mean or the sum.

        Note that the match and combine parameters have no effect on the seqcor
        similarity metric.      

        Parameters
        ----------
        m1 : Motif instance
            Motif instance 1.

        m2 : Motif instance
            Motif instance 2.

        match : str, optional
            Match can be "partial", "subtotal" or "total". Not all metrics use 
            this.

        metric : str, optional
            Distance metric.

        combine : str, optional
            Combine positional scores using "mean" or "sum". Not all metrics
            use this.

        pval : bool, optional
            Calculate the p-value of the match.
        
        Returns
        -------
        score, position, strand 
        """
        if metric == "seqcor":
            return seqcor(m1, m2)
        elif match == "partial":
            if pval:
                return self.pvalue(m1, m2, "total", metric, combine, self.max_partial(m1.pwm, m2.pwm, metric, combine))
            elif metric in ["pcc", "ed", "distance", "wic", "chisq", "ssd"]:
                return self.max_partial(m1.pwm, m2.pwm, metric, combine)
            else:
                return self.max_partial(m1.pfm, m2.pfm, metric, combine)

        elif match == "total":
            if pval:
                return self.pvalue(m1, m2, match, metric, combine, self.max_total(m1.pwm, m2.pwm, metric, combine))
            elif metric in ["pcc", 'akl']:
                # Slightly randomize the weight matrix
                return self.max_total(m1.wiggle_pwm(), m2.wiggle_pwm(), metric, combine)
            elif metric in ["ed", "distance", "wic", "chisq", "pcc", "ssd"]:
                return self.max_total(m1.pwm, m2.pwm, metric, combine)
            else:
                return self.max_total(m1.pfm, m2.pfm, metric, combine)
                
        elif match == "subtotal":
            if metric in ["pcc", "ed", "distance", "wic", "chisq", "ssd"]:
                return self.max_subtotal(m1.pwm, m2.pwm, metric, combine)
            else:
                return self.max_subtotal(m1.pfm, m2.pfm, metric, combine)
    

    def _check_length(self, l):
        # Set the length to a length represented in randomly generated JASPAR motifs 
        if l < 4:
            return 4
        if l == 13:
            return 14
        if l == 17:
            return 18
        if l == 19:
            return 20
        if l == 21:
            return 22
        if l > 22:
            return 30    
        return l    
    
    def pvalue(self, m1, m2, match, metric, combine, score):
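        # Look up the empirical mean and sd of the score distribution for
        # motifs of these (bucketed) lengths, then convert the observed
        # score to a one-sided p-value under a normal approximation.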
        l1, l2 = len(m1.pwm), len(m2.pwm)
        
        l1 = self._check_length(l1)    
        l2 = self._check_length(l2)    
        
        m,s = self.scoredist[metric]["%s_%s" % (match, combine)][l1][l2]    
        
        try:
            return [1 - norm.cdf(score[0], m, s), score[1], score[2]]
        except Exception as e:
            print("Error with score: {}\n{}".format(score, e))
            return [1, np.nan, np.nan]

    def score_matrices(self, matrix1, matrix2, metric, combine):
        if metric in self.metrics and combine in self.combine:
            s = score(matrix1, matrix2, metric, combine)
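            # a NaN result (s != s) is reported as None ("no score")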
            
            if s != s:
                return None
            else:
                return s
        
        else:
            if metric == "akl":
                func = akl
            elif metric == "chisq":
                func = chisq
            elif metric == "ssd":
                func = ssd
            else:
                try:
                    func = getattr(distance, metric)
                except AttributeError:
                    raise ValueError("Unknown metric '{}'".format(metric))

            scores = []
            for pos1,pos2 in zip(matrix1,matrix2):
                scores.append(func(pos1, pos2))
            if combine == "mean":
                return np.mean(scores)
            elif combine == "sum":
                return np.sum(scores)
            else:
                raise ValueError("Unknown combine")

    def max_subtotal(self, matrix1, matrix2, metric, combine):
        scores = []
        min_overlap = 4 
        
        if len(matrix1) < min_overlap or len(matrix2) < min_overlap:
            return self.max_total(matrix1, matrix2, metric, combine)
    
        #return c_max_subtotal(matrix1, matrix2, metric, combine)
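        # Slide matrix2 along matrix1, keeping at least min_overlap aligned
        # positions; the second loop below repeats the scan on the reverse
        # complement of matrix2 (strand -1).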

        for i in range(-(len(matrix2) - min_overlap), len(matrix1) - min_overlap + 1):
            p1,p2 = self.make_equal_length_truncate(matrix1, matrix2, i)
            s = self.score_matrices(p1, p2, metric, combine)
            if s is not None:
                scores.append([s, i, 1])
    
        rev_matrix2 = [row[::-1] for row in matrix2[::-1]]
        for i in range(-(len(matrix2) - min_overlap), len(matrix1) - min_overlap + 1):
            p1,p2 = self.make_equal_length_truncate(matrix1, rev_matrix2, i)    
            s = self.score_matrices(p1, p2, metric, combine)
            if s is not None:
                scores.append([s, i, -1])
        
        if not scores:
            return []
        return sorted(scores, key=lambda x: x[0])[-1]
    
    def max_partial(self, matrix1, matrix2, metric, combine):

        scores = []
    
        for i in range(-(len(matrix2) -1), len(matrix1)):
            p1,p2 = self.make_equal_length_truncate_second(matrix1, matrix2, i)    
            s = self.score_matrices(p1, p2, metric, combine)
            if s is not None:
                scores.append([s, i, 1])
    
        rev_matrix2 = [row[::-1] for row in matrix2[::-1]]
        for i in range(-(len(matrix2) -1), len(matrix1)):
            p1,p2 = self.make_equal_length_truncate_second(matrix1, rev_matrix2, i)    
            s = self.score_matrices(p1, p2, metric, combine)
            if s is not None:
                scores.append([s, i, -1])
        
        if not scores:
            return []
        return sorted(scores, key=lambda x: x[0])[-1]

    def max_total(self, matrix1, matrix2, metric, combine):
        scores = []
    
        for i in range(-(len(matrix2) -1), len(matrix1)):
            p1,p2 = self.make_equal_length(matrix1, matrix2, i)    
            s = self.score_matrices(p1, p2, metric, combine)
            if s is not None:
                scores.append([s, i, 1])
    
        rev_matrix2 = [row[::-1] for row in matrix2[::-1]]
        for i in range(-(len(matrix2) -1), len(matrix1)):
            p1,p2 = self.make_equal_length(matrix1, rev_matrix2, i)    
            s = self.score_matrices(p1, p2, metric, combine)
            if s is not None:
                scores.append([s, i, -1])
        
        if not scores:
            sys.stderr.write("No score for {} vs {}\n".format(matrix1, matrix2))
            return []
        return sorted(scores, key=lambda x: x[0])[-1]
    
    def make_equal_length(self, pwm1, pwm2, pos, bg=None):
        if bg is None:
            bg = [0.25,0.25,0.25,0.25]
        
        p1 = pwm1[:]
        p2 = pwm2[:]
    
        if pos < 1:
            p1 = [bg for _ in range(-pos)] + p1
        else:
            p2 = [bg for _ in range(pos)] + p2
    
        diff = len(p1) - len(p2)
        if diff > 0:
            p2 += [bg for _ in range(diff)]
        elif diff < 0:
            p1 += [bg for _ in range(-diff)]
    
        return p1,p2
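
    # Illustration (hypothetical values): for a length-3 pwm1, a length-2
    # pwm2 and pos=-1, pwm1 gets one bg row prepended and pwm2 two bg rows
    # appended, so both end up with length 4:
    #   p1 = [bg, a1, a2, a3]
    #   p2 = [b1, b2, bg, bg]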
    
    def make_equal_length_truncate(self, pwm1, pwm2, pos):
        p1 = pwm1[:]
        p2 = pwm2[:]
    
        if pos < 0:
            p2 = p2[-pos:]
        elif pos > 0:
            p1 = p1[pos:]
        
        if len(p1) > len(p2):
            p1 = p1[:len(p2)]
        else:
            p2 = p2[:len(p1)]
        return p1, p2
    
    def make_equal_length_truncate_second(self, pwm1, pwm2, pos, bg=None):
        if bg is None:
            bg = [0.25,0.25,0.25,0.25]
        
        p1 = pwm1[:]
        p2 = pwm2[:]

        if pos < 0:
            p2 = p2[-pos:]
        else:
            p2 = [bg for _ in range(pos)] + p2
            
        diff = len(p1) - len(p2)
        if diff > 0:
            p2 += [bg for _ in range(diff)]
        elif diff < 0:
            p2 = p2[:len(p1)]
        return p1,p2

    def get_all_scores(self, motifs, dbmotifs, match, metric, combine, 
                            pval=False, parallel=True, trim=None, ncpus=None):
        """Pairwise comparison of a set of motifs compared to reference motifs.

        Parameters
        ----------
        motifs : list
            List of Motif instances.

        dbmotifs : list
            List of Motif instances.

        match : str
            Match can be "partial", "subtotal" or "total". Not all metrics use 
            this.

        metric : str
            Distance metric.

        combine : str
            Combine positional scores using "mean" or "sum". Not all metrics
            use this.

        pval : bool, optional
            Calculate p-value of match.
        
        parallel : bool , optional
            Use multiprocessing for parallel execution. True by default.

        trim : float or None
            If a float value is specified, motifs are trimmed using this IC
            cutoff before comparison.

        ncpus : int or None
            Specifies the number of cores to use for parallel execution.

        Returns
        -------
        scores : dict
            Dictionary with scores.
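
        Examples
        --------
        A minimal sketch; ``motifs`` and ``dbmotifs`` are assumed to be
        lists of Motif instances::

            mc = MotifComparer()
            scores = mc.get_all_scores(motifs, dbmotifs, match="total",
                                       metric="wic", combine="mean")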
        """
        # trim motifs first, if specified
        if trim:
            for m in motifs:
                m.trim(trim)
            for m in dbmotifs:
                m.trim(trim)
        
        # hash of result scores
        scores = {}
        
        if parallel:    
            # Divide the job into big chunks, to keep parallel overhead to minimum
            # Number of chunks = number of processors available
            if ncpus is None:
                ncpus = int(MotifConfig().get_default_params()["ncpus"])

            pool = Pool(processes=ncpus, maxtasksperchild=1000)
 
            batch_len = len(dbmotifs) // ncpus
            if batch_len <= 0:
                batch_len = 1
            jobs = []
            for i in range(0, len(dbmotifs), batch_len): 
                # submit jobs to the job server
                
                p = pool.apply_async(_get_all_scores, 
                    args=(self, motifs, dbmotifs[i: i + batch_len], match, metric, combine, pval))
                jobs.append(p)
            
            pool.close()
            for job in jobs:
                # Get the job result
                result = job.get()
                # and update the result score
                for m1,v in result.items():
                    for m2, s in v.items():
                        if m1 not in scores:
                            scores[m1] = {}
                        scores[m1][m2] = s
        
            pool.join()
        else:
            # Do the whole thing at once if we don't want parallel
            scores = _get_all_scores(self, motifs, dbmotifs, match, metric, combine, pval)
        
        return scores

    def get_closest_match(self, motifs, dbmotifs=None, match="partial", metric="wic",combine="mean", parallel=True, ncpus=None):
        """Return best match in database for motifs.

        Parameters
        ----------
        motifs : list or str
            Filename of motifs or list of motifs.

        dbmotifs : list or str, optional
            Database motifs, default will be used if not specified.

        match : str, optional

        metric : str, optional

        combine : str, optional

        ncpus : int, optional
            Number of threads to use.

        Returns
        -------
        closest_match : dict
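
        Examples
        --------
        A minimal sketch using the default motif database; the filename is
        hypothetical::

            mc = MotifComparer()
            best = mc.get_closest_match("my_motifs.pfm")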
        """

        if dbmotifs is None:
            pwm = self.config.get_default_params()["motif_db"]
            pwmdir = self.config.get_motif_dir()
            dbmotifs = os.path.join(pwmdir, pwm)
       
        motifs = parse_motifs(motifs)
        dbmotifs = parse_motifs(dbmotifs)

        dbmotif_lookup = dict([(m.id, m) for m in dbmotifs])

        scores = self.get_all_scores(motifs, dbmotifs, match, metric, combine, parallel=parallel, ncpus=ncpus)
        for motif in scores:
            scores[motif] = sorted(
                    scores[motif].items(), 
                    key=lambda x:x[1][0]
                    )[-1]
        
        for motif in motifs:
            dbmotif, score = scores[motif.id]
            pval, pos, orient = self.compare_motifs(
                    motif, dbmotif_lookup[dbmotif], match, metric, combine, True)
            
            scores[motif.id] = [dbmotif, (list(score) + [pval])]
        
        return scores

    def generate_score_dist(self, motifs, match, metric, combine):
        
        score_file = os.path.join(self.config.get_score_dir(), "%s_%s_%s_score_dist.txt" % (match, metric, combine))    
        f = open(score_file, "w")

        all_scores = {}
        for l in [len(motif) for motif in motifs]:
            all_scores[l] = {}

        sorted_motifs = {}
        for l in all_scores.keys():
            sorted_motifs[l] = [motif for motif in motifs if len(motif) == l]
        
        for l1 in all_scores.keys():
            for l2 in all_scores.keys():
                scores = self.get_all_scores(sorted_motifs[l1], sorted_motifs[l2], match, metric, combine)
                scores = [[y[0] for y in x.values() if y] for x in scores.values()]
                scores = np.array(scores).ravel()
                f.write("%s\t%s\t%s\t%s\n" % (l1, l2, np.mean(scores), np.std(scores)))

        f.close()    
Example no. 44
def cluster(args):

    revcomp = not args.single

    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    trim_ic = 0.2
    clusters = []
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(args.inputfile, "total", "wic", "mean", True, threshold=args.threshold, include_bg=True)
        clusters = tree.getResult()
    
    ids = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for cluster,members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(outdir,"%s.png" % cluster.id), format="PNG")
        ids.append([cluster.id, {"src":"%s.png" % cluster.id},[]])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] =  mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)    
            add_pos = sorted(scores.values(), key=lambda x: x[1])[0][1]
            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos
                
                if strand not in [1, "+"]:
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(os.path.join(outdir, "%s.png" % motif.id.replace(" ", "_")), format="PNG", add_left=add)
        ids[-1][2] = [dict([("src", "%s.png" % motif.id.replace(" ", "_")), ("alt", motif.id.replace(" ", "_"))]) for motif in members]
    
    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    with open(os.path.join(outdir, "cluster_report.html"), "w") as f:
        f.write(result)

    f = open(os.path.join(outdir, "cluster_key.txt"), "w")
    for id in ids:
        f.write("%s\t%s\n" % (id[0], ",".join([x["alt"] for x in id[2]])))
    f.close()

    f = open(os.path.join(outdir, "clustered_motifs.pwm"), "w")
    if len(clusters) == 1 and len(clusters[0][1]) == 1:
        f.write("%s\n" % clusters[0][0].to_pwm())
    else:
        for motif in tree.get_clustered_motifs():
            f.write("%s\n" % motif.to_pwm())
    f.close()
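
# A minimal usage sketch (hypothetical argparse namespace; cluster() only
# reads the fields shown):
#
#   args = argparse.Namespace(inputfile="motifs.pwm", outdir="cluster_out",
#                             threshold=0.95, single=False)
#   cluster(args)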
Example no. 45
def scan_to_table(input_table, genome, scoring, pwmfile=None, ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.
    
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a 
        genomepy genome name.
    
    scoring : str
        "count" or "score"
    
    pwmfile : str, optional
        Specify a PFM file for scanning.
    
    ncpus : int, optional
        If defined this specifies the number of cores to use.
    
    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
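
    Examples
    --------
    A minimal sketch; ``table.txt`` is a hypothetical tab-separated file
    with regions in its first column::

        df = scan_to_table("table.txt", "hg38", "count")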
    """
    config = MotifConfig()
    
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:,0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index
    
    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)
    s.set_background(genome=genome)
    
    nregions = len(regions)

    scores = []
    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        for row in s.count(regions):
            scores.append(row)
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        for row in s.best_score(regions, normalize=True):
            scores.append(row)
        logger.info("done")
   
    motif_names = [m.id for m in read_motifs(pwmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
Example no. 46
def genome(args):
    
    config = MotifConfig()
    
    if not os.path.exists(args.indexdir):
        print("Index_dir %s does not exist!" % args.indexdir)
        sys.exit(1)

    if not os.path.exists(args.fastadir):
        print("FASTA dir %s does not exist!" % args.fastadir)
        sys.exit(1)
    
    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)
    
    fastadir = args.fastadir
    genomebuild = args.genomebuild
    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(args.indexdir, args.genomebuild)

    # Check for rights to write to directory

    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)
    
    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    tmp = NamedTemporaryFile(delete=False, suffix=".gz")
    
    anno = []
    f = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
    p = re.compile(r'\w+.Gene.txt.gz')
    for line in f.readlines():
        m = p.search(line)
        if m:
            anno.append(m.group(0))

    sys.stderr.write("Retrieving gene annotation for {}\n".format(genomebuild))
    url = ""
    for a in ANNOS:
        if a in anno:
            url = UCSC_GENE_URL.format(genomebuild) + a
            break
    if url:
        urllib.urlretrieve(
                url,
                tmp.name 
                )

        sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(tmp.name, pred, gene_file), shell=True)

    else: 
        sys.stderr.write("No annotation found!\n")
  
    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
        
        remote = genome_url.format(genomebuild)

        genome_fa = os.path.join(
                genome_dir, 
                os.path.split(remote)[-1]
                )

        sys.stderr.write("Trying to download {}\n".format(genome_url.format(genomebuild)))
        urllib.urlretrieve(
                genome_url.format(genomebuild),
                genome_fa
                )
        
        if not check_genome_file(genome_fa):    
            continue
        
        break

    if not check_genome_file(genome_fa):
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    if genome_fa.endswith("tar.gz"):
        cmd = "tar -C {0} -xvzf {1} && rm {1}".format(genome_dir, genome_fa)
    else:
        cmd = "gunzip {0} && rm {0}".format(genome_fa)

    sp.call(cmd, shell=True, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        f = Fasta(fa_files[0])
        for n, s in f.items():
            # write each sequence to its own FASTA file
            with open("{}/{}.fa".format(genome_dir, n), "w") as fa_out:
                fa_out.write(">{}\n{}\n".format(n, s))
    
        os.unlink(fa_files[0])

    sys.stderr.write("Creating index\n")
    g = GenomeIndex()
    g = g.create_index(genome_dir, index_dir)
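
# A minimal usage sketch (hypothetical paths): download the UCSC genome and
# gene annotation for a build, then index it:
#
#   args = argparse.Namespace(indexdir="/data/index", fastadir="/data/fasta",
#                             genomebuild="hg19")
#   genome(args)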
Example no. 47
def matched_gc_bedfile(bedfile, matchfile, genome, number):
    N_FRACTION = 0.1
    
    config = MotifConfig()
    index = os.path.join(config.get_index_dir(), genome)
    
    genome_size = os.path.join(index, "genome.size")
    genome_fa = os.path.join(index, "genome.fa")

    if not os.path.exists(genome_size) or not os.path.exists(genome_fa):
        raise RuntimeError("genome files not found, please re-index {} "
                "with a recent version of gimme index".format(genome))

    try:
        fa = Fasta(matchfile)
        gc = [(seq.upper().count("C") + seq.upper().count("G")) / float(len(seq)) for seq in fa.seqs]
        lengths = [len(seq) for seq in fa.seqs]
    except Exception:
        try:
            bed = pybedtools.BedTool(matchfile)
            gc = [float(x[4]) for x in bed.nucleotide_content(fi=genome_fa)]
            lengths = [x.length for x in bed]
        except Exception:
            sys.stderr.write("Please provide input file in BED or FASTA format\n")
            sys.exit(1)

    gc_hist,bins = np.histogram(gc, range=(0,1), bins=20)
    
    length = np.median(lengths)
    if np.std(lengths) > length * 0.05:
        sys.stderr.write("Sequences do not seem to be of equal length.\n")
        sys.stderr.write("GC% matched sequences of the median length ({}) will be created\n".format(length))

    total = sum(gc_hist)
    
    if number:
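        # Scale the GC histogram so the integer bin counts sum to `number`:
        # round half up, then (largest-remainder style) move single counts
        # between bins until the total matches exactly.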
        norm = number * gc_hist / (float(sum(gc_hist))) + 0.5
        inorm = norm.astype(int)

        s = np.argsort(norm - inorm)
        while sum(inorm) > number:
            if inorm[np.argmin(s)] > 0:
                inorm[np.argmin(s)] -= 1
            s[np.argmin(s)] = len(s)
        while sum(inorm) < number:
            inorm[np.argmax(s)] += 1
            s[np.argmax(s)] = 0
        gc_hist = inorm

    rnd = pybedtools.BedTool()
    out = open(bedfile, "w")
    
    r = rnd.random(l=length, n=number * 15, g=genome_size).nucleotide_content(fi=genome_fa)
    features = [f[:3] + [float(f[7])] for f in r if float(f[12]) <= length * N_FRACTION]
    gc = [f[3] for f in features]
    
    for bin_start, bin_end, count in zip(bins[:-1], bins[1:], gc_hist):
        if count > 0:
            rcount = 0
            for f in features:
                if bin_start <= f[3] < bin_end:
                    out.write("{}\t{}\t{}\n".format(*f[:3]))
                    rcount += 1
                    if rcount >= count:
                        break

            if count != rcount:
                sys.stderr.write("not enough random sequences found for {} <= GC < {} ({} instead of {})\n".format(bin_start, bin_end, rcount, count))
    out.close()
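
# A minimal usage sketch (hypothetical paths): write 1000 random regions to
# random.bed with a GC% distribution matched to peaks.bed:
#
#   matched_gc_bedfile("random.bed", "peaks.bed", "hg19", 1000)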
Example no. 48
    def run(self):
        from gimmemotifs.config import MotifConfig
        
        cfg = MotifConfig(use_config=self.build_cfg)

        data_dir = self.remove_nonsense(os.path.abspath(self.install_dir))
        
        cfg.set_template_dir(os.path.join(data_dir, 'gimmemotifs/templates'))
        cfg.set_gene_dir(os.path.join(data_dir, 'gimmemotifs/genes'))
        cfg.set_score_dir(os.path.join(data_dir, 'gimmemotifs/score_dists'))
        cfg.set_index_dir(os.path.join(data_dir, 'gimmemotifs/genome_index'))
        cfg.set_motif_dir(os.path.join(data_dir, 'gimmemotifs/motif_databases'))
        cfg.set_bg_dir(os.path.join(data_dir, 'gimmemotifs/bg'))
        cfg.set_tools_dir(os.path.join(data_dir, 'gimmemotifs/tools'))
        
        final_tools_dir = self.remove_nonsense(self.install_tools_dir)
        for program in MOTIF_CLASSES:
            m = eval(program)()
            if cfg.is_configured(m.name):
                bin = cfg.bin(m.name).replace(self.build_tools_dir, final_tools_dir) 
                dir = cfg.dir(m.name)
                if dir:
                    dir = dir.replace(self.build_tools_dir, final_tools_dir)
                cfg.set_program(m.name, {"bin":bin, "dir":dir})
            
        dir = cfg.get_seqlogo()
        dir = dir.replace(self.build_tools_dir, final_tools_dir)
        cfg.set_seqlogo(dir)

        # Use a user-specific configfile if any other installation scheme is used
        config_file = os.path.join(self.install_dir, "gimmemotifs/%s" % CONFIG_NAME)
        self.outfiles = [config_file] 

        if os.path.exists(config_file):
            timestr = time.strftime("%Y%m%d-%H%M%S")        
            old_config = "{}.{}".format(config_file, timestr)
            shutil.move(config_file, old_config)
            dlog.info("INFO: Configfile %s already existed!", config_file)
            dlog.info("INFO: This config has been saved as %s", old_config)
         
        dlog.info("writing configuration file %s" % config_file)
        f = open(config_file, "w")
        cfg.write(f)
Example no. 49
class GimmeMotifs(object):
    NAME = "gimme_motifs"
    SCAN_THRESHOLD = "0.9"

    def __init__(self, name=None):
        self.config = MotifConfig()
        self.server = None

        if not name:
            name = "%s_%s" % (self.NAME, datetime.today().strftime("%d_%m_%Y"))
        self.name = name

        # create a directory for all the intermediate and output files
        self._setup_output_dir(name)

        # setup logging
        self._setup_logging()
        self.logger.info("%s version %s", self.NAME, GM_VERSION)
        self.logger.info("output dir: %s", self.outdir)

        # setup the names of the intermediate and output files
        self._setup_filenames()

    def job_server(self):
        try:
            self.server.submit(job_server_ok)
        except Exception:
            self.server = self._get_job_server()
        return self.server

    def _setup_output_dir(self, name):

        if os.path.exists(name):
            sys.stderr.write("Output directory {} already exists!\n".format(name))
            sys.stderr.write("Resuming a previous run is not yet implemented. Please specify a different name,\n")
            sys.stderr.write("or delete this directory if you really want to overwrite it\n")
            #sys.exit(1)
        else:
            try:
                os.makedirs(name)
            except OSError:
                sys.stderr.write("Can't create output directory {}!\n".format(name))
                #sys.exit(1)

        self.outdir = name
        self.tmpdir = os.path.join(self.outdir, "intermediate_results")
        self.imgdir = os.path.join(self.outdir, "images")
        try:
            os.mkdir(self.tmpdir)
            os.mkdir(self.imgdir)
        except OSError:
            pass
        star_img = os.path.join(self.config.get_template_dir(), "star.png")
        shutil.copyfile(star_img, os.path.join(self.imgdir, "star.png"))

    def _setup_logging(self):
        self.logger = logging.getLogger('motif_analysis')
        self.logger.setLevel(logging.DEBUG)
        self.logger.propagate = 0

        # nice format
        file_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        screen_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

        # Log to file
        logfile = os.path.join(self.name, "%s.log" % self.NAME)
        fh = logging.FileHandler(logfile, "w")
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(file_formatter)
        self.logger.addHandler(fh)

        # Log to screen
        sh = logging.StreamHandler(sys.stdout)
        sh.setLevel(logging.INFO)
        sh.setFormatter(screen_formatter)
        self.logger.addHandler(sh)

        self.logger.debug("Logging started")
        self.logger.info("log: %s", logfile)

    def _setup_filenames(self):
        basename = os.path.split(self.name)[-1]
        self.basename = basename

        self.logger.debug("basename: {}".format(basename))
        # Um yes, there is a smarter way, I'm sure! ;)
        self.input_bed = os.path.join(self.tmpdir, "%s_peakinputfile.bed" % basename)

        self.prediction_bed    = os.path.join(self.tmpdir, "%s_prediction.bed" % basename)
        self.prediction_fa = os.path.join(self.tmpdir, "%s_prediction.fa" % basename)
        self.prediction_bg = os.path.join(self.tmpdir, "%s_prediction_background.fa" % basename)

        self.validation_bed = os.path.join(self.tmpdir, "%s_validation.bed" % basename)
        self.validation_fa = os.path.join(self.tmpdir, "%s_validation.fa" % basename)
        self.validation_gff = os.path.join(self.tmpdir, "%s_validation.gff" % basename)

        self.predicted_pfm = os.path.join(self.tmpdir, "%s_all_motifs.pfm" % basename)

        self.significant_pfm = os.path.join(self.tmpdir, "%s_significant_motifs.pfm" % basename)

        self.location_fa = os.path.join(self.tmpdir, "%s_validation_500.fa" % basename)
        self.location_pfile = os.path.join(self.tmpdir, "%s_localization_pvalue.txt" % basename)
        self.stats_file = os.path.join(self.tmpdir, "%s_stats.txt" % basename)
        self.ranks_file = os.path.join(self.tmpdir, "%s_ranks.txt" % basename)

        #self.cluster_dir = os.path.join(self.outdir, "cluster_report")
        self.validation_cluster_gff = os.path.join(self.tmpdir, "%s_validation_clustered.gff" % basename)
        self.cluster_pwm = os.path.join(self.tmpdir, "%s_clustered_motifs.pwm" % basename)
        self.final_pwm = os.path.join(self.outdir, "%s_motifs.pwm" % basename)
        self.cluster_report = os.path.join(self.outdir, "%s_cluster_report.html" % basename)
        self.motif_report = os.path.join(self.outdir, "%s_motif_report.html" % basename)
        self.text_report = os.path.join(self.outdir, "%s_motif_report.tsv" % basename)
        self.params_file = os.path.join(self.outdir, "%s_params.txt" % basename)

        # Data structures to hold the background file locations
        ftypes = {
            "bed": ".bed",
            "fa": ".fa",
            "gff": ".gff",
            "enrichment": "_enrichment.txt",
            "roc": "_significant_motifs_roc_metrics.txt",
            "cluster_gff": "_clustered.gff",
            "cluster_enrichment": "_enrichment_clustered.txt",
            "cluster_roc": "_roc_metrics_clustered.txt"
        }

        self.bg_file = dict([(t,{}) for t in ftypes.keys()])

        for bg in (FA_VALID_BGS + BED_VALID_BGS):
            for ftype, extension in ftypes.items():
                self.bg_file[ftype][bg] =  os.path.join(self.tmpdir, "%s_bg_%s%s" % (basename, bg, extension))

    def _is_parallel_enabled(self):
        return True

    def _get_job_server(self):
        return pool

    def _check_input(self, fname):
        """ Check if the inputfile is a valid bed-file """
        if not os.path.exists(fname):
            self.logger.error("Inputfile %s does not exist!", fname)
            sys.exit(1)
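
        # Expected line format below: tab-separated chromosome<tab>start<tab>end;
        # lines starting with "#", "track" or "browser" are skipped.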

        for i, line in enumerate(open(fname)):
            if line.startswith(("#", "track", "browser")):
                # comment or BED specific stuff
                pass
            else:
                vals = line.strip().split("\t")
                if len(vals) < 3:
                    self.logger.error("Expecting tab-separated values (chromosome<tab>start<tab>end) on line %s of file %s", i + 1, fname)
                    sys.exit(1)
                try:
                    start, end = int(vals[1]), int(vals[2])
                except ValueError:
                    self.logger.error("No valid integer coordinates on line %s of file %s", i + 1, fname)
                    sys.exit(1)
                if len(vals) > 3:
                    try:
                        float(vals[3])
                    except ValueError:
                        pass
                        #self.logger.warn("No numerical value in column 4 on line %s of file %s, ignoring..." % (i + 1, file))

    def prepare_input_bed(self, inputfile, organism="hg18", width=200, fraction=0.2, abs_max=1000, use_strand=False):
        """ Create all the bed- and fasta-files necessary for motif prediction and validation """
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)
        use_strand = bool(use_strand)

        self.logger.info("preparing input (BED)")

        # Set all peaks to specific width
        self.logger.debug("Creating inputfile %s, width %s", self.input_bed, width)

        write_equalwidth_bedfile(inputfile, width, self.input_bed)

        # Split input_bed in prediction and validation set
        self.logger.debug(
                "Splitting %s into prediction set (%s) and validation set (%s)",
                self.input_bed, self.prediction_bed, self.validation_bed)
        self.prediction_num, self.validation_num = divide_file(self.input_bed, self.prediction_bed, self.validation_bed, fraction, abs_max)


        # Make fasta files
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.debug("Creating %s", self.prediction_fa)

        genome_index.track2fasta(index_dir, self.prediction_bed, self.prediction_fa, use_strand=use_strand, ignore_missing=True)
        self.logger.debug("Creating %s", self.validation_fa)
        genome_index.track2fasta(index_dir, self.validation_bed, self.validation_fa, use_strand=use_strand, ignore_missing=True)

    def prepare_input_fa(self, inputfile, width=200, fraction=0.2, abs_max=1000):
        """ Create all the bed- and fasta-files necessary for motif prediction and validation """
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)

        self.logger.info("preparing input (FASTA)")

        # Split inputfile in prediction and validation set
        self.logger.debug(
                "Splitting %s into prediction set (%s) and validation set (%s)", 
                self.inputfile, self.prediction_fa, self.validation_fa)


        self.prediction_num, self.validation_num = divide_fa_file(self.inputfile, self.prediction_fa, self.validation_fa, fraction, abs_max)


    def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
        fg = Fasta(fafile)
        if bg_type == "random":
            if int(self.markov_model) >= 6:
                self.logger.warn("Are you sure about the Markov model? It seems too high!")
            else:
                order = {"1":"1st","2":"2nd", "3":"3rd", "4":"4th", "5":"5th"}[str(self.markov_model)]
                self.logger.debug("Creating random background (%s order Markov)" % order)

            m = MarkovFasta(fg, k=int(self.markov_model), n=nr_times * len(fg))
            m.writefasta(outfile)
            self.logger.debug("Random background: %s", outfile)
            # return the number of random sequences created
            return len(m)
        elif bg_type == "genomic":
            self.logger.debug("Creating genomic background")
            index_dir = os.path.join(self.config.get_index_dir(), organism)
            f = RandomGenomicFasta(index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            return len(f)
        elif bg_type == "gc":
            self.logger.debug("Creating GC matched background")

            f = MatchedGcFasta(fafile, organism, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("GC matched background: %s", outfile)
            return len(f)
        elif bg_type == "promoter":
            gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
            index_dir = os.path.join(self.config.get_index_dir(), organism)

            self.logger.info(
                    "Creating random promoter background (%s, using genes in %s)", 
                    organism, gene_file)
            f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("Random promoter background: %s", outfile)
            return len(f)
        elif bg_type == "user":
            bg_file = self.params["user_background"]
            if not os.path.exists(bg_file):
                self.logger.error(
                        "User-specified background file %s does not exist!", 
                        bg_file)
                sys.exit(1)
            else:
                self.logger.info("Copying user-specified background file %s to %s.",
                        bg_file, outfile)
                fa = Fasta(bg_file)
                l = median([len(seq) for seq in fa.seqs])
                if l < width * 0.95 or l > width * 1.05:
                    self.logger.warn("The user-specified background file %s contains sequences with a median length of %s, while GimmeMotifs predicts motifs in sequences of length %s. This will influence the statistics! It is recommended to use background sequences of the same length.", bg_file, l, width)
                fa.writefasta(outfile)
                return len(fa)
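
    # A minimal sketch of calling the helper above (hypothetical filenames):
    #
    #   n = self._create_background("gc", "validation.bed", "validation.fa",
    #                               "bg_gc.fa", organism="hg19", width=200)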



    def calculate_enrichment(self, motif_file, fg, bg):
        """ fg: [sample_fa, sample_gff] bg: [[bg1_fa, bg1_gff, bg1_enrichment], [bg2_fa, bg2_gff, bg2_enrichment], .. etc] """

        self.logger.debug("Scanning background sequences with motifs")
        
        # define filenames 
        fnames = [(fg[0], fg[1])] + [x[:2] for x in bg]
        # scan and save as gff
        for infile,outfile in fnames:
            with open(outfile, "w") as f:
                for line in command_scan(infile, motif_file, 
                                        nreport=1, 
                                        cutoff=self.SCAN_THRESHOLD, 
                                        bed=False,
                                        scan_rc=True):
                    f.write(line + "\n")
        
        self.logger.debug("Calculating enrichment")
        enrichment_cmd = gff_enrichment
        num_sample = len(Fasta(fg[0]).items())
        for fasta_file, gff_file, out_file in bg:
            num_bg = len(Fasta(fasta_file).items())
            enrichment_cmd(fg[1], gff_file, num_sample, num_bg, out_file)

    def create_background(self, background=None, organism="hg18", width=200):
        if background is None:
            background = ["random"]

        nr_sequences = {}

        # Create background for motif prediction
        if "gc" in background:
            self._create_background("gc", self.validation_bed, self.validation_fa, self.prediction_bg, organism=organism, width=width)
        else:
            self._create_background(background[0], self.validation_bed, self.validation_fa, self.prediction_bg, organism=organism, width=width)

        # Get background fasta files
        for bg in background:
            nr_sequences[bg] = self._create_background(bg, self.validation_bed, self.validation_fa, self.bg_file["fa"][bg], organism=organism, width=width)

    def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
        self.logger.info("clustering significant motifs.")

        trim_ic = 0.2
        clusters = []
        motifs = read_motifs(open(pfm_file), fmt="pwm")
        if len(motifs) == 1:
            clusters = [[motifs[0], motifs]]
        else:
            tree = cluster_motifs(
                    pfm_file, 
                    "total", 
                    "wic", 
                    "mean", 
                    True, 
                    threshold=float(threshold), 
                    include_bg=True,
                    progress=False
                    )
            clusters = tree.getResult()

        ids = []
        mc = MotifComparer()

        for cluster,members in clusters:
            cluster.trim(trim_ic)
            cluster.to_img(os.path.join(self.imgdir,"%s.png" % cluster.id), format="PNG")
            ids.append([cluster.id, {"src":"images/%s.png" % cluster.id},[]])
            if len(members) > 1:
                scores = {}
                for motif in members:
                    scores[motif] =  mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)
                add_pos = sorted(scores.values(), key=lambda x: x[1])[0][1]
                for motif in members:
                    score, pos, strand = scores[motif]
                    add = pos - add_pos

                    if strand not in [1, "+"]:
                        rc = motif.rc()
                        rc.id = motif.id
                        motif = rc
                    motif.to_img(os.path.join(self.imgdir, "%s.png" % motif.id.replace(" ", "_")), format="PNG", add_left=add)
            ids[-1][2] = [dict([("src", "images/%s.png" % motif.id.replace(" ", "_")), ("alt", motif.id.replace(" ", "_"))]) for motif in members]

        
        env = jinja2.Environment(loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("cluster_template.jinja.html")
        result = template.render(expname=self.basename, motifs=ids, inputfile=self.inputfile, date=datetime.today().strftime("%d/%m/%Y"), version=GM_VERSION)
        
        f = open(self.cluster_report, "w")
        f.write(result)
        f.close()

        f = open(cluster_pwm, "w")
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
        f.close()

        self.logger.debug("Clustering done. See the result in %s", 
                self.cluster_report)
        return clusters

    def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
        motifs = dict([(m.id, m) for m in read_motifs(open(pwm_file), fmt="pwm")])

        jobs = {}
        for id,m in motifs.items():
            jobs[id] = self.job_server().apply_async(get_roc_values, (motifs[id],fg_fasta,bg_fasta,))

        roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")

        for id in motifs.keys():
            error, x, y = jobs[id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)

            roc_plot(roc_img_file % (id,name), x, y)

    def calculate_cluster_enrichment(self, pwm, background):
        fg = [self.validation_fa, self.validation_cluster_gff]
        bg = [[self.bg_file["fa"][bg_id], self.bg_file["gff"][bg_id], self.bg_file["cluster_enrichment"][bg_id]] for bg_id in background]
        self.calculate_enrichment(pwm, fg, bg)


    def _roc_metrics(self, pwm, sample_fa, bg_fa, roc_file):
        motifs = dict([(m.id, m) for m in read_motifs(open(pwm), fmt="pwm")])

        jobs = {}
        for id,m in motifs.items():
            jobs[id] = self.job_server().apply_async(get_scores, (motifs[id],sample_fa,bg_fa,))

        all_auc = {}
        all_mncp = {}
        f = open(roc_file, "w")
        f.write("Motif\tROC AUC\tMNCP\tMax f-measure\tSens @ max f-measure\n")
        for id in motifs.keys():
            error, auc, mncp, max_f, y = jobs[id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)
            f.write("%s\t%s\t%s\t%s\t%s\n" % (id,auc,mncp,max_f,y))
            all_auc[id] = auc
            all_mncp[id] = mncp

        f.close()

        return all_auc,all_mncp

    def _calc_report_values(self, pwm, background):
        self.logger.debug("Calculating final statistics for report")
        self.p = dict([(b,{}) for b in background])
        self.e = dict([(b,{}) for b in background])

        e_files = dict([(bg, self.bg_file["cluster_enrichment"][bg]) for bg in background])

        for bg in self.p.keys():
            for line in open(e_files[bg]).readlines():
                if not (line.startswith("#") or line.startswith("Motif\tSig")):
                    vals = line.strip().split("\t")
                    self.p[bg][vals[0]] = float(vals[2])
                    self.e[bg][vals[0]] = float(vals[5])

        self.auc = dict([(b,{}) for b in background])
        self.mncp = dict([(b,{}) for b in background])


        rocs = dict([(bg, [self.bg_file["fa"][bg], self.bg_file["roc"][bg]]) for bg in background])

        for bg in self.auc.keys():
            bg_fasta_file, roc_file = rocs[bg]
            self.auc[bg], self.mncp[bg] = self._roc_metrics(pwm, self.validation_fa, bg_fasta_file, roc_file)

        motifs = read_motifs(open(pwm), fmt="pwm")
        self.closest_match = self.determine_closest_match(motifs)

    def _create_text_report(self, pwm, background):
        self.logger.debug("Creating text report")
        motifs = read_motifs(open(pwm), fmt="pwm")

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        f = open(self.text_report, "w")
        header = "ID\tconsensus\tBest match db\tp-value best match\t" + "\t".join("Enrichment (%s)\tp-value (%s)\tROC AUC (%s)\tMNCP (%s)" % (b,b,b,b) for b in background)
        #print header
        f.write("%s\n" % header)
        for motif in sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True):
            vals = [motif.id, motif.to_consensus(), self.closest_match[motif.id][0].id, self.closest_match[motif.id][1]]
            for bg in background:
                vals += [self.e[bg][motif.id], self.p[bg][motif.id], self.auc[bg][motif.id], self.mncp[bg][motif.id]]
            f.write("%s\n" % "\t".join([str(x) for x in vals]))
            #print "%s\n" % "\t".join([str(x) for x in vals])
        f.close()

    def print_params(self):
        f = open(self.params_file, "w")
        for param, value in self.params.items():
            f.write("%s\t%s\n" % (param, value))
        f.close()

    def _create_report(self, pwm, background, stats=None, best_id=None):
        if stats is None:
            stats = {}
        if best_id is None:
            best_id = {}


        self.logger.debug("Creating graphical report")
        class ReportMotif:
            pass

        motifs = read_motifs(open(pwm), fmt="pwm")
        for m,match in self.closest_match.items():
            match[0].to_img(os.path.join(self.imgdir,"%s.png" % match[0].id), format="PNG")

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        roc_img_file = "%s_%s_roc"
        report_motifs = []
        sorted_motifs = sorted(motifs,
                key=lambda m: self.mncp[sort_key][m.id], reverse=True)

        for motif in sorted_motifs:
            rm = ReportMotif()
            rm.id = motif.id
            rm.id_href = {"href": "#%s" % motif.id}
            rm.id_name = {"name": motif.id}
            rm.img = {"src":  os.path.join("images", "%s.png" % motif.id)}

            rm.best = best_id[motif.id]

            rm.consensus = motif.to_consensus()
            rm.stars = stats["%s_%s" % (motif.id, motif.to_consensus())]["stars"]

            rm.bg = {}
            for bg in background:
                rm.bg[bg] = {}
                rm.bg[bg]["e"] = "%0.2f" % self.e[bg].setdefault(motif.id, 0.0)
                rm.bg[bg]["p"] = "%0.2f" % self.p[bg].setdefault(motif.id, 1.0)
                rm.bg[bg]["auc"] = "%0.3f" % self.auc[bg][motif.id]
                rm.bg[bg]["mncp"] = "%0.3f" % self.mncp[bg][motif.id]
                rm.bg[bg]["roc_img"] = {"src": "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"}
                rm.bg[bg]["roc_img_link"] = {"href": "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"}

            rm.histogram_img = {"data":"images/%s_histogram.svg" % motif.id}
            rm.histogram_link= {"href":"images/%s_histogram.svg" % motif.id}
            rm.match_img = {"src":  "images/%s.png" % self.closest_match[motif.id][0].id}
            rm.match_id = self.closest_match[motif.id][0].id
            rm.match_pval = "%0.2e" % self.closest_match[motif.id][1]

            report_motifs.append(rm)

        total_report = self.motif_report
        
        env = jinja2.Environment(loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("report_template.jinja.html")
        result = template.render(expname=self.basename, motifs=report_motifs, inputfile=self.inputfile, date=datetime.today().strftime("%d/%m/%Y"), version=GM_VERSION)
        
        f = open(total_report, "w")
        f.write(result)
        f.close()

    def determine_closest_match(self, motifs):
        self.logger.debug("Determining closest matching motifs in database")
        motif_db = self.config.get_default_params()["motif_db"]
        db = os.path.join(self.config.get_motif_dir(), motif_db)
        db_motifs = []
        if db.endswith("pwm") or db.endswith("pfm"):
            db_motifs = read_motifs(open(db), fmt="pwm")
        elif db.endswith("transfac"):
            db_motifs = read_motifs(db, fmt="transfac")

        closest_match = {}
        mc = MotifComparer()
        db_motif_lookup = dict([(m.id, m) for m in db_motifs])
        match = mc.get_closest_match(motifs, db_motifs, "partial", "wic", "mean", parallel=False)
        for motif in motifs:
            # Calculate p-value
            pval, pos, orient = mc.compare_motifs(motif, db_motif_lookup[match[motif.id][0]], "partial", "wic", "mean", pval=True)
            closest_match[motif.id] = [db_motif_lookup[match[motif.id][0]], pval]
        return closest_match

    def _determine_best_motif_in_cluster(self, clusters, pwm, sample_fa, bg_fa, imgdir=None):
        num_cluster = {}
        best_id = {}
        out = open(pwm, "w")
        for i, (clus, singles) in enumerate(clusters):
            motifs = [clus] + singles
            tmp = NamedTemporaryFile(mode="w", dir=mytmpdir())
            tmp2 = NamedTemporaryFile(mode="w", dir=mytmpdir())
            for m in motifs:
                tmp.write("%s\n" % m.to_pwm())
            tmp.flush()
            auc,mncp = self._roc_metrics(tmp.name, sample_fa, bg_fa, tmp2.name)
            sorted_motifs = sorted(motifs, key=lambda m: mncp[m.id])
            for m in sorted_motifs:
                self.logger.debug("sorted: %s %s %s",
                        str(m), mncp[m.id], auc[m.id])

            self.logger.debug("end list")
            best_motif = sorted_motifs[-1]
            old_id = best_motif.id
            best_motif.id = "GimmeMotifs_%d" % (i + 1)
            best_id[best_motif.id] = old_id.split("_")[0]
            num_cluster["%s_%s" % (best_motif.id, best_motif.to_consensus())] = len(singles)
            if imgdir:
                best_motif.to_img(os.path.join(imgdir, best_motif.id), format="PNG")
            out.write("%s\n" % best_motif.to_pwm())
            tmp.close()
            tmp2.close()
        out.close()
        return num_cluster, best_id

    def run_full_analysis(self, inputfile, user_params=None):
        """ Full analysis: from bed-file to motifs (including clustering, ROC-curves, location plots and html report) """
        self.logger.info("starting full motif analysis")
        self.logger.debug("Using temporary directory {0}".format(mytmpdir()))

        if user_params is None:
            user_params = {}
        params = self.config.get_default_params()
        params.update(user_params)

        if params["torque"]:
            from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
            self.logger.debug("Using torque")
        else:
            from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
            self.logger.debug("Using multiprocessing")

        self.params = params

        background = [x.strip() for x in params["background"].split(",")]

        self.logger.debug("Parameters:")
        for param, value in params.items():
            self.logger.debug("  %s: %s", param, value)

        # Checking input
        self.input_type = "BED"
        # If we can load it as fasta then it is a fasta, yeh?
        try:
            Fasta(inputfile)
            self.logger.debug("Inputfile is a FASTA file")
            self.input_type = "FASTA"
        except Exception:
            # Leave it to BED
            pass

        index_msg = ( "No index found for genome {}! " 
                    "Has GimmeMotifs been configured correctly and is the " 
                    "genome indexed?" ).format(params["genome"])
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        
        if self.input_type == "FASTA":
            for bg in background:
                if not bg in FA_VALID_BGS:
                    self.logger.info("Input type is FASTA, can't use background type '%s'", bg)
                if bg == "genomic":
                    if not os.path.exists(index_dir):
                        self.logger.error(index_msg)
                        sys.exit(1)
            background = [bg for bg in background if bg in FA_VALID_BGS]

        elif self.input_type == "BED":
            # Does the index_dir exist?  #bed-specific
            if not os.path.exists(index_dir):
                self.logger.error(index_msg)
                sys.exit(1)

            # is it a valid bed-file etc.
            self._check_input(inputfile)    # bed-specific

            # Check for valid background
            for bg in background:
                if not bg in BED_VALID_BGS:
                    self.logger.info("Input type is BED, can't use background type '%s'", bg)
            background = [bg for bg in background if bg in BED_VALID_BGS]

        if len(background) == 0:
            self.logger.error("No valid backgrounds specified!")
            sys.exit(1)

        self.max_time = None
        max_time = None
        # Maximum time?
        if params["max_time"]:
            try:
                max_time = float(params["max_time"])
            except Exception:
                self.logger.debug("Could not parse max_time value, setting to no limit")
                max_time = None

            if max_time is not None and max_time > 0:
                self.logger.debug("Time limit for motif prediction: %0.2f hours" % max_time)
                self.max_time = 3600 * max_time
                self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
            else:
                self.logger.debug("Invalid time limit for motif prediction, setting to no limit")
                self.max_time = None
        else:
            self.logger.debug("No time limit for motif prediction")

        if "random" in background:
            self.markov_model = params["markov_model"]

        # Create the necessary files for motif prediction and validation
        if self.input_type == "BED":
            self.prepare_input_bed(inputfile, params["genome"], params["width"], params["fraction"], params["abs_max"], params["use_strand"])


            # Create file for location plots
            index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
            lwidth = int(params["lwidth"])
            width = int(params["width"])
            extend = (lwidth - width) // 2
            genome_index.track2fasta(index_dir, self.validation_bed, self.location_fa, extend_up=extend, extend_down=extend, use_strand=params["use_strand"], ignore_missing=True)

        elif self.input_type == "FASTA":
            self.prepare_input_fa(inputfile, params["width"], params["fraction"], params["abs_max"])

            # File for location plots
            self.location_fa = self.validation_fa
            fa = Fasta(self.location_fa)
            seqs = fa.seqs
            lwidth = len(seqs[0])
            all_same_width = all(len(seq) == lwidth for seq in seqs)
            if not all_same_width:
                self.logger.warn("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")

        else:
            self.logger.error("Unknown input type, shouldn't happen")
            sys.exit(1)

        selected_tools = [y.strip() for y in params["tools"].split(",")]
        tools = dict(
            (x.strip(), x.strip() in selected_tools)
            for x in params["available_tools"].split(",")
        )

        self.create_background(background, params["genome"], params["width"])

        # Predict the motifs; input is a FASTA file
        analysis = params["analysis"]
        self.logger.info("starting motif prediction (%s)", analysis)
        self.logger.info("tools: %s",
                ", ".join([x for x in tools.keys() if tools[x]]))

        bg_file = self.bg_file["fa"][sorted(background, lambda x,y: cmp(BG_RANK[x], BG_RANK[y]))[0]]
        self.logger.debug("Using bg_file %s for significance" % bg_file)
        result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm, analysis, params["genome"], params["use_strand"], self.prediction_bg, tools, self.job_server(), logger=self.logger, max_time=self.max_time, fg_file=self.validation_fa, bg_file=bg_file)

        motifs = result.motifs
        self.logger.info("predicted %s motifs", len(motifs))
        self.logger.debug("written to %s",self.predicted_pfm)

        if len(motifs) == 0:
            self.logger.info("no motifs found")
            sys.exit()

        # Write stats output to file
        f = open(self.stats_file, "w")
        stat_keys = list(result.stats.values())[0].keys()
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))

        self.logger.debug(result.stats)

        # Iterate over a copy, as motifs without stats are removed from the list
        for motif in motifs[:]:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write("%s\t%s\n" % (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
            else:
                self.logger.error("No stats for motif {0}, skipping this motif!".format(motif.id))
                motifs.remove(motif)
        f.close()

        self.motifs_with_stats = motifs

        f = open(self.ranks_file, "w")
        tools = list(dict((m.id.split("_")[0], 1) for m in motifs).keys())
        f.write("Metric\tType\t%s\n" % ("\t".join(tools)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" % (motif.id, motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = list(best_motif.keys())
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tools]

            f.write("%s\t%s\t%s\n" % (stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" % (stat, "rank", "\t".join([str(rank[i]) for i in ind])))
        f.close()

        # Determine significant motifs
        nsig = 0
        f = open(self.significant_pfm, "w")
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55 and stats['enr_fdr'] >= 2:
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
        f.close()
        self.logger.info("%s motifs are significant", nsig)
        self.logger.debug("written to %s", self.significant_pfm)

        if nsig == 0:
            self.logger.info("no significant motifs found")
            return

        # ROC metrics of significant motifs
        for bg in background:
            self._roc_metrics(self.significant_pfm, self.validation_fa, self.bg_file["fa"][bg], self.bg_file["roc"][bg])

        # Cluster significant motifs
        clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm, self.outdir, params["cluster_threshold"])

        # Determine best motif in cluster
        num_cluster, best_id = self._determine_best_motif_in_cluster(clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)
        

        ### Enable parallel and modular evaluation of results
        # Scan (multiple) files with motifs
        # Define callback functions once scanning is finished:
        #    - ROC plot
        #    - Statistics
        #    - Location plots (histogram)

        # Stars
        tmp = NamedTemporaryFile(dir=mytmpdir()).name
        p = PredictionResult(
            tmp,
            logger=self.logger,
            job_server=self.server,
            fg_file=self.validation_fa,
            bg_file=bg_file,
        )
        p.add_motifs(("clustering", (read_motifs(open(self.final_pwm), fmt="pwm"), "", "")))
        while len(p.stats.keys()) < len(p.motifs):
            sleep(5)

        for mid, num in num_cluster.items():
            p.stats[mid]["numcluster"] = num

        all_stats = {
            "mncp": [2, 5, 8],
            "roc_auc": [0.6, 0.75, 0.9],
            "maxenr": [10, 20, 30],
            "enr_fdr": [4, 8, 12],
            "fraction": [0.4, 0.6, 0.8],
            "ks_sig": [4, 7, 10],
            "numcluster": [3, 6, 9],
        }

        self.logger.info("creating report")

        # ROC plots
        for bg in background:
            self.create_roc_plots(self.final_pwm, self.validation_fa, self.bg_file["fa"][bg], bg)

        # Location plots
        self.logger.debug("Creating localization plots")
        motifs = read_motifs(open(self.final_pwm), fmt="pwm")
        for motif in motifs:
            m = "%s_%s" % (motif.id, motif.to_consensus())
            s = p.stats[m]
            outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
            motif_localization(self.location_fa, motif, lwidth, outfile, cutoff=s["cutoff_fdr"])

            s["stars"] = int(mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
            self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

        # Calculate enrichment of final, clustered motifs
        self.calculate_cluster_enrichment(self.final_pwm, background)

        # Create report
        self.print_params()
        self._calc_report_values(self.final_pwm, background)
        self._create_report(self.final_pwm, background, stats=p.stats, best_id=best_id)
        self._create_text_report(self.final_pwm, background)
        
        self.logger.info("finished")
        self.logger.info("output dir: %s", os.path.split(self.motif_report)[0])
        self.logger.info("report: %s", os.path.split(self.motif_report)[-1])

        if not params["keep_intermediate"]:
            self.logger.debug("Deleting intermediate files. Please specify the -k option if you want to keep these files.")
            shutil.rmtree(self.tmpdir)

        self.logger.debug("Done")

        return self.motif_report
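
The star rating computed near the end of this example condenses each quality metric into a 0-3 scale via the thresholds in all_stats before averaging. The star() helper itself is not shown in this example; a minimal sketch of one plausible implementation, assuming it simply counts how many of the three ascending thresholds a value reaches:

def star(value, thresholds):
    # Hypothetical helper, not part of this example: count how many of
    # the ascending thresholds the value meets,
    # e.g. star(0.8, [0.6, 0.75, 0.9]) == 2.
    return len([t for t in thresholds if value >= t])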
Exemplo n.º 50
0
    def __init__(self):
        self.config = MotifConfig()
        self.metrics = ["pcc", "ed", "distance", "wic"]
        self.combine = ["mean", "sum"]
        self._load_scores()
Exemplo n.º 51
0
def moap(inputfile, method="classic", scoring="score", outfile=None,
         motiffile=None, pwmfile=None, genome=None, cutoff=0.95):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. Any of 'classic', 'ks', 'lasso',
        'lightning', 'mara', 'more', 'mwu', 'rf'. Default is 'classic'.

    scoring : str, optional
        Either 'score' or 'count'.

    outfile : str, optional
        Name of outputfile to save the fitted activity values.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.

    pwmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied.

    cutoff : float, optional
        Cutoff for motif scanning.

    Returns
    -------
    pandas.DataFrame
        DataFrame with motif activity.
    """

    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")
    
    config = MotifConfig()

    m2f = None
    
    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                    "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")
        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
        
        if pwmfile is None:
            raise ValueError("no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        try:
            motifs = read_motifs(open(pwmfile))
        except Exception:
            sys.stderr.write("can't read motifs from {}\n".format(pwmfile))
            raise

        base = os.path.splitext(pwmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            m2f = pd.read_table(map_file, index_col=0)

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        motif_names = [m.id for m in read_motifs(open(pwmfile))]
        scores = []
        if method == 'classic' or scoring == "count":
            for row in s.count(list(df.index), cutoff=cutoff):
                scores.append(row)
        else:
            for row in s.best_score(list(df.index)):
                scores.append(row)

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)   

    motifs = motifs.loc[df.index]
    
    clf_classes = {
        "ks": KSMoap,
        "mwu": MWMoap,
        "rf": RFMoap,
        "lasso": LassoMoap,
        "lightning": LightningMoap,
        "mara": MaraMoap,
        "more": MoreMoap,
        "classic": ClassicMoap,
    }
    if method not in clf_classes:
        raise ValueError("method {} not valid".format(method))
    clf = clf_classes[method]()

    clf.fit(motifs, df)
    
    if outfile:
        with open(outfile, "w") as f:
            f.write("# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
        
        with open(outfile, "a") as f:
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
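
A brief usage sketch for moap(); the input file and genome name are hypothetical placeholders:

# usage sketch: fit motif activities for clustered regions
# ("clusters.txt" and "hg19" are hypothetical placeholders)
act = moap(
    "clusters.txt",   # regions in column 1, cluster labels in column 2
    method="classic",
    scoring="count",
    genome="hg19",
    outfile="activities.txt",
)
print(act.head())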
Exemplo n.º 52
0
def create_background_file(outfile,
                           bg_type,
                           fmt="fasta",
                           size=None,
                           genome=None,
                           inputfile=None,
                           number=10000):
    """
    Create a background file for motif analysis.

    Parameters
    ----------
    outfile : str
        Name of the output file.
    bg_type : str
        Type of background (gc, genomic, random or promoter).
    fmt : str, optional
        Either 'fasta' or 'bed'.
    size : int, optional
        Size of the generated sequences; if not given, it is determined from
        the inputfile.
    genome : str, optional
        Genome name (a genomepy-managed genome).
    inputfile : str, optional
        Input file, used to match GC% and to determine the number of
        sequences if `number` is not given.
    number : int, optional
        Number of sequences to generate.
    """
    fmt = fmt.lower()
    if fmt in ["fa", "fsa"]:
        fmt = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" %
              (",".join(BG_TYPES)))
        sys.exit(1)

    if fmt == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and fmt == "fasta":
        if genome is None:
            print("Need a genome to create background file")
            sys.exit(1)
        Genome(genome)

    if bg_type in ["promoter"]:
        # Gene definition
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(),
                                     "{}.bed".format(genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    if number is None:
        if inputfile:
            number = number_of_seqs_in_file(inputfile)
            logger.info("Using %s of background sequences based on input file",
                        number)
        else:
            number = 10000
            logger.info(
                "Number of background sequences not specified, using 10,000 sequences"
            )

    if bg_type == "random":
        f = Fasta(inputfile)
        m = MarkovFasta(f, n=number, k=1)
        m.writefasta(outfile)
    elif bg_type == "gc":
        if fmt == "fasta":
            m = MatchedGcFasta(inputfile, genome, number=number, size=size)
            m.writefasta(outfile)
        else:
            matched_gc_bedfile(outfile, inputfile, genome, number, size=size)
    else:
        if size is None:
            size = np.median(
                [len(seq) for seq in as_fasta(inputfile, genome=genome).seqs])
        if bg_type == "promoter":
            if fmt == "fasta":
                m = PromoterFasta(gene_file, genome, size=size, n=number)
                m.writefasta(outfile)
            else:
                create_promoter_bedfile(outfile, gene_file, size, number)
        elif bg_type == "genomic":
            if fmt == "fasta":
                m = RandomGenomicFasta(genome, size, number)
                m.writefasta(outfile)
            else:
                create_random_genomic_bedfile(outfile, genome, size, number)
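
For illustration, a minimal usage sketch of create_background_file(); the file names and genome are hypothetical placeholders:

# usage sketch: generate 5,000 GC%-matched background sequences
# ("peaks.fa", "bg.gc.fa" and "hg38" are hypothetical placeholders;
# "hg38" assumes a genome installed via genomepy)
create_background_file(
    "bg.gc.fa",
    "gc",
    fmt="fasta",
    genome="hg38",
    inputfile="peaks.fa",
    number=5000,
)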
Exemplo n.º 53
0
def background(args):

    inputfile = args.inputfile
    out = args.outputfile
    bg_type = args.bg_type
    outformat = args.outformat.lower()
    length = args.length

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" % (",".join(BG_TYPES)))
        sys.exit(1)

    if outformat == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)
    
    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()
        
    # Genome index location for creation of FASTA files
    index_dir = os.path.join(config.get_index_dir(), args.genome)
    if bg_type in ["gc", "genomic", "promoter"] and outformat == "fasta":
        if not os.path.exists(index_dir):
            print("Index for %s does not exist. Has the genome been indexed "
                  "for use with GimmeMotifs?" % args.genome)
            sys.exit(1)
        
    # Gene definition
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % args.genome)
    if bg_type in ["promoter"]:
        if not os.path.exists(gene_file):
            print("Can't find gene definition for %s (%s). See GimmeMotifs "
                  "documentation on how to add gene files." % (args.genome, gene_file))
            sys.exit(1)
    
    # Number of sequences
    number = None
    if args.number:
        number = args.number
    elif inputfile:
        number = number_of_seqs_in_file(inputfile)
    else:
        sys.stderr.write("please provide either a number or an inputfile\n")
        sys.exit(1)
    
    if bg_type == "random":
        f = Fasta(inputfile)
        m = bg.MarkovFasta(f, n=number, k=args.markov_order)
        m.writefasta(out)
    elif bg_type == "gc":
        if outformat in ["fasta", "fa"]:
            m = bg.MatchedGcFasta(inputfile, args.genome, number=number)
            m.writefasta(out)
        else:
            bg.matched_gc_bedfile(out, inputfile, args.genome, number)
    elif bg_type == "promoter":
        if outformat in ["fasta", "fa"]:
            m = bg.PromoterFasta(gene_file, index_dir, length=length, n=number)
            m.writefasta(out)
        else:
            bg.create_promoter_bedfile(out, gene_file, length, number)
    elif bg_type == "genomic":
        if outformat in ["fasta", "fa"]:
            m = bg.RandomGenomicFasta(index_dir, length, number)
            m.writefasta(out)
        else:
            bg.create_random_genomic_bedfile(out, index_dir, length, number)
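
This CLI wrapper expects an argparse-style namespace; a sketch of driving it directly, with every attribute value a hypothetical placeholder:

# usage sketch: the attribute names mirror the ones read above;
# all values are hypothetical placeholders
from argparse import Namespace

args = Namespace(
    inputfile="peaks.fa",
    outputfile="bg.fa",
    bg_type="gc",
    outformat="fasta",
    length=200,
    genome="hg19",
    number=None,       # falls back to the number of input sequences
    markov_order=1,
)
background(args)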
Exemplo n.º 54
0
def create_background(
    bg_type,
    fafile,
    outfile,
    genome="hg18",
    size=200,
    nr_times=10,
    custom_background=None,
):
    """Create background of a specific type.

    Parameters
    ----------
    bg_type : str
        Name of background type.

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name.

    size : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this many times as many background sequences as there are
        in the input file.

    Returns
    -------
    nr_seqs  : int
        Number of sequences created.
    """
    size = int(size)
    config = MotifConfig()
    fg = Fasta(fafile)

    if bg_type in ["genomic", "gc"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_times * len(fg))
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, size, nr_times * len(fg))
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_times * len(fg))
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genome)
        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            raise ValueError()

        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome,
            gene_file,
        )
        f = PromoterFasta(gene_file, genome, size, nr_times * len(fg))
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")

        if not os.path.exists(bg_file):
            raise IOError("Custom background file %s does not exist!" % bg_file)
        else:
            logger.info("Copying custom background file %s to %s.", bg_file,
                        outfile)
            f = Fasta(bg_file)
            median_length = np.median([len(seq) for seq in f.seqs])
            if median_length < (size * 0.95) or median_length > (size * 1.05):
                logger.warn(
                    "The custom background file %s contains sequences with a "
                    "median size of %s, while GimmeMotifs predicts motifs in sequences "
                    "of size %s. This will influence the statistics! It is recommended "
                    "to use background sequences of the same size.",
                    bg_file,
                    median_length,
                    size,
                )

    f.writefasta(outfile)
    return len(f)
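
A short usage sketch of create_background(); file names and genome are hypothetical placeholders:

# usage sketch: build a GC%-matched background with 10x as many
# sequences as the input ("peaks.fa", "bg.fa" and "hg38" are
# hypothetical placeholders)
n = create_background(
    "gc",
    "peaks.fa",
    "bg.fa",
    genome="hg38",
    size=200,
    nr_times=10,
)
print("wrote {} background sequences".format(n))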
Exemplo n.º 55
0
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score"

    pfmfile : str, optional
        Specify a PFM file for scanning.

    ncpus : int, optional
        If defined, this specifies the number of cores to use.

    zscore : bool, optional
        Use z-score normalized motif scores.

    gc : bool, optional
        Use GC% bins to normalize motif scores.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
    """
    config = MotifConfig()

    if pfmfile is None:
        pfmfile = config.get_default_params().get("motif_db", None)
        if pfmfile is not None:
            pfmfile = os.path.join(config.get_motif_dir(), pfmfile)

    if pfmfile is None:
        raise ValueError("no pfmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    if len(regions) >= 1000:
        check_regions = np.random.choice(regions, size=1000, replace=False)
    else:
        check_regions = regions

    size = int(
        np.median([len(seq) for seq in as_fasta(check_regions, genome=genome).seqs])
    )
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pfmfile)
    s.set_genome(genome)
    s.set_background(genome=genome, gc=gc, size=size)

    scores = []
    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        for row in s.count(regions):
            scores.append(row)
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        msg = "creating score table"
        if zscore:
            msg += " (z-score"
            if gc:
                msg += ", GC%"
            msg += ")"
        else:
            msg += " (logodds)"
        logger.info(msg)
        for row in s.best_score(regions, zscore=zscore, gc=gc):
            scores.append(row)
        logger.info("done")

    motif_names = [m.id for m in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
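
A short usage sketch for scan_to_table(); the table and genome names are hypothetical placeholders:

# usage sketch: count motif occurrences for the regions in the index
# column of "regions.txt" ("hg38" assumes a genome installed via genomepy)
counts = scan_to_table(
    "regions.txt",
    "hg38",
    "count",
    ncpus=4,
)
print(counts.head())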
Exemplo n.º 56
0
    def run(self):
        from gimmemotifs.config import MotifConfig
        
        cfg = MotifConfig(use_config=self.build_cfg)

        data_dir = self.remove_nonsense(os.path.abspath(self.install_dir))
        
        cfg.set_template_dir(os.path.join(data_dir, 'gimmemotifs/templates'))
        cfg.set_gene_dir(os.path.join(data_dir, 'gimmemotifs/genes'))
        cfg.set_score_dir(os.path.join(data_dir, 'gimmemotifs/score_dists'))
        cfg.set_index_dir(os.path.join(data_dir, 'gimmemotifs/genome_index'))
        cfg.set_motif_dir(os.path.join(data_dir, 'gimmemotifs/motif_databases'))
        cfg.set_bg_dir(os.path.join(data_dir, 'gimmemotifs/bg'))
        cfg.set_tools_dir(os.path.join(data_dir, 'gimmemotifs/tools'))
        
        final_tools_dir = self.remove_nonsense(self.install_tools_dir)
        for program in MOTIF_CLASSES:
            m = eval(program)()
            if cfg.is_configured(m.name):
                bin = cfg.bin(m.name).replace(self.build_tools_dir, final_tools_dir) 
                dir = cfg.dir(m.name)
                if dir:
                    dir = dir.replace(self.build_tools_dir, final_tools_dir)
                cfg.set_program(m.name, {"bin":bin, "dir":dir})
            
        dir = cfg.get_seqlogo()
        dir = dir.replace(self.build_tools_dir, final_tools_dir)
        cfg.set_seqlogo(dir)

        # Use a user-specific configfile if any other installation scheme is used
        config_file = os.path.join(self.install_dir, "gimmemotifs/%s" % CONFIG_NAME)
        self.outfiles = [config_file]


        if os.path.exists(config_file):
            new_config = config_file + ".tmp"
            dlog.info("INFO: Configfile %s already exists!" % config_file)
            dlog.info("INFO: Will create %s, which contains the new config." % new_config)
            dlog.info("INFO: If you want to use the newly generated config you can move %s to %s, otherwise you can delete %s.\n" % (new_config, config_file, new_config))

            f = open(new_config, "wb")
            cfg.write(f)
            f.close()
        else:
            dlog.info("writing configuration file %s" % config_file)
            f = open(config_file, "wb")
            cfg.write(f)
            f.close()
        
        if os.path.abspath(self.install_dir) != "/usr/share":
            dlog.info("PLEASE NOTE: GimmeMotifs is installed in a non-standard location.")
            dlog.info("PLEASE NOTE: This is fine, but then every user should have a file called ~/.gimmemotifs.cfg")
            dlog.info("PLEASE NOTE: The file %s is fully configured during install and can be used for that purpose." % config_file)
Exemplo n.º 57
0
    def run(self):
        if not os.path.exists(self.build_cfg):
            os.mkdir(self.build_cfg)

        from gimmemotifs.config import MotifConfig
        cfg = MotifConfig(use_config="cfg/gimmemotifs.cfg.base")
        
        dlog.info("locating motif programs")
        available = []
        for program in MOTIF_CLASSES:
            # Get class
            m = eval(program)()
            cmd = m.cmd
            
            ### ugly, fixme :)
            if cmd == "trawler.pl":
                cmd = "trawler/bin/trawler.pl"
            if cmd == "ChIPMunk.sh":
                cmd = "ChIPMunk/ChIPMunk.sh"
            if cmd == "hms":
                cmd = "HMS/hms"

            bin = ""
            if cmd == "/bin/false":
                # motif db
                bin = "/bin/false"    
            elif os.path.exists(os.path.join(self.build_tools_dir, cmd)):
                bin = os.path.join(self.build_tools_dir, cmd)
                dlog.info("using included version of %s: %s" % (program, bin))
            else:
                ### ugly, fixme :)
                if cmd == "trawler/bin/trawler.pl":
                    cmd = "trawler.pl"
                if cmd == "ChIPMunk/ChIPMunk.sh":
                    cmd = "ChIPMunk.sh"
                if cmd == "HMS/hms":
                    cmd = "hms"

                if program in MOTIF_BINS.keys():
                    dlog.info("could not find compiled version of %s" % program)
                bin = which(cmd)
                if bin:
                    dlog.info("using installed version of %s: %s" % (program, bin))
                else:
                    dlog.info("not found: %s" % program)
            
            ### Some more ugly stuff
            if bin:
                dir = bin.replace(m.cmd,"")
                if program == "Weeder":
                    dir = bin.replace("weederTFBS.out","")
                elif program == "Meme":
                    dir = bin.replace("bin/meme.bin", "").replace("meme.bin", "")
                elif program == "Trawler":
                    dir = bin.replace("bin/trawler.pl", "")
                elif program == "ChIPMunk":
                    dir = bin.replace("ChIPMunk.sh", "")

                available.append(m.name)
                cfg.set_program(m.name, {"bin":bin, "dir":dir})

        # Weblogo
        bin = ""
        seq_included = os.path.join(self.build_tools_dir, "seqlogo")
        if os.path.exists(seq_included):
            bin = seq_included
            dlog.info("using included version of weblogo: %s" % seq_included)
        else:
            bin = which("seqlogo")
            dlog.info("using installed version of seqlogo: %s" % (bin))
        if bin:
            cfg.set_seqlogo(bin)
        else:
            dlog.info("couldn't find seqlogo")
        
        # Set the available tools in the config file
        DEFAULT_PARAMS["available_tools"] = ",".join(available)
        
        # Iterate over a copy, as long-running tools are removed from the list
        for tool in available[:]:
            if tool in LONG_RUNNING:
                dlog.info("PLEASE NOTE: %s can take a very long time to run on large datasets. Therefore it is not added to the default tools. You can always enable it later, see documentation for details" % tool)
                available.remove(tool)

        DEFAULT_PARAMS["tools"] = ",".join(available)
        cfg.set_default_params(DEFAULT_PARAMS)

        # Write (temporary) config file
        config_file = os.path.join(self.build_cfg, CONFIG_NAME)
        dlog.info("writing (temporary) configuration file: %s" % config_file)
        f = open(config_file, "wb")
        cfg.write(f)
        f.close()
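
The which() helper used above resolves an executable on the PATH. In case the project's own implementation is not available, a minimal sketch of such a helper:

import os

def which(cmd):
    # Hypothetical stand-in for the which() used above: return the full
    # path of an executable found on the PATH, or None.
    for path in os.environ.get("PATH", "").split(os.pathsep):
        full = os.path.join(path, cmd)
        if os.path.isfile(full) and os.access(full, os.X_OK):
            return full
    return None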