Example #1
def runGLAM2SCAN(infiles, outfile):
    '''run glam2scan on all intervals and motifs.
    '''

    to_cluster = True
    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles
    controlfile = dbfile[:-len(".fasta")] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise P.PipelineError("control file %s for %s does not exist" %
                              (controlfile, dbfile))

    if os.path.exists(outfile): os.remove(outfile)

    for motiffile in motiffiles:
        of = IOTools.openFile(outfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s ::\n" % motif)
        of.close()

        statement = '''
        cat %(dbfile)s %(controlfile)s | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s - >> %(outfile)s
        '''
        P.run()
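A minimal, self-contained sketch of how the ``statement``/``P.run()`` idiom above is commonly understood to work: the %(name)s placeholders are assumed to be filled from the pipeline parameter dictionary plus the caller's local variables before the command is sent to the shell or the cluster. All names and values below are illustrative, not part of the pipeline.

# illustrative parameter dictionary; the real pipeline presumably uses its PARAMS
PARAMS = {"execglam2scan": "glam2scan", "glam2scan_results": 100}

def interpolate_statement(statement, **local_vars):
    """Fill %(name)s placeholders from parameters plus caller-supplied locals."""
    values = dict(PARAMS)
    values.update(local_vars)
    return statement % values

cmd = interpolate_statement(
    "cat %(dbfile)s %(controlfile)s | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s - >> %(outfile)s",
    dbfile="intervals.fasta", controlfile="intervals.controlfasta",
    motiffile="motif_1.glam2", outfile="motif_1.glam2scan")
print(cmd)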
Example #2
def getMinimumMappedReads( infiles ):
    '''find the minimum number of mapped reads in infiles.'''
    v = []
    for infile in infiles:
        x = getMappedReads( infile )
        if x: v.append( x )
    if len(v) == 0:
        raise P.PipelineError( "could not find mapped reads in files %s" % (str(infiles)))
    return min(v)
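A small usage sketch for the helper above. ``getMappedReads`` is not shown in this example; the stub below only illustrates the assumed contract (an integer count per file, or a falsy value when no count is available).

def getMappedReads(infile):
    # illustrative stub only; the real helper presumably parses mapping statistics
    counts = {"liver.bam": 1200000, "kidney.bam": 950000, "heart.bam": 0}
    return counts.get(infile, 0)

# "heart.bam" is skipped because its count is falsy, so the minimum is 950000
print(getMinimumMappedReads(("liver.bam", "kidney.bam", "heart.bam")))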
Example #3
def runRegexMotifSearch(infiles, outfile):
    '''run a regular expression motif search on sequences
    and compute counts.
    '''

    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise P.PipelineError("control file %s for %s does not exist" %
                              (controlfile, dbfile))

    motifs = []
    for x in range(0, 15):
        motifs.append(
            ("DR%i" % x, re.compile(motif + "." * x + motif, re.IGNORECASE)))
    for x in range(0, 15):
        motifs.append(("ER%i" % x,
                       re.compile(motif + "." * x + reverse_motif,
                                  re.IGNORECASE)))

    db_positions = Motifs.countMotifs(IOTools.openFile(dbfile, "r"), motifs)
    control_positions = Motifs.countMotifs(IOTools.openFile(controlfile, "r"),
                                           motifs)

    db_counts, control_counts = Motifs.getCounts(
        db_positions), Motifs.getCounts(control_positions)
    db_seqcounts, control_seqcounts = Motifs.getOccurances(
        db_positions), Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\tseq_control\tseq_control_percent\tfold\n"
    )
    for motif, pattern in motifs:
        try:
            fold = float(db_seqcounts[motif]) * ncontrol / (
                ndb * control_seqcounts[motif])
        except ZeroDivisionError:
            fold = 0

        outf.write( "%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" % \
                    (motif,
                     db_counts[motif],
                     control_counts[motif],
                     db_seqcounts[motif],
                     IOTools.prettyPercent( db_seqcounts[motif], ndb),
                     control_seqcounts[motif],
                     IOTools.prettyPercent( control_seqcounts[motif], ncontrol),
                     fold) )
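For reference, a self-contained sketch of what the generated patterns and the fold enrichment look like. The spacer lengths and counts below are made up; only the pattern construction and the fold formula mirror the function above.

import re

motif, reverse_motif = "[AG]G[GT]T[CG]A", "T[GC]A[CA]C[TC]"
print(re.compile(motif + "." * 2 + motif).pattern)          # DR2: two copies of the motif, 2 bp spacer
print(re.compile(motif + "." * 4 + reverse_motif).pattern)  # ER4: motif, 4 bp spacer, reverse-complemented motif

# fold = (fraction of foreground sequences with a hit) / (fraction of control sequences with a hit)
db_seqcount, ndb = 120, 1000            # foreground: sequences with a hit, total sequences (made up)
control_seqcount, ncontrol = 30, 1000   # control: sequences with a hit, total sequences (made up)
print(float(db_seqcount) * ncontrol / (ndb * control_seqcount))  # 4.0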
Example #4
def buildAnnotatorAnnotations(tmpdir,
                              outfile,
                              annotations=None,
                              bedfiles=None,
                              gfffiles=None,
                              gofile=None):
    '''write annotations in annotator format.
    '''

    tmpannotations = os.path.join(tmpdir, "annotations")
    to_cluster = True
    job_options = "-l mem_free=4000M"

    if annotations == "architecture":
        statement = '''
         cat %(promotors)s %(annotation)s 
         | python %(scriptsdir)s/gff2annotator2tsv.py 
         	--section=annotations-gff 
         	--log=%(outfile)s.log 
                --remove-regex='%(annotator_remove_pattern)s'
         > %(tmpannotations)s
        '''
    elif annotations == "go":
        statement = '''
        python %(scriptsdir)s/gff2annotator2tsv.py 
        --section=annotations-go 
        --input-filename-map=<(cut -f 2,4 < %(gofile)s) 
        --log=%(outfile)s.log
        --remove-regex='%(annotator_remove_pattern)s'
        < %(annotator_geneterritories)s  
        > %(tmpannotations)s
        '''
    elif bedfiles:
        bedfiles = " ".join(bedfiles)
        statement = '''
        cat %(bedfiles)s 
        | python %(scriptsdir)s/bed2annotator2tsv.py 
        --max-length=0 
        --merge 
        --section=annotations 
        --log=%(outfile)s.log 
        > %(tmpannotations)s
        '''
    else:
        raise P.PipelineError("unknown annotations '%s'" % annotations)

    P.run()

    return tmpannotations
Example #5
def buildAnnotatorAnnotations(tmpdir,
                              outfile,
                              annotations=None,
                              bedfiles=None,
                              gofile=None):
    '''write annotations.'''

    tmpannotations = os.path.join(tmpdir, "annotations")
    to_cluster = True
    job_options = "-l mem_free=4000M"

    if annotations == "architecture":
        statement = '''
         cat %(promotors)s %(annotation)s |\
         python %(scriptsdir)s/gff2annotator.py \
         	--section=annotations-gff \
         	--log=%(outfile)s.log \
         > %(tmpannotations)s
        '''
    elif annotations == "go":
        statement = '''
        cat %(annotator_geneterritories)s |\
        python %(scriptsdir)s/gff2annotator.py \
        --section=annotations-go \
        --input-filename-map=<(cut -f 2,4 < %(gofile)s) \
        --log=%(outfile)s.log \
        > %(tmpannotations)s
        '''
    elif bedfiles:
        bedfiles = " ".join(bedfiles)
        statement = '''
        cat %(bedfiles)s |\
        python %(scriptsdir)s/bed2annotator.py \
        --max-length=0 \
        --merge \
        --section=annotations \
        --log=%(outfile)s.log \
        > %(tmpannotations)s
        '''
    else:
        raise P.PipelineError("unknown annotations '%s'" % workspace)

    P.run()

    return tmpannotations
Example #6
    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            motif, part = re.match(r":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise P.PipelineError("parsing error in line '%s'" %
                                  lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast
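``readChunk`` closes over ``lines`` and ``chunks`` from its enclosing function, which are not part of this example. A self-contained sketch of how they are presumably built (the same idiom appears in the GLAM2SCAN loader below): chunk boundaries are the ":: motif = ... ::" marker lines written by the upstream task.

lines = [":: motif = nfkb - foreground ::\n", "mast output for the foreground set\n",
         ":: motif = nfkb - background ::\n", "mast output for the background set\n"]
chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
chunks.append(len(lines))
print(chunks)  # [0, 2, 4]
# lines[chunks[i]] is the header of chunk i; lines[chunks[i] + 1:chunks[i + 1]] is its body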
Example #7
def getExpressionMeasurements(track):
    '''return a tuple (control, probesets, treatments, controls)
    where control is the control track, probesets is an array of n
    probesets and treatments/controls are tuples of n-length arrays.'''

    control = getExpressionControl(track)
    if track == control:
        raise P.PipelineError("track (%s) == control (%s)" % (track, control))

    replicates_sample = getExpressionReplicates(track)
    assert len(replicates_sample) > 0, "no replicates for sample %s" % track

    replicates_control = getExpressionReplicates(control)
    assert len(
        replicates_control) > 0, "no replicates for control %s" % control

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()

    track_samples = ",".join(["a.%s" % x for x in replicates_sample])
    control_samples = ",".join(["b.%s" % x for x in replicates_control])

    statement = """SELECT a.cluster_id, 
                        %(track_samples)s, 
                        %(control_samples)s
                        FROM %(track)s_levels AS a, 
                             %(control)s_levels AS b 
                        WHERE a.cluster_id = b.cluster_id""" % locals()

    cc.execute(statement)

    r = zip(*cc.fetchall())
    nreplicates_sample = len(replicates_sample)
    treatments = r[1:nreplicates_sample + 1]
    controls = r[nreplicates_sample + 1:]

    return (control, r[0], treatments, controls)
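A hypothetical usage sketch; the track name is illustrative and the call assumes the ``<track>_levels`` and ``<control>_levels`` tables exist in the pipeline database.

control, probesets, treatments, controls = getExpressionMeasurements("liver_day3")
# each replicate column has one value per probeset, e.g. for a paired treatment/control test
for probeset, t_values, c_values in zip(probesets, zip(*treatments), zip(*controls)):
    pass  # feed t_values and c_values into the downstream statistical test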
Example #8
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that
    all sequences are output and MAST curves can be computed. 

    10000 is a heuristic.
    '''
    to_cluster = True

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise P.PipelineError("control file %s for %s does not exist" %
                              (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile): os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
Example #9
def loadGLAM2SCAN(infile, outfile):
    '''parse a glam2scan output file and load it into the database.

    Parse several motif runs and add them to the same
    table.
    '''
    tablename = outfile[:-len(".load")]
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(
        "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

    lines = IOTools.openFile(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    for chunk in range(len(chunks) - 1):

        # use real file, as parser can not deal with a
        # list of lines

        try:
            motif = re.match(r":: motif = (\S+) ::",
                             lines[chunks[chunk]]).groups()[0]
        except AttributeError:
            raise P.PipelineError("parsing error in line '%s'" %
                                  lines[chunks[chunk]])

        if chunks[chunk] + 1 == chunks[chunk + 1]:
            L.warn("no results for motif %s - ignored" % motif)
            continue

        tmpfile2 = tempfile.NamedTemporaryFile(delete=False)
        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()
        glam = Glam2Scan.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        # collect control data
        full_matches = collections.defaultdict(list)
        controls = collections.defaultdict(list)
        for match in glam.matches:
            m = match.id.split("_")
            track, id = m[:2]
            if len(m) == 2:
                full_matches[id].append(match)
            else:
                controls[id].append(match.score)

        for id, matches in full_matches.iteritems():

            nmatches = len(matches)
            scores = [x.score for x in matches]
            score = max(scores)
            # move to genomic coordinates
            #contig, start, end = re.match( "(\S+):(\d+)..(\d+)", match.id).groups()
            #start, end = int(start), int(end)
            #match.start += start
            #match.end += start
            contig = ""

            if id not in controls:
                P.warn("no controls for %s - increase evalue?" % id)

            c = controls[id]
            if len(c) == 0: mmax = ""
            else: mmax = max(c)

            tmpfile.write("\t".join(
                map(str, (motif, id, nmatches, score,
                          ",".join(map(str, scores)), len(c), mmax))) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              -b sqlite \
              --index=id \
              --index=motif \
              --index=id,motif \
              --table=%(tablename)s \
              --map=base_qualities:text \
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)
Example #10
def buildAnnotatorWorkSpace(tmpdir,
                            outfile,
                            workspaces=("genomic", ),
                            gc_control=False):
    '''write genomic workspace.'''

    to_cluster = True
    job_options = "-l mem_free=4000M"

    tmpworkspaces = []

    if gc_control:
        tmpworkspace = os.path.join(tmpdir, "workspace_gc")
        tmpsynonyms = os.path.join(tmpdir, "synonyms")
        tmpworkspaces.append(tmpworkspace)

        statement = '''
        awk '{ printf("%%s\\t%%s\\t%%s\\t%%s.%%s\\n", $1,$2,$3,$1,$4)}' < %(annotator_gc)s |\
        python %(scriptsdir)s/bed2gff.py |\
 	python %(scriptsdir)s/gff2annotator.py \
		--section=workspace \
                --output-filename-synonyms=%(tmpsynonyms)s \
                --max-length=0 \
		--log=%(outfile)s.log \
        > %(tmpworkspace)s'''

        P.run()
    else:
        tmpsynonyms = None

    for workspace in workspaces:
        tmpworkspace = os.path.join(tmpdir, "workspace_%s" % workspace)

        if workspace == "genomic":
            statement = '''
            python %(scriptsdir)s/gff2annotator.py \
                    --section=workspace \
                    --max-length=0 \
                    --log=%(outfile)s.log \
            < %(genome)s.gff > %(tmpworkspace)s
            '''
        elif workspace == "promotors":
            statement = '''
            python %(scriptsdir)s/gff2annotator.py \
                    --section=workspace \
                    --max-length=0 \
                    --log=%(outfile)s.log \
            < %(promotors)s > %(tmpworkspace)s
            '''
        elif workspace == "promotors-go":
            # promotors with GO categories
            statement = '''cat < %(promotors)s |\
            python %(scriptsdir)s/gtf2gtf.py \
            --filter=gene \
            --apply=<(cut -f 2 < %(gofile)s | sort | uniq ) |\
            python %(scriptsdir)s/gff2annotator.py \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
            > %(tmpworkspace)s
            '''
        elif workspace == "gene-territories":
            statement = '''
            python %(scriptsdir)s/gff2annotator.py \
                    --section=workspace \
                    --max-length=0 \
                    --log=%(outfile)s.log \
            < %(annotator_geneterritories)s > %(tmpworkspace)s
            '''
        elif workspace == "mappable":
            statement = '''
            python %(scriptsdir)s/bed2gff.py < %(annotator_mappability)s |\
            python %(scriptsdir)s/gff2annotator.py \
                    --section=workspace \
                    --max-length=0 \
                    --log=%(outfile)s.log \
            > %(tmpworkspace)s
            '''
        else:
            raise P.PipelineError("unknown workspace '%s'" % workspace)

        P.run(**dict(locals().items() + PARAMS.items()))
        tmpworkspaces.append(tmpworkspace)

    return tmpworkspaces, tmpsynonyms
Example #11
def buildWorkSpace(outfile, workspace):
    '''write a genomic workspace.

    Available workspaces are:

    genomic
       the full genome (requires ``genome`` to be set)
    intergenic, intronic, cds
       the corresponding regions (require ``enrichment_regions`` to be set)
    unknown
       intronic plus intergenic regions (requires ``enrichment_regions`` to be set)
    known
       CDS, UTR and flanking regions (requires ``enrichment_regions`` to be set)
    gene-territories
       gene territories (requires ``enrichment_geneterritories`` to be set)
    mappable
       the mappable part of the genome (requires ``enrichment_mappability`` to be set)
    alignable
       only the alignable part of the genome (requires ``enrichment_alignment`` to be set)
    '''

    to_cluster = True
    job_options = "-l mem_free=4000M"

    workspace = workspace.lower()

    if workspace == "genomic":
        P.checkParameter("genome")

        statement = '''
        python %(scriptsdir)s/index2bed.py 
                --genome=%(genome)s 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace in ("intergenic", "intronic", "cds"):

        P.checkParameter("enrichment_regions")

        workspace_upper = workspace.upper()

        statement = '''
        gunzip < %(enrichment_regions)s 
        | awk 'BEGIN {printf("track name=%(workspace)s\\n"); } 
               ($3 == "%(workspace)s" 
               || $3 == "%(workspace_upper)s") 
               && !( $1 ~ /%(enrichment_remove_pattern)s/)
               { printf("%%s\\t%%i\\t%%i\\n", $1, $4-1, $5); }'
        > %(outfile)s
        '''
    elif workspace == "unknown":

        P.checkParameter("enrichment_regions")
        statement = '''
        awk '($3 == "intronic" || $3 == "intergenic" )' 
        < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py 
                --section=workspace 
                --max-length=0 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "known":
        P.checkParameter("enrichment_regions")
        statement = '''
        awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' 
        < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py 
                --section=workspace 
                --max-length=0 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''

    elif workspace == "alignable":

        P.checkParameter("enrichment_alignment")
        statement = '''gunzip
        < %(enrichment_alignment)s 
        | sort -k10,10 
        | awk '$10 !~ /%(enrichment_remove_pattern)s/ \
            {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
            printf("\\t(%%i,%%i)", $12,$13); }; \
        END {printf ("\\n");}'\
        > %(outfile)s
        '''

    elif workspace == "gene-territories":

        P.checkParameter("enrichment_geneterritories")
        statement = '''
        python %(scriptsdir)s/gff2enrichment.py \
                --section=workspace \
                --max-length=0 \
                --log=%(outfile)s.log \
                --remove-regex='%(enrichment_remove_pattern)s'
        < %(enrichment_geneterritories)s > %(outfile)s
        '''

    elif workspace == "mappable":

        P.checkParameter("enrichment_mappability")
        statement = '''
        python %(scriptsdir)s/bed2gff.py < %(enrichment_mappability)s 
        | python %(scriptsdir)s/gff2enrichment.py 
                --section=workspace 
                --max-length=0 
                --log=%(outfile)s.log 
                --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''
    else:
        raise P.PipelineError("unknown workspace '%s'" % workspace)

    P.run()
Example #12
def buildAnnotatorWorkSpace(tmpdir,
                            outfile,
                            workspaces=("genomic", ),
                            gc_control=False):
    '''write genomic workspaces.

    Available workspaces are:

    genomic
       the full genome (requires ``genome`` to be set)
    all
       is ignored
    intergenic, intronic, CDS
       the corresponding regions (require ``annotator_regions`` to be set)
    unknown
       intronic plus intergenic regions (requires ``annotator_regions`` to be set)
    known
       CDS, UTR and flanking regions (requires ``annotator_regions`` to be set)
    gene-territories
       gene territories (requires ``annotator_geneterritories`` to be set)
    mappable
       the mappable part of the genome (requires ``annotator_mappability`` to be set)
    alignable
       only the alignable part of the genome (requires ``annotator_alignment`` to be set)

    If ``gc_control`` is True, the chromosomes will be divided into isochores
    (requires the parameter ``annotator_gc_workspace`` to be set).
    '''

    to_cluster = True
    job_options = "-l mem_free=4000M"

    tmpworkspaces = []

    if gc_control:
        P.checkParameter("annotator_gc_workspace")

        tmpsynonyms = PARAMS["annotator_gc_workspace"] + ".synonyms"
        tmpworkspaces.append(PARAMS["annotator_gc_workspace"])
    else:
        tmpsynonyms = None

    for workspace in workspaces:

        tmpworkspace = os.path.join(tmpdir, "workspace_%s" % workspace)

        if workspace == "all":
            continue
        elif workspace == "genomic":
            P.checkParameter("genome")

            statement = '''
            python %(scriptsdir)s/index2gff.py 
                    --genome=%(genome)s 
                    --log=%(outfile)s.log 
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''

        elif workspace in ("intergenic", "intronic", "CDS"):
            P.checkParameter("annotator_regions")
            statement = '''
            awk '$3 == "%(workspace)s"' 
            < %(annotator_regions)s
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''
        elif workspace == "unknown":
            P.checkParameter("annotator_regions")
            statement = '''
            awk '($3 == "intronic" || $3 == "intergenic" )' 
            < %(annotator_regions)s
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''

        elif workspace == "known":
            P.checkParameter("annotator_regions")
            statement = '''
            awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' 
            < %(annotator_regions)s
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''

        elif workspace == "alignable":
            P.checkParameter("annotator_alignment")
            statement = '''gunzip
            < %(annotator_alignment)s 
            | sort -k10,10 
            | awk '$10 !~ /%(annotator_remove_pattern)s/ \
		{if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
		printf("\\t(%%i,%%i)", $12,$13); }; \
	    END {printf ("\\n");}'\
	    > %(tmpworkspace)s
            '''

        elif workspace == "gene-territories":
            P.checkParameter("annotator_geneterritories")
            statement = '''
            python %(scriptsdir)s/gff2annotator2tsv.py \
                    --section=workspace \
                    --max-length=0 \
                    --log=%(outfile)s.log \
                    --remove-regex='%(annotator_remove_pattern)s'
            < %(annotator_geneterritories)s > %(tmpworkspace)s
            '''

        elif workspace == "mappable":
            P.checkParameter("annotator_mappability")
            statement = '''
            python %(scriptsdir)s/bed2gff.py < %(annotator_mappability)s 
            | python %(scriptsdir)s/gff2annotator2tsv.py 
                    --section=workspace 
                    --max-length=0 
                    --log=%(outfile)s.log 
                    --remove-regex='%(annotator_remove_pattern)s'
            > %(tmpworkspace)s
            '''
        else:
            raise P.PipelineError("unknown workspace '%s'" % workspace)

        P.run()
        tmpworkspaces.append(tmpworkspace)

    return tmpworkspaces, tmpsynonyms
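A hypothetical usage sketch for the function above; the output name and workspace selection are illustrative only.

tmpdir = P.getTempDir(".")
workspaces, synonyms = buildAnnotatorWorkSpace(
    tmpdir, "out.annotator", workspaces=("genomic", "gene-territories"), gc_control=True)
# ``workspaces`` lists the workspace files written into ``tmpdir`` (plus the
# pre-built GC workspace when gc_control is True); ``synonyms`` is None unless
# gc_control is set.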