def runGLAM2SCAN(infiles, outfile):
    '''scan foreground and control sequences with glam2scan, motif by motif.

    ``infiles`` is unpacked as ``(controlfile, dbfile, motiffiles)``. The
    control file given in ``infiles`` is not used; the control file name is
    derived from ``dbfile`` by replacing its trailing ``.fasta`` with
    ``.controlfasta``.

    An existing ``outfile`` is removed first. For every motif file a
    ``:: motif = <name> ::`` header line is appended to ``outfile`` and the
    glam2scan statement is run, which appends its output to the same file.

    Raises P.PipelineError if the derived control file does not exist.
    '''

    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up ``statement``, ``to_cluster`` and the %(name)s placeholders
    # from the local variables of this function - keep these names.
    to_cluster = True
    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    # the first element (control file) is deliberately discarded
    _, dbfile, motiffiles = infiles

    fasta_suffix = ".fasta"
    controlfile = dbfile[:-len(fasta_suffix)] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # start from a clean slate, everything below appends
    if os.path.exists(outfile):
        os.remove(outfile)

    # the template is the same for all motifs, only %(motiffile)s changes
    statement = '''
    cat %(dbfile)s %(controlfile)s | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s - >> %(outfile)s
    '''

    for motiffile in motiffiles:
        header = ":: motif = %s ::\n" % os.path.splitext(motiffile)[0]
        handle = IOTools.openFile(outfile, "a")
        handle.write(header)
        handle.close()

        P.run()
def getMinimumMappedReads(infiles):
    '''return the smallest mapped-read count over all files in *infiles*.

    Files for which getMappedReads() returns a false value (e.g. 0 or
    None) are ignored.

    Raises P.PipelineError if no file yields a usable count.
    '''
    counts = [nreads for nreads in map(getMappedReads, infiles) if nreads]

    if not counts:
        raise P.PipelineError(
            "could not find mapped reads in files %s" % (str(infiles)))

    return min(counts)
def runRegexMotifSearch(infiles, outfile):
    '''run a regular expression search on sequences and tabulate counts.

    ``infiles`` is unpacked as ``(controlfile, dbfile)``.

    Thirty case-insensitive patterns are searched for:

    DR0..DR14
        ``motif``, followed by 0-14 arbitrary characters, followed by
        ``motif`` again.
    ER0..ER14
        ``motif``, followed by 0-14 arbitrary characters, followed by
        ``reverse_motif`` (the reverse complement of ``motif``).

    One tab-separated row per pattern is written to ``outfile`` with the
    columns motif, motifs_db, motifs_control, seq_db, seq_db_percent,
    seq_control, seq_control_percent and fold. ``fold`` is the ratio of the
    db and control sequence counts, each normalised by the number of entries
    in the respective positions collection; it is 0 if the denominator
    is zero.

    Raises P.PipelineError if the control file does not exist.
    '''

    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # DR patterns first, then ER patterns - this is also the output order.
    motifs = []
    for prefix, partner in (("DR", motif), ("ER", reverse_motif)):
        for spacer in range(0, 15):
            motifs.append(
                ("%s%i" % (prefix, spacer),
                 re.compile(motif + "." * spacer + partner, re.IGNORECASE)))

    def count_positions(filename):
        # The result is used with len() and handed to two different
        # summary functions below, so it is a materialised collection and
        # the handle can be closed once countMotifs() has returned.
        handle = IOTools.openFile(filename, "r")
        try:
            return Motifs.countMotifs(handle, motifs)
        finally:
            handle.close()

    db_positions = count_positions(dbfile)
    control_positions = count_positions(controlfile)

    db_counts = Motifs.getCounts(db_positions)
    control_counts = Motifs.getCounts(control_positions)

    # Fix: the control set previously used getCounts() here while the
    # foreground used getOccurances(), so seq_control, its percentage and
    # fold compared two different quantities.
    db_seqcounts = Motifs.getOccurances(db_positions)
    control_seqcounts = Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write(
            "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\tseq_control\tseq_control_percent\tfold\n"
        )

        for name, _pattern in motifs:
            try:
                fold = float(db_seqcounts[name]) * ncontrol / (
                    ndb * control_seqcounts[name])
            except ZeroDivisionError:
                fold = 0

            outf.write(
                "%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" %
                (name,
                 db_counts[name],
                 control_counts[name],
                 db_seqcounts[name],
                 IOTools.prettyPercent(db_seqcounts[name], ndb),
                 control_seqcounts[name],
                 IOTools.prettyPercent(control_seqcounts[name], ncontrol),
                 fold))
    finally:
        # the output handle was previously never closed explicitly
        outf.close()
def buildAnnotatorAnnotations(tmpdir,
                              outfile,
                              annotations=None,
                              bedfiles=None,
                              gfffiles=None,
                              gofile=None):
    '''write annotations in annotator format.

    The annotations are written to ``<tmpdir>/annotations`` and that path
    is returned. The source is selected as follows:

    annotations == "architecture"
        promotors and annotation gff files piped through
        gff2annotator2tsv.py.
    annotations == "go"
        gene territories piped through gff2annotator2tsv.py, using
        columns 2 and 4 of ``gofile`` as filename map.
    bedfiles given (and annotations is neither of the above)
        all bed files concatenated and piped through bed2annotator2tsv.py.

    ``gfffiles`` is accepted but not used.

    Raises P.PipelineError if none of the cases applies.
    '''

    # NOTE(review): P.run() is called without arguments and presumably
    # reads ``statement``, ``to_cluster``, ``job_options`` and the
    # placeholders from the locals of this function - keep these names.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    tmpannotations = os.path.join(tmpdir, "annotations")

    # bail out early when there is nothing we know how to build
    named_source = annotations in ("architecture", "go")
    if not named_source and not bedfiles:
        raise P.PipelineError("unknown annotations '%s'" % annotations)

    if annotations == "architecture":
        statement = '''
        cat %(promotors)s %(annotation)s
        | python %(scriptsdir)s/gff2annotator2tsv.py
              --section=annotations-gff
              --log=%(outfile)s.log
              --remove-regex='%(annotator_remove_pattern)s'
        > %(tmpannotations)s
        '''
    elif annotations == "go":
        statement = '''
        python %(scriptsdir)s/gff2annotator2tsv.py
              --section=annotations-go
              --input-filename-map=<(cut -f 2,4 < %(gofile)s)
              --log=%(outfile)s.log
              --remove-regex='%(annotator_remove_pattern)s'
        < %(annotator_geneterritories)s
        > %(tmpannotations)s
        '''
    else:
        # the statement needs the file names as a single shell word list
        bedfiles = " ".join(bedfiles)
        statement = '''
        cat %(bedfiles)s
        | python %(scriptsdir)s/bed2annotator2tsv.py
              --max-length=0
              --merge
              --section=annotations
              --log=%(outfile)s.log
        > %(tmpannotations)s
        '''

    P.run()

    return tmpannotations
def buildAnnotatorAnnotations(tmpdir,
                              outfile,
                              annotations=None,
                              bedfiles=None,
                              gofile=None):
    '''write annotations.

    The annotations are written to ``<tmpdir>/annotations`` and that path
    is returned. The source is selected as follows:

    annotations == "architecture"
        promotors and annotation gff files piped through gff2annotator.py.
    annotations == "go"
        gene territories piped through gff2annotator.py, using columns
        2 and 4 of ``gofile`` as filename map.
    bedfiles given (and annotations is neither of the above)
        all bed files concatenated and piped through bed2annotator.py.

    Raises P.PipelineError if none of the cases applies.
    '''

    tmpannotations = os.path.join(tmpdir, "annotations")

    # NOTE(review): P.run() is called without arguments and presumably
    # reads ``statement``, ``to_cluster``, ``job_options`` and the
    # placeholders from the locals of this function - keep these names.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    if annotations == "architecture":
        statement = '''
        cat %(promotors)s %(annotation)s |\
        python %(scriptsdir)s/gff2annotator.py \
            --section=annotations-gff \
            --log=%(outfile)s.log \
        > %(tmpannotations)s
        '''
    elif annotations == "go":
        statement = '''
        cat %(annotator_geneterritories)s |\
        python %(scriptsdir)s/gff2annotator.py \
            --section=annotations-go \
            --input-filename-map=<(cut -f 2,4 < %(gofile)s) \
            --log=%(outfile)s.log \
        > %(tmpannotations)s
        '''
    elif bedfiles:
        # the statement needs the file names as a single shell word list
        bedfiles = " ".join(bedfiles)
        statement = '''
        cat %(bedfiles)s |\
        python %(scriptsdir)s/bed2annotator.py \
            --max-length=0 \
            --merge \
            --section=annotations \
            --log=%(outfile)s.log \
        > %(tmpannotations)s
        '''
    else:
        # Fix: this used to format the undefined name ``workspace`` and
        # therefore died with a NameError instead of the intended error.
        raise P.PipelineError("unknown annotations '%s'" % annotations)

    P.run()

    return tmpannotations
def readChunk(lines, chunk):
    '''parse one section of a concatenated MAST output.

    ``lines`` is the list of lines of the file and ``chunk`` the index of
    the section to read. The section boundaries are taken from ``chunks``,
    which is not a parameter and has to be supplied by the enclosing
    scope: ``chunks[chunk]`` is the index of the section's header line and
    ``chunks[chunk + 1]`` the index one past its last line.

    The header line has to look like ``:: motif = <motif> - <part> ::``.

    Returns a tuple ``(motif, part, mast)`` where ``mast`` is the result
    of MAST.parse() on the section body.

    Raises P.PipelineError if the header line can not be parsed.
    '''
    header = lines[chunks[chunk]]

    # Validate the header before creating the temporary file, so that a
    # parsing error does not leave a stray file behind.
    header_match = re.match(r":: motif = (\S+) - (\S+) ::", header)
    if header_match is None:
        raise P.PipelineError("parsing error in line '%s'" % header)
    motif, part = header_match.groups()

    E.info("reading %s - %s" % (motif, part))

    body = "".join(lines[chunks[chunk] + 1:chunks[chunk + 1]])

    # use real file, as MAST parser can not deal with a
    # list of lines
    tmpfile2 = P.getTempFile(".")
    try:
        try:
            tmpfile2.write(body)
        finally:
            tmpfile2.close()
        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))
    finally:
        # remove the temporary file even if writing or parsing failed
        os.unlink(tmpfile2.name)

    return motif, part, mast
def getExpressionMeasurements(track):
    '''return expression levels of *track* and of its control.

    The levels are read from the tables ``<track>_levels`` and
    ``<control>_levels`` of the database ``PARAMS["database"]``, joined on
    ``cluster_id``. The replicate names returned by
    getExpressionReplicates() are used as column names.

    Returns a tuple ``(control, probesets, treatments, controls)`` where
    ``control`` is the name of the control track, ``probesets`` is the
    column of cluster ids and ``treatments``/``controls`` are sequences
    with one column of values per replicate.

    Raises P.PipelineError if the track is its own control,
    AssertionError if the track or the control have no replicates and
    IndexError if the query returns no rows.
    '''

    control = getExpressionControl(track)
    if track == control:
        raise P.PipelineError(
            "track (%s) == control (%s)" % (track, control))

    replicates_sample = getExpressionReplicates(track)
    assert len(replicates_sample) > 0, "no replicates for sample %s" % track
    replicates_control = getExpressionReplicates(control)
    assert len(
        replicates_control) > 0, "no replicates for control %s" % control

    # Table and column names can not be bound as SQL parameters, hence
    # the statement is assembled by string formatting.
    statement = """SELECT a.cluster_id, %(track_samples)s, %(control_samples)s
    FROM %(track)s_levels AS a, %(control)s_levels AS b
    WHERE a.cluster_id = b.cluster_id""" % {
        "track_samples": ",".join(
            ["a.%s" % x for x in replicates_sample]),
        "control_samples": ",".join(
            ["b.%s" % x for x in replicates_control]),
        "track": track,
        "control": control,
    }

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        cc.execute(statement)
        rows = cc.fetchall()
    finally:
        # the connection was previously never closed
        dbhandle.close()

    # transpose rows into columns; list() keeps the result sliceable
    # where zip() returns an iterator
    r = list(zip(*rows))

    nreplicates_sample = len(replicates_sample)
    treatments = r[1:nreplicates_sample + 1]
    controls = r[nreplicates_sample + 1:]

    return (control, r[0], treatments, controls)
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    ``infiles`` is unpacked as ``(controlfile, dbfile, motiffiles)``. If
    ``dbfile`` is empty, ``outfile`` is touched and nothing else is done.
    Empty motif files are skipped.

    For each remaining motif, mast is run once on ``dbfile`` and once on
    ``controlfile``. Each mast.txt result is appended to a temporary file
    after a header line ``:: motif = <name> - foreground ::`` or
    ``:: motif = <name> - background ::``. The temporary file is finally
    gzipped into ``outfile``; messages of mast go to ``<outfile>.log``.

    Raises P.PipelineError if the control file does not exist.
    '''

    # NOTE(review): P.run() is called without arguments and presumably
    # reads ``statement``, ``to_cluster`` and the placeholders from the
    # locals of this function - keep these names.
    to_cluster = True

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    # mast bails if the number of nucleotides gets larger than
    # 2186800982?
    # To avoid this, run db and control file separately.
    foreground_statement = '''
    cat %(dbfile)s
    | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s
    >> %(outfile)s.log 2>&1;
    cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
    '''

    background_statement = '''
    cat %(controlfile)s
    | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s
    >> %(outfile)s.log 2>&1;
    cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
    '''

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    try:
        for motiffile in motiffiles:
            if IOTools.isEmpty(motiffile):
                L.info("skipping empty motif file %s" % motiffile)
                continue

            motif = os.path.splitext(motiffile)[0]

            for part, statement in (("foreground", foreground_statement),
                                    ("background", background_statement)):
                # the header line marks the start of a section
                of = IOTools.openFile(tmpfile, "a")
                of.write(":: motif = %s - %s ::\n" % (motif, part))
                of.close()

                P.run()

        statement = "gzip < %(tmpfile)s > %(outfile)s"
        P.run()
    finally:
        # clean up the temporary directory and file even if a run failed;
        # previously they were left behind on any error
        shutil.rmtree(tmpdir)
        os.unlink(tmpfile)
def loadGLAM2SCAN(infile, outfile):
    '''parse glam2scan output and load it into the database.

    ``infile`` holds the results of several motif runs. Each run starts
    with a header line ``:: motif = <name> ::``; all runs are added to the
    same table. The table name is ``outfile`` without its trailing
    ``.load``.

    Match ids are split at ``_``. The second field is used as the
    identifier; ids with exactly two fields are counted as matches, ids
    with more fields contribute their score to the controls of that
    identifier. One row is written per identifier that has matches, with
    the columns motif, id, nmatches, score (maximum match score), scores
    (comma-separated match scores), ncontrols and max_controls (empty if
    there are no controls).

    Raises P.PipelineError if a header line can not be parsed.
    '''
    tablename = outfile[:-len(".load")]

    # read the input up front and close the handle explicitly
    inf = IOTools.openFile(infile)
    try:
        lines = inf.readlines()
    finally:
        inf.close()

    # indices of the header lines, plus an end sentinel so that
    # consecutive pairs delimit one motif run each
    chunks = [x for x, line in enumerate(lines) if line.startswith("::")]
    chunks.append(len(lines))

    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    try:
        tmpfile.write(
            "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

        for start, end in zip(chunks[:-1], chunks[1:]):

            header_match = re.match(r":: motif = (\S+) ::", lines[start])
            if header_match is None:
                raise P.PipelineError(
                    "parsing error in line '%s'" % lines[start])
            motif = header_match.groups()[0]

            if start + 1 == end:
                L.warn("no results for motif %s - ignored" % motif)
                continue

            # use real file, as parser can not deal with a
            # list of lines
            tmpfile2 = tempfile.NamedTemporaryFile(delete=False)
            try:
                try:
                    tmpfile2.write("".join(lines[start + 1:end]))
                finally:
                    tmpfile2.close()
                glam = Glam2Scan.parse(IOTools.openFile(tmpfile2.name, "r"))
            finally:
                # remove the per-motif file even if parsing failed
                os.unlink(tmpfile2.name)

            # collect control data
            full_matches = collections.defaultdict(list)
            controls = collections.defaultdict(list)
            for match in glam.matches:
                m = match.id.split("_")
                track, seq_id = m[:2]
                if len(m) == 2:
                    full_matches[seq_id].append(match)
                else:
                    controls[seq_id].append(match.score)

            for seq_id, matches in full_matches.items():

                nmatches = len(matches)
                scores = [x.score for x in matches]
                score = max(scores)

                if seq_id not in controls:
                    P.warn(
                        "no controls for %s - increase evalue?" % seq_id)

                c = controls[seq_id]
                if len(c) == 0:
                    mmax = ""
                else:
                    mmax = max(c)

                tmpfile.write("\t".join(
                    map(str,
                        (motif, seq_id,
                         nmatches,
                         score,
                         ",".join(map(str, scores)),
                         len(c),
                         mmax))) + "\n")

        tmpfile.close()

        # NOTE(review): P.run() is called without arguments and presumably
        # reads ``statement`` and the placeholders from the locals of this
        # function - keep ``tmpfilename``, ``tablename`` and ``outfile``.
        tmpfilename = tmpfile.name

        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
            -b sqlite \
            --index=id \
            --index=motif \
            --index=id,motif \
            --table=%(tablename)s \
            --map=base_qualities:text \
        < %(tmpfilename)s > %(outfile)s
        '''

        P.run()
    finally:
        # clean up even if parsing or loading failed; close() is a no-op
        # if the file has been closed already
        tmpfile.close()
        os.unlink(tmpfile.name)
def buildAnnotatorWorkSpace(tmpdir,
                            outfile,
                            workspaces=("genomic", ),
                            gc_control=False):
    '''write annotator workspaces into *tmpdir*.

    One file ``<tmpdir>/workspace_<name>`` is built for every name in
    ``workspaces``. Known names are ``genomic``, ``promotors``,
    ``promotors-go``, ``gene-territories`` and ``mappable``.

    If ``gc_control`` is true, an additional workspace
    ``<tmpdir>/workspace_gc`` is built first from ``annotator_gc``,
    together with a synonyms file ``<tmpdir>/synonyms``.

    Returns a tuple ``(tmpworkspaces, tmpsynonyms)``: the list of
    workspace files in the order they were built and the name of the
    synonyms file (None without ``gc_control``).

    Raises P.PipelineError for an unknown workspace name.
    '''

    # NOTE(review): the P.run() calls presumably read ``statement``,
    # ``to_cluster``, ``job_options`` and the placeholders from the locals
    # of this function - keep these names.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    # statement templates, one per workspace name
    templates = {
        "genomic": '''
        python %(scriptsdir)s/gff2annotator.py \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
        < %(genome)s.gff > %(tmpworkspace)s
        ''',
        "promotors": '''
        python %(scriptsdir)s/gff2annotator.py \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
        < %(promotors)s > %(tmpworkspace)s
        ''',
        # promotors with GO categories
        "promotors-go": '''cat < %(promotors)s |\
        python %(scriptsdir)s/gtf2gtf.py \
            --filter=gene \
            --apply=<(cut -f 2 < %(gofile)s | sort | uniq ) |\
        python %(scriptsdir)s/gff2annotator.py \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
        > %(tmpworkspace)s
        ''',
        "gene-territories": '''
        python %(scriptsdir)s/gff2annotator.py \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
        < %(annotator_geneterritories)s > %(tmpworkspace)s
        ''',
        "mappable": '''
        python %(scriptsdir)s/bed2gff.py < %(annotator_mappability)s |\
        python %(scriptsdir)s/gff2annotator.py \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
        > %(tmpworkspace)s
        ''',
    }

    tmpworkspaces = []
    tmpsynonyms = None

    if gc_control:
        tmpworkspace = os.path.join(tmpdir, "workspace_gc")
        tmpsynonyms = os.path.join(tmpdir, "synonyms")
        tmpworkspaces.append(tmpworkspace)

        statement = '''
        awk '{ printf("%%s\\t%%s\\t%%s\\t%%s.%%s\\n", $1,$2,$3,$1,$4)}' < %(annotator_gc)s |\
        python %(scriptsdir)s/bed2gff.py |\
        python %(scriptsdir)s/gff2annotator.py \
            --section=workspace \
            --output-filename-synonyms=%(tmpsynonyms)s \
            --max-length=0 \
            --log=%(outfile)s.log \
        > %(tmpworkspace)s'''

        P.run()

    for workspace in workspaces:
        tmpworkspace = os.path.join(tmpdir, "workspace_%s" % workspace)

        statement = templates.get(workspace)
        if statement is None:
            raise P.PipelineError("unknown workspace '%s'" % workspace)

        # locals and PARAMS are handed over explicitly here, PARAMS
        # taking precedence on duplicate keys
        P.run(**dict(locals().items() + PARAMS.items()))
        tmpworkspaces.append(tmpworkspace)

    return tmpworkspaces, tmpsynonyms
def buildWorkSpace(outfile, workspace):
    '''write a workspace to *outfile*.

    The workspace name is matched case-insensitively. Available
    workspaces and the parameters they require are:

    genomic
       the full genome (requires ``genome``)
    intergenic, intronic, cds
       regions of the given type, output in bed format with a track line
       (requires ``enrichment_regions``, which is read gzip-compressed)
    unknown
       intronic and intergenic regions (requires ``enrichment_regions``)
    known
       CDS, UTR and flank regions (requires ``enrichment_regions``)
    alignable
       only the alignable part of a genome (requires
       ``enrichment_alignment``)
    gene-territories
       gene territories (requires ``enrichment_geneterritories``)
    mappable
       mappable part of genome (requires ``enrichment_mappability``)

    Raises P.PipelineError for any other workspace name.
    '''

    # NOTE(review): P.run() is called without arguments and presumably
    # reads ``statement``, ``to_cluster``, ``job_options`` and the
    # placeholders from the locals of this function - keep these names.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    workspace = workspace.lower()
    # used by the region statement to match upper case feature names
    workspace_upper = workspace.upper()

    region_recipe = ("enrichment_regions", '''
        gunzip < %(enrichment_regions)s
        | awk 'BEGIN {printf("track name=%(workspace)s\\n"); } ($3 == "%(workspace)s" || $3 == "%(workspace_upper)s") && !( $1 ~ /%(enrichment_remove_pattern)s/) { printf("%%s\\t%%i\\t%%i\\n", $1, $4-1, $5); }'
        > %(outfile)s
        ''')

    # workspace name -> (required parameter, statement template)
    recipes = {
        "genomic": ("genome", '''
        python %(scriptsdir)s/index2bed.py
            --genome=%(genome)s
            --log=%(outfile)s.log
            --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''),
        "intergenic": region_recipe,
        "intronic": region_recipe,
        "cds": region_recipe,
        "unknown": ("enrichment_regions", '''
        awk '($3 == "intronic" || $3 == "intergenic" )' < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py
            --section=workspace
            --max-length=0
            --log=%(outfile)s.log
            --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''),
        "known": ("enrichment_regions", '''
        awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' < %(enrichment_regions)s
        | python %(scriptsdir)s/gff2enrichment.py
            --section=workspace
            --max-length=0
            --log=%(outfile)s.log
            --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''),
        "alignable": ("enrichment_alignment", '''gunzip < %(enrichment_alignment)s
        | sort -k10,10
        | awk '$10 !~ /%(enrichment_remove_pattern)s/ \
            {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
            printf("\\t(%%i,%%i)", $12,$13); }; \
        END {printf ("\\n");}'\
        > %(outfile)s
        '''),
        "gene-territories": ("enrichment_geneterritories", '''
        python %(scriptsdir)s/gff2enrichment.py \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
            --remove-regex='%(enrichment_remove_pattern)s'
        < %(enrichment_geneterritories)s
        > %(outfile)s
        '''),
        "mappable": ("enrichment_mappability", '''
        python %(scriptsdir)s/bed2gff.py < %(enrichment_mappability)s
        | python %(scriptsdir)s/gff2enrichment.py
            --section=workspace
            --max-length=0
            --log=%(outfile)s.log
            --remove-regex='%(enrichment_remove_pattern)s'
        > %(outfile)s
        '''),
    }

    if workspace not in recipes:
        raise P.PipelineError("unknown workspace '%s'" % workspace)

    required_parameter, statement = recipes[workspace]
    P.checkParameter(required_parameter)

    P.run()
def buildAnnotatorWorkSpace(tmpdir,
                            outfile,
                            workspaces=("genomic", ),
                            gc_control=False):
    '''write annotator workspaces into *tmpdir*.

    One file ``<tmpdir>/workspace_<name>`` is built for every name in
    ``workspaces``. Available workspaces and the parameters they require
    are:

    all
       is ignored
    genomic
       the full genome (requires ``genome``)
    intergenic, intronic, CDS
       regions of the given type (requires ``annotator_regions``)
    unknown
       intronic and intergenic regions (requires ``annotator_regions``)
    known
       CDS, UTR and flank regions (requires ``annotator_regions``)
    alignable
       only the alignable part of a genome (requires
       ``annotator_alignment``)
    gene-territories
       gene territories (requires ``annotator_geneterritories``)
    mappable
       mappable part of genome (requires ``annotator_mappability``)

    If ``gc_control`` is true, the pre-built workspace given by the
    parameter ``annotator_gc_workspace`` is put first in the list of
    workspaces and ``<annotator_gc_workspace>.synonyms`` is returned as
    synonyms file.

    Returns a tuple ``(tmpworkspaces, tmpsynonyms)``; ``tmpsynonyms`` is
    None without ``gc_control``.

    Raises P.PipelineError for an unknown workspace name.
    '''

    # NOTE(review): P.run() is called without arguments and presumably
    # reads ``statement``, ``to_cluster``, ``job_options`` and the
    # placeholders from the locals of this function - keep these names.
    to_cluster = True
    job_options = "-l mem_free=4000M"

    region_recipe = ("annotator_regions", '''
        awk '$3 == "%(workspace)s"' < %(annotator_regions)s
        | python %(scriptsdir)s/gff2annotator2tsv.py
            --section=workspace
            --max-length=0
            --log=%(outfile)s.log
            --remove-regex='%(annotator_remove_pattern)s'
        > %(tmpworkspace)s
        ''')

    # workspace name -> (required parameter, statement template)
    recipes = {
        "genomic": ("genome", '''
        python %(scriptsdir)s/index2gff.py
            --genome=%(genome)s
            --log=%(outfile)s.log
        | python %(scriptsdir)s/gff2annotator2tsv.py
            --section=workspace
            --max-length=0
            --log=%(outfile)s.log
            --remove-regex='%(annotator_remove_pattern)s'
        > %(tmpworkspace)s
        '''),
        "intergenic": region_recipe,
        "intronic": region_recipe,
        "CDS": region_recipe,
        "unknown": ("annotator_regions", '''
        awk '($3 == "intronic" || $3 == "intergenic" )' < %(annotator_regions)s
        | python %(scriptsdir)s/gff2annotator2tsv.py
            --section=workspace
            --max-length=0
            --log=%(outfile)s.log
            --remove-regex='%(annotator_remove_pattern)s'
        > %(tmpworkspace)s
        '''),
        "known": ("annotator_regions", '''
        awk '($3 == "CDS" || $3 ~ /UTR/ || $3 ~ /flank/)' < %(annotator_regions)s
        | python %(scriptsdir)s/gff2annotator2tsv.py
            --section=workspace
            --max-length=0
            --log=%(outfile)s.log
            --remove-regex='%(annotator_remove_pattern)s'
        > %(tmpworkspace)s
        '''),
        "alignable": ("annotator_alignment", '''gunzip < %(annotator_alignment)s
        | sort -k10,10
        | awk '$10 !~ /%(annotator_remove_pattern)s/ \
            {if ($10!=l) {printf("\\n##Work\\t%%s", $10); l=$10;} \
            printf("\\t(%%i,%%i)", $12,$13); }; \
        END {printf ("\\n");}'\
        > %(tmpworkspace)s
        '''),
        "gene-territories": ("annotator_geneterritories", '''
        python %(scriptsdir)s/gff2annotator2tsv.py \
            --section=workspace \
            --max-length=0 \
            --log=%(outfile)s.log \
            --remove-regex='%(annotator_remove_pattern)s'
        < %(annotator_geneterritories)s
        > %(tmpworkspace)s
        '''),
        "mappable": ("annotator_mappability", '''
        python %(scriptsdir)s/bed2gff.py < %(annotator_mappability)s
        | python %(scriptsdir)s/gff2annotator2tsv.py
            --section=workspace
            --max-length=0
            --log=%(outfile)s.log
            --remove-regex='%(annotator_remove_pattern)s'
        > %(tmpworkspace)s
        '''),
    }

    tmpworkspaces = []
    tmpsynonyms = None

    if gc_control:
        # the isochore workspace is pre-built, nothing to run here
        P.checkParameter("annotator_gc_workspace")
        tmpsynonyms = PARAMS["annotator_gc_workspace"] + ".synonyms"
        tmpworkspaces.append(PARAMS["annotator_gc_workspace"])

    for workspace in workspaces:
        tmpworkspace = os.path.join(tmpdir, "workspace_%s" % workspace)

        if workspace == "all":
            continue

        if workspace not in recipes:
            raise P.PipelineError("unknown workspace '%s'" % workspace)

        required_parameter, statement = recipes[workspace]
        P.checkParameter(required_parameter)

        P.run()
        tmpworkspaces.append(tmpworkspace)

    return tmpworkspaces, tmpsynonyms