def loadDESeqNormalize(infile, outfile):
    P.load(infile, outfile, transpose=True)


def update_report():
    '''update report.'''
    E.info("updating documentation")
    P.run_report(clean=False)


def loadGeneSetStats(infile, outfile):
    '''load stats on coding and lncRNA gene sets.'''
    P.load(infile, outfile)


def publish_report():
    '''publish report in the CGAT downloads directory.'''
    E.info("publishing report")
    P.publish_report()


def loadControlCPCResults(infile, outfile):
    P.load(infile, outfile,
           options="--header-names=transcript_id,feature,C_NC,CP_score "
           "--add-index=transcript_id")
def getRepeatsFromUCSC(dbhandle, repclasses, outfile,
                       remove_contigs_regex=None):
    '''download repeats from the UCSC database and write to `outfile`
    in :term:`gff` format.

    This method downloads repeats from the repeatmasker track at UCSC.

    Arguments
    ---------
    dbhandle : object
        Database handle to UCSC mysql database.
    repclasses : list
        List of repeat classes to select. If empty, all repeat classes
        will be collected.
    outfile : string
        Filename of output file in :term:`gff` format.
    remove_contigs_regex : list
        If given, remove repeats on contigs matching the regular
        expressions given.
    '''
    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) such as chr1_rmsk, chr2_rmsk, ...
    # In order to do a single statement, the UCSC mysql database is
    # queried for tables that end in rmsk.
    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    # now collect repeats
    tmpfile = P.get_temp_file(".")

    for table in tables:
        sql = """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd,
        '.', strand, '.',
        CONCAT('class \\"', repClass, '\\"; family \\"',
        repFamily, '\\"; repName \\"', repName, '\\";')
        FROM %(table)s"""

        if repclasses:
            repclasses_str = ",".join(
                ["'" + x.strip() + "'" for x in repclasses])
            sql += ''' WHERE repClass in (%(repclasses_str)s) ''' % locals()

        sql = sql % locals()
        E.debug("executing sql statement: %s" % sql)
        cc = dbhandle.execute(sql)
        for data in cc.fetchall():
            tmpfile.write("\t".join(map(str, data)) + "\n")

    tmpfile.close()

    # sort gff and make sure that names are correct
    tmpfilename = tmpfile.name

    statement = ['''cat %(tmpfilename)s
    | sort -t$'\\t' -k1,1 -k4,4n
    | cgat gff2gff
    --method=sanitize
    --sanitize-method=genome
    --skip-missing
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log ''']

    if remove_contigs_regex:
        statement.append('--contig-pattern="{}"'.format(
            ",".join(remove_contigs_regex)))

    statement.append('| gzip > %(outfile)s')

    statement = " ".join(statement)

    P.run(statement)

    os.unlink(tmpfilename)
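A hedged usage sketch, not taken from the pipeline: getRepeatsFromUCSC() only
requires a `dbhandle` whose execute() returns a result object with fetchall(),
which an SQLAlchemy 1.x Engine provides. Host, user and database below are the
publicly documented read-only UCSC MySQL server; they, and the output
filename, are assumptions for illustration. The gff2gff step additionally
needs ``genome_dir`` and ``genome`` supplied via PARAMS.

from sqlalchemy import create_engine

if __name__ == "__main__":
    # public read-only UCSC server (assumed details, for illustration)
    ucsc = create_engine("mysql://genome@genome-mysql.soe.ucsc.edu/hg19")
    # collect LINE and SINE repeats for hg19 into a gff file
    getRepeatsFromUCSC(ucsc,
                       repclasses=["LINE", "SINE"],
                       outfile="repeats.gff.gz")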
import os
import gzip

from ruffus import *

from CGATCore import Experiment as E
from CGATCore import Pipeline as P
import CGATPipelines.PipelineTracks as PipelineTracks
import CGATCore.IOTools as IOTools

###################################################
# Pipeline configuration
###################################################
# load options from the config file
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini"])

PARAMS = P.PARAMS

if os.path.exists("pipeline_conf.py"):
    E.info("reading additional configuration from pipeline_conf.py")
    exec(compile(open("pipeline_conf.py").read(),
                 "pipeline_conf.py", 'exec'))
    PARAMS = P.getParameters()

###################################################
themedir = os.path.join(os.path.dirname(CGATPipelines.__file__),
                        'pipeline_docs', 'themes')
logopath = os.path.join(themedir, "cgat_logo.png")

################################################################
# Import pipeline configuration from pipeline.yml in the current
# directory and the common one.

# PATH where code for pipelines is stored
pipelinesdir = os.path.dirname(CGATPipelines.__file__)

# The default configuration file - 'inifile' is read by
# sphinx-report.
inifile = os.path.join(os.path.dirname(CGATPipelines.__file__),
                       'configuration', 'pipeline.yml')

PARAMS = P.get_parameters([inifile, "pipeline.yml"])

# Definition now part of CGATReport
# def setup(app):
#     app.add_config_value('PARAMS', {}, True)

################################################################
# The pipeline assumes that sphinxreport is called within the
# working directory. If the report is in a separate build directory,
# change the paths below.
#
# directory with export directory from pipeline
# This should be a directory in the build directory - you can
# link from here to a directory outside the build tree, though.
import os

import CGATCore.Experiment as E
import CGATCore.IOTools as IOTools
import CGATPipelines.PipelineMotifs as PipelineMotifs
import CGATPipelines.PipelineTracks as PipelineTracks
from CGATPipelines.Report import run_report

###################################################
# Pipeline configuration
###################################################
from CGATCore import Pipeline as P

P.get_parameters(
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "../pipeline.yml",
     "pipeline.yml"],
    defaults={'annotations_dir': ""})

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peek_parameters(PARAMS["annotations_dir"],
                                       "genesets")

###################################################
# Helper functions mapping tracks to conditions, etc.
###################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample
def sortByPosition(infile, outfile):
    '''sort BAM file by genomic position.'''
    to_cluster = USECLUSTER
    track = P.snip(outfile, ".bam")
    statement = '''samtools sort %(infile)s %(track)s;'''
    P.run(statement)
import sys
import os
import shutil
import sqlite3
import subprocess
import glob
import tempfile

from CGATCore import Experiment as E
from CGATCore import Pipeline as P
import CGAT.Sra as Sra
import CGATPipelines.PipelineRnaseq as RnaSeq
from CGATPipelines.Report import run_report

# load options from the config file
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0],
    "../pipeline.yml",
    "pipeline.yml"])

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(
    P.peek_parameters(
        PARAMS["annotations_dir"],
        'genesets',
        prefix="annotations_",
        update_interface=True,
        restrict_interface=True))

PARAMS["project_src"] = os.path.dirname(__file__)
def loadFilteredData(infile, outfile):
    P.load(infile, outfile)


def loadTimePointDiffExpression(infile, outfile):
    P.load(infile, outfile)


def loadConditionDiffExpression(infile, outfile):
    P.load(infile, outfile)


def loadRepeats(infile, outfile):
    '''load repeat overlap.'''
    P.load(infile, outfile,
           "--add-index=gene_id --map=gene_id:str")
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` in the current
    directory and returns an offset of 0.

    Associations can be defined in the configuration file in the
    section ``[bams]``. For example, the following snippet associates
    track track1 with the bamfiles :file:`track1.bam` and
    :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets need
    to be defined in the ``[offsets]`` section. If no offsets are
    defined, the method returns a list of 0 offsets. Offsets need to
    be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    Default tracks and offsets can be specified using a placeholder
    ``%``. The following will associate all tracks with the same bam
    file::

       [bams]
       %=all.bam

    Returns a list of BAM files and offsets.
    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.as_list(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", r"\S+", pattern)
                    if re.search(p, fn, re.IGNORECASE):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = list(map(int, P.as_list(PARAMS["offsets_%s" % fn.lower()])))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", r"\S+", pattern)
                if re.search(p, fn, re.IGNORECASE):
                    offsets.extend(list(map(int, value.split(","))))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError("number of BAM files %s is not the "
                         "same as number of offsets: %s" %
                         (str(bamfiles), str(offsets)))

    return bamfiles, offsets
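The placeholder mechanics above hinge on one step: a configured ``%`` is
rewritten into the regular expression ``\S+`` and matched case-insensitively
against the track name. A minimal standalone sketch of just that step
(the pattern and track name are hypothetical):

import re

# a config pattern "%" becomes the regex "\S+", matching any track name
pattern = "%"
p = re.sub("%", r"\S+", pattern)
assert re.search(p, "tissue-condition-R1", re.IGNORECASE)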
def buildCodingPotential(infile, outfile):
    '''run CPC analysis as in the cpc script.

    This module runs framefinder and blastx on both strands. It seems
    to work, but I have not thoroughly tested it. I expect that the
    false positive rate increases (i.e., predicting non-coding as
    coding) in cases where the best framefinder match and the best
    blast match are on opposite strands. In the original CPC, these
    would be separated.
    '''
    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set.")

    tmpdir = P.getTempDir(".")
    track = P.snip(outfile, ".coding.gz")

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open(os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write("\t".join(
            ("QueryID", "CDSLength", "Score", "Used", "Strict")) + "\n")
        for line in IOTools.openFile("%s.frame.gz" % track):
            if line.startswith(">"):
                try:
                    (id, start, end, score, used, mode, tpe) = \
                        re.match(
                            r">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}",
                            line).groups()
                except AttributeError:
                    raise ValueError("parsing error in line %s" % line)
                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")
                # write all five fields declared in the header above
                outf.write("\t".join(
                    (id, str(length), score, used, str(strict))) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []
    s.append('''
    zcat %(infile)s
    | perl %(cpc_dir)s/libs/blast2table.pl
    | tee %(tmpdir)s/blastx.table
    | perl %(cpc_dir)s/bin/extract_blastx_features.pl
    > %(tmpdir)s/blastx.feat1;
    ''')

    s.append('''
    cat %(track)s_norepeats.fasta
    | perl %(cpc_dir)s/bin/add_missing_entries.pl
    %(tmpdir)s/blastx.feat1 > %(tmpdir)s/blastx.feat;
    ''')

    # step 2 - prepare data
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA
    %(tmpdir)s/blastx.feat > %(tmpdir)s/blastx.lsv;
    ''')
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA
    %(tmpdir)s/ff.feat > %(tmpdir)s/ff.lsv;
    ''')
    s.append('''
    perl -w %(cpc_dir)s/bin/lsv_cbind.pl
    %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv > %(tmpdir)s/test.lsv;
    ''')
    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale
    -r %(cpc_dir)s/data/libsvm.range
    %(tmpdir)s/test.lsv > %(tmpdir)s/test.lsv.scaled;
    ''')

    # step 3: prediction
    m_libsvm_model0 = os.path.join(cpc_dir, "data/libsvm.model0")  # standard
    m_libsvm_model = os.path.join(cpc_dir, "data/libsvm.model")  # Prob
    m_libsvm_model2 = os.path.join(
        cpc_dir, "data/libsvm.model2")  # Prob + weighted version
    m_libsvm_range = os.path.join(cpc_dir, "data/libsvm.range")

    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
    %(tmpdir)s/test.lsv.scaled
    %(m_libsvm_model0)s
    %(tmpdir)s/test.svm0.predict
    > %(tmpdir)s/test.svm0.stdout 2> %(tmpdir)s/test.svm0.stderr;
    ''')

    s.append('''
    printf "gene_id\\tlength\\tresult\\tvalue\\n"
    | gzip > %(outfile)s;
    cat %(tmpdir)s/test.svm0.predict
    | perl -w %(cpc_dir)s/bin/predict.pl %(track)s_norepeats.fasta
    | gzip >> %(outfile)s;
    ''')

    # generate reports
    s.append('''cat %(tmpdir)s/blastx.feat
    | perl -w %(cpc_dir)s/bin/generate_plot_features.pl
    %(tmpdir)s/blastx.table
    <( zcat %(track)s.frame.gz)
    | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl
    %(outfile)s.homology %(outfile)s.orf;
    gzip %(outfile)s.orf %(outfile)s.homology;
    ''')

    # now run it all
    statement = " checkpoint; ".join(s)
    P.run()

    # clean up
    shutil.rmtree(tmpdir)
def indexIntervals(infile, outfile):
    '''index intervals.'''
    statement = '''zcat %(infile)s
    | sort -k1,1 -k2,2n
    | bgzip > %(outfile)s;
    tabix -p bed %(outfile)s'''
    P.run(statement)
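Once indexed, the intervals can be queried by genomic region. A minimal
sketch, assuming pysam is installed and that ``intervals.bed.gz`` is an
output of the task above (the filename and region are illustrative):

import pysam

# open the bgzip-compressed, tabix-indexed BED file
tbx = pysam.TabixFile("intervals.bed.gz")

# iterate over intervals overlapping chr1:1,000,000-2,000,000
for row in tbx.fetch("chr1", 1000000, 2000000):
    print(row)

tbx.close()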
        P.run(statement)

    elif (os.path.exists(report_dir) and
          os.path.isdir(report_dir) and
          os.listdir(report_dir)):
        sys.exit('''
{} exists, not overwriting. You can manually run:

    make html ; ln -sf _build/html/report_pipeline_pq_example.html . ;
    make latexpdf ; ln -sf _build/latex/pq_example.pdf .

Or delete the folder and re-run make_report.
'''.format(report_dir))

    else:
        sys.exit('''
The directory "pipeline_report" does not exist. Are the paths correct?

An attempt was made to copy the template files from:
    {}

You can also copy the files manually and then run "make html" or
"make latexpdf".
'''.format(report_path))

    return


################
if __name__ == "__main__":
    sys.exit(P.main(sys.argv))
################
def buildMemeBackgroundFiles(infile, outfile):
    '''prepare the MEME background model.'''
    statement = '''fasta-get-markov -m 2 %(infile)s > %(outfile)s''' % locals()
    P.run(statement)
Code
====

"""
from ruffus import *

import sys
import os
import re
import sqlite3

import CGATCore.Experiment as E
from CGATCore import Pipeline as P

# load options from the config file
PARAMS = P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini"])

PARAMS["projectsrc"] = os.path.dirname(__file__)

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(
    P.peekParameters(PARAMS["annotations_dir"],
                     "pipeline_annotations.py",
                     on_error_raise=__name__ == "__main__",
                     prefix="annotations_",
                     update_interface=True))
import re
import glob
import os
import gzip
import sqlite3

from ruffus import *

import CGATCore.Experiment as E
from CGATCore import Pipeline as P
import CGATCore.IOTools as IOTools
import CGATCore.Database as Database
import CGAT.GTF as GTF
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

PARAMS = P.PARAMS

USECLUSTER = True

# link up with annotations
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(
    PARAMS["ancestral_repeats_dir"],
    "pipeline_ancestral_repeats.py")
import os

import CGATCore.Experiment as E
from CGATCore import Pipeline as P
import CGAT.GTF as GTF
import CGATCore.IOTools as IOTools
import CGATPipelines.PipelineLncRNA as PipelineLncRNA

###################################################
# Pipeline configuration
###################################################
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"],
    defaults={
        "annotations_dir": "",
        "genesets_abinitio_coding": "pruned.gtf.gz",
        "genesets_abinitio_lncrna": "pruned.gtf.gz",
        "genesets_reference": "reference.gtf.gz",
        "genesets_refcoding": "refcoding.gtf.gz",
        "genesets_previous": ""})

PARAMS = P.PARAMS

PARAMS.update(
    P.peekParameters(PARAMS["annotations_dir"],
                     "pipeline_annotations.py",
                     prefix="annotations_",
                     update_interface=True))
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.'''

    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]
    tablename = "%s_evol" % track

    if os.path.exists("%s_rates.load" % track):
        stmt_select.append("a.distance AS ks, a.aligned AS aligned")
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
        ON r.gene_id = a.gene_id AND
        a.aligned >= %(rates_min_aligned)i AND
        a.distance <= %(rates_max_rate)f''')

    if os.path.exists("%s_coverage.load" % track):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage")
        stmt_from.append(
            "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id")

    if os.path.exists("%s_repeats_gc.load" % track):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc")
        stmt_from.append(
            "LEFT JOIN %(track)s_repeats_gc AS ar_gc "
            "ON r.gene_id = ar_gc.gene_id")

    if os.path.exists("%s_repeats_rates.load" % track):
        stmt_select.append(
            "ar.exons_length AS ar_aligned, ar.exons_median AS ka, "
            "a.distance/ar.exons_median AS kska")
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar
        ON r.gene_id = ar.gene_id AND
        ar.exons_nval >= %(rates_min_repeats)i''')

    if os.path.exists("%s_introns_rates.load" % track):
        stmt_select.append(
            "ir.aligned AS ir_aligned, ir.distance AS ki, "
            "a.distance/ir.distance AS kski")
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir
        ON r.gene_id = ir.gene_id AND
        ir.aligned >= %(rates_min_aligned)i''')

    x = locals()
    x.update(PARAMS)

    stmt_select = ", ".join(stmt_select) % x
    stmt_from = " ".join(stmt_from) % x
    stmt_where = " AND ".join(stmt_where) % x

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    Database.executewait(
        dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals())

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT CAST(r.gene_id AS TEXT) AS gene_id,
           r.exons_sum AS length,
           r.exons_pGC AS pgc,
           %(stmt_select)s
    FROM %(track)s_annotation AS r %(stmt_from)s
    WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait(dbhandle, statement)
    dbhandle.commit()

    P.touch(outfile)
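The statement above is assembled in two interpolation passes: each join
fragment is first formatted against the merged ``locals()``/``PARAMS``
dictionary, and the final ``CREATE TABLE`` is then formatted against
``locals()`` again. A standalone sketch of that pattern, with hypothetical
names and thresholds:

# two-stage SQL assembly, as in loadSummary() above
track = "mytrack"
rates_min_aligned = 100  # would normally come from PARAMS

stmt_select = ["a.distance AS ks"]
stmt_from = ["LEFT JOIN %(track)s_rates AS a "
             "ON r.gene_id = a.gene_id "
             "AND a.aligned >= %(rates_min_aligned)i"]

# first pass: resolve placeholders inside each fragment
x = locals()
stmt_select = ", ".join(stmt_select) % x
stmt_from = " ".join(stmt_from) % x

# second pass: splice the resolved fragments into the full statement
statement = '''CREATE TABLE %(track)s_evol AS
SELECT r.gene_id, %(stmt_select)s
FROM %(track)s_annotation AS r %(stmt_from)s''' % locals()
print(statement)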
def build_report():
    '''build report from scratch.'''
    E.info("starting documentation build process from scratch")
    P.run_report(clean=True)


def loadOverrun(infile, outfile):
    '''load annotations.'''
    P.load(infile, outfile,
           "--add-index=gene_id --map=gene_id:str")


def main(argv=None):
    if argv is None:
        argv = sys.argv
    P.main(argv)


def loadDistances(infile, outfile):
    '''load annotations.'''
    P.load(infile, outfile,
           "--add-index=gene_id --map=gene_id:str "
           "--add-index=closest_id --map=closest_id:str")

    table = outfile[:-len(".load")]


def loadLncRNAPhyloCSF(infile, outfile):
    tmpf = P.getTempFilename("/ifs/scratch")
    PipelineLncRNA.parsePhyloCSF(infile, tmpf)
    P.load(tmpf, outfile, options="--add-index=gene_id")


def loadCombinedExpression(infile, outfile):
    P.load(infile, outfile)