def orthologTripleWithFeature(infile, outfile):
    '''Generate list of conserved genes associated with feature in all species.

    Attaches every annotation database listed in
    ``PARAMS["annotations_db"]`` (paired by position with
    ``PARAMS["species"]``) to the pipeline database, rebuilds the table
    ``ortholog_triple_with_feature`` with one row per ``set_id`` and
    finally touches *outfile*.

    :param infile: not referenced by this function.
    :param outfile: file that is touched once the table has been built.
    '''
    tablename = "ortholog_triple_with_feature"
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    # species name -> annotation database file
    species_lookup = dict(zip(species_list, anno_list))

    create_table = (
        'CREATE TABLE %(tablename)s AS'
        ' SELECT count(distinct o.schema) as species_count,'
        ' group_concat(o.gene_id,",") as gene_ids,'
        ' group_concat(g.gene_name,",") as gene_names,'
        ' group_concat(o.schema,",") as species_list,'
        ' set_id'
        ' FROM genelists_merged g, triple_ortholog_groups o'
        ' WHERE g.gene_id=o.gene_id'
        ' GROUP BY set_id ') % {"tablename": tablename}

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        for species, species_db in species_lookup.items():
            cc.execute("ATTACH DATABASE '%s' as %s" % (species_db, species))

        # Rebuild the result table from scratch
        cc.execute("DROP TABLE IF EXISTS %s" % tablename)
        cc.execute(create_table)
        cc.close()
        dbhandle.commit()
    finally:
        # always release the connection, even if a statement fails
        dbhandle.close()

    # P.run() takes no arguments; it presumably picks up the local
    # ``statement`` from this frame (confirm in the Pipeline module).
    statement = "touch %s" % outfile
    P.run()
def orthologTripleWithFeature(infile, outfile):
    '''Generate list of conserved genes associated with feature in all species.

    Attaches every annotation database listed in
    ``PARAMS["annotations_db"]`` (paired by position with
    ``PARAMS["species"]``) to the pipeline database, rebuilds the table
    ``ortholog_triple_with_feature`` with one row per ``set_id`` and
    finally touches *outfile*.

    :param infile: not referenced by this function.
    :param outfile: file that is touched once the table has been built.
    '''
    tablename = "ortholog_triple_with_feature"
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    # species name -> annotation database file
    species_lookup = dict(zip(species_list, anno_list))

    create_table = (
        'CREATE TABLE %(tablename)s AS'
        ' SELECT count(distinct o.schema) as species_count,'
        ' group_concat(o.gene_id,",") as gene_ids,'
        ' group_concat(g.gene_name,",") as gene_names,'
        ' group_concat(o.schema,",") as species_list,'
        ' set_id'
        ' FROM genelists_merged g, triple_ortholog_groups o'
        ' WHERE g.gene_id=o.gene_id'
        ' GROUP BY set_id ') % {"tablename": tablename}

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        for species, species_db in species_lookup.items():
            cc.execute("ATTACH DATABASE '%s' as %s" % (species_db, species))

        # Rebuild the result table from scratch
        cc.execute("DROP TABLE IF EXISTS %s" % tablename)
        cc.execute(create_table)
        cc.close()
        dbhandle.commit()
    finally:
        # always release the connection, even if a statement fails
        dbhandle.close()

    # P.run() takes no arguments; it presumably picks up the local
    # ``statement`` from this frame (confirm in the Pipeline module).
    statement = "touch %s" % outfile
    P.run()
def exportConservedGeneBed(infile, outfile):
    '''Export bed file for each list of conserved CAPseq genes.

    The first two characters of the file name of *infile* select the
    species; the matching GTF file (``PARAMS["annotations_gtf"]``, paired
    by position with ``PARAMS["species"]``) is filtered by the gene list
    in *infile*, transcripts are merged and the result is written as bed
    to *outfile*.

    :param infile: gene list whose file name starts with the species code.
    :param outfile: bed file to create; logs go to ``<outfile>.log``.
    :raises KeyError: if the species code is not in ``PARAMS["species"]``.
    '''
    species_list = P.asList(PARAMS["species"])
    gtf_list = P.asList(PARAMS["annotations_gtf"])
    species_lookup = dict(zip(species_list, gtf_list))

    # Take the species code from the file name rather than the raw path,
    # so that an infile given with a directory prefix still resolves.
    species = os.path.basename(infile)[0:2]
    species_gtf = species_lookup[species]
    track = P.snip(os.path.basename(infile), ".export")

    # P.run() takes no arguments; it presumably fills the %(name)s
    # placeholders from this frame's locals and PARAMS (confirm in the
    # Pipeline module), so the local names used below must be kept.
    statement = (
        "zcat %(species_gtf)s"
        " | python %(scriptsdir)s/gtf2gtf.py --filter=gene"
        " --apply=%(infile)s --log=%(outfile)s.log"
        " | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts"
        " --with-utr --log=%(outfile)s.log"
        " | python %(scriptsdir)s/gff2bed.py --is-gtf --name=gene_id"
        " --track=feature --log=%(outfile)s.log"
        " | grep -v track > %(outfile)s;")
    P.run()
def exportConservedGeneBed(infile, outfile):
    '''Export bed file for each list of conserved CAPseq genes.

    The first two characters of the file name of *infile* select the
    species; the matching GTF file (``PARAMS["annotations_gtf"]``, paired
    by position with ``PARAMS["species"]``) is filtered by the gene list
    in *infile*, transcripts are merged and the result is written as bed
    to *outfile*.

    :param infile: gene list whose file name starts with the species code.
    :param outfile: bed file to create; logs go to ``<outfile>.log``.
    :raises KeyError: if the species code is not in ``PARAMS["species"]``.
    '''
    species_list = P.asList(PARAMS["species"])
    gtf_list = P.asList(PARAMS["annotations_gtf"])
    species_lookup = dict(zip(species_list, gtf_list))

    # Take the species code from the file name rather than the raw path,
    # so that an infile given with a directory prefix still resolves.
    species = os.path.basename(infile)[0:2]
    species_gtf = species_lookup[species]
    track = P.snip(os.path.basename(infile), ".export")

    # P.run() takes no arguments; it presumably fills the %(name)s
    # placeholders from this frame's locals and PARAMS (confirm in the
    # Pipeline module), so the local names used below must be kept.
    statement = (
        "zcat %(species_gtf)s"
        " | python %(scriptsdir)s/gtf2gtf.py --filter=gene"
        " --apply=%(infile)s --log=%(outfile)s.log"
        " | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts"
        " --with-utr --log=%(outfile)s.log"
        " | python %(scriptsdir)s/gff2bed.py --is-gtf --name=gene_id"
        " --track=feature --log=%(outfile)s.log"
        " | grep -v track > %(outfile)s;")
    P.run()
def buildCheckSums(infile, outfile):
    '''Build checksums for files in the build directory.

    Files are uncompressed before computing the checksum as gzip
    stores meta information such as the time stamp.

    The file types to include come from ``PARAMS["<track>_suffixes"]``,
    falling back to ``PARAMS["suffixes"]``, where ``<track>`` is *infile*
    without its ``.log`` suffix.

    :raises ValueError: if no suffixes are configured.
    '''
    track = P.snip(infile, ".log")

    fallback = PARAMS["suffixes"]
    file_types = P.asList(PARAMS.get('%s_suffixes' % track, fallback))
    if not file_types:
        raise ValueError('no file types defined for test')

    # alternation of all suffixes in find's regex syntax, shell-quoted
    alternatives = r"\|".join(file_types)
    regex_pattern = pipes.quote(r".*\(%s\)" % alternatives)

    # ignore log files as time stamps will be different.
    # P.run() presumably fills the %(name)s placeholders from this
    # frame's locals and PARAMS (confirm in the Pipeline module).
    statement = (
        'find %(track)s.dir -type f'
        ' -not -regex ".*.log"'
        ' -regex %(regex_pattern)s'
        ' -exec %(scriptsdir)s/cgat_file_apply.sh {} md5sum \\;'
        ' | perl -p -e "s/ +/\\t/g"'
        ' | sort -k1,1 > %(outfile)s')
    P.run()
def exportPairsScoreMatrix2(infile, outfile):
    '''Write the pairwise ortholog scores as a tab-separated matrix.

    One row is written per species in ``PARAMS["species"]``; each row
    holds the scores read from table ``pairwise_ortholog_stats`` for that
    species against every partner species, plus 1.0 for the species
    itself, ordered by partner name (descending).  The header row is
    taken from the partner names returned for the first species.

    :param infile: not referenced by this function.
    :param outfile: file to write the matrix to.
    '''
    species_list = P.asList(PARAMS["species"])

    # NOTE(review): the second branch selects ``score2`` although it
    # matches on ``species2``; ``score1`` may have been intended - confirm
    # against the table definition before changing.
    query = (
        ' SELECT species, score from'
        ' ( SELECT species2 as species, score2 as score'
        ' from pairwise_ortholog_stats where species1="%(species)s"'
        ' UNION'
        ' SELECT species1 as species, score2 as score'
        ' from pairwise_ortholog_stats where species2="%(species)s"'
        ' UNION'
        ' SELECT "%(species)s" as species, 1.0 as score)'
        ' ORDER BY species desc')

    # One connection for all species, released even when a query fails.
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        with open(outfile, "w") as outs:
            cc = dbhandle.cursor()
            for index, species in enumerate(species_list):
                cc.execute(query % {"species": species})
                rows = cc.fetchall()

                # If first write headers
                if index == 0:
                    outs.write("species")
                    outs.write("".join("\t%s" % row[0] for row in rows))
                    outs.write("\n")

                outs.write(species)
                outs.write("".join("\t%s" % row[1] for row in rows))
                outs.write("\n")
            cc.close()
    finally:
        dbhandle.close()
def exportConservedGeneListPerSpecies(infile, outfile):
    '''Export list of conserved genes associated with feature for each species.

    For every species in ``PARAMS["species"]`` the distinct gene ids of
    ortholog groups with ``species_count=6`` whose schema starts with
    ``cgat_<species>`` are written, one tab-separated row per line, to
    ``<species>.conserved.export`` in the current working directory.
    *outfile* is touched afterwards.

    :param infile: not referenced by this function.
    :param outfile: file that is touched once all lists are written.
    '''
    species_list = P.asList(PARAMS["species"])

    query = (
        'SELECT distinct g.gene_id'
        ' FROM ortholog_groups g, ortholog_groups_with_feature f'
        ' WHERE f.set_id=g.set_id'
        ' AND f.species_count=6'
        ' AND g.schema LIKE "cgat_%(species)s%%"')

    # Get gene list from database
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        for species in species_list:
            cc = dbhandle.cursor()
            cc.execute(query % {"species": species})

            # Write to file; ``with`` closes it even if a write fails
            with open(species + ".conserved.export", "w") as outs:
                for result in cc:
                    outs.write("\t".join(str(r) for r in result) + "\n")
            cc.close()
    finally:
        dbhandle.close()

    # P.run() takes no arguments; it presumably picks up the local
    # ``statement`` from this frame (confirm in the Pipeline module).
    statement = "touch %s" % outfile
    P.run()
def buildIndirectMaps(infile, outfile, track):
    '''Build a map between query and target, linking via
    intermediate targets.

    The chain of intermediate maps is read from ``PARAMS["<track>_path"]``;
    each entry ``part`` must exist as ``<part>.over.psl.gz``.  The first
    file is streamed through ``gunzip``, every further file is applied
    with ``pslMap`` and the result is gzipped into *outfile*.

    :raises ValueError: if one of the required psl files is missing.
    '''
    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])
    E.info("path=%s" % str(path))

    commands = []
    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) not exist." %
                (filename, outfile, stage))

        if stage:
            # later stages map the running result through the next file
            commands.append(
                " pslMap stdin <(gunzip < %s) stdout " % filename)
        else:
            # the first file starts the pipe
            commands.append("gunzip < %s" % filename)

    commands.append("gzip")

    # P.run() presumably reads ``statement`` from this frame (confirm in
    # the Pipeline module).
    statement = "%s > %s " % (" | ".join(commands), outfile)
    P.run()
def runTomTom(infile, outfile):
    '''Compare ab-initio motifs against tomtom.

    Runs ``tomtom`` on *infile* against the databases listed in
    ``PARAMS["tomtom_databases"]``, moves the result directory to
    ``<exportdir>/tomtom/<outfile>`` and copies its ``tomtom.txt`` to
    *outfile*.  If *infile* is empty, *outfile* is only touched.

    :param infile: motif file passed to tomtom.
    :param outfile: destination of the ``tomtom.txt`` table.
    '''
    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    # Request the temporary directory only once it is actually needed, so
    # the early return above does not leave one behind.
    tmpdir = P.getTempDir(".")

    # P.run() presumably fills the %(name)s placeholders from this
    # frame's locals and PARAMS (confirm in the Pipeline module).
    statement = (
        " tomtom %(tomtom_options)s -oc %(tmpdir)s"
        " %(infile)s %(databases)s > %(outfile)s.log ")
    P.run()

    # copy over results
    target_dir = os.path.dirname(target_path)
    try:
        os.makedirs(target_dir)
    except OSError:
        # only "directory already exists" is harmless; re-raise the rest
        if not os.path.isdir(target_dir):
            raise

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def annotateGenesetOverlap(infile, outfile):
    '''Classify intervals according to their base pair overlap with
    respect to different genomic features (genes, TSS,
    upstream/downstream flanks).

    One table ``<outfile>.<feature_name>`` is computed per entry of
    ``PARAMS["geneset_feature_list"]``; the tables are then pasted side
    by side into *outfile*.  Only the first table keeps the leading
    columns (``cut -f1,4,5,6,8``), later ones contribute columns 4-6.
    '''
    to_cluster = True
    feature_list = P.asList(PARAMS["geneset_feature_list"])

    # P.run() presumably fills the %(name)s placeholders from this
    # frame's locals and PARAMS (confirm in the Pipeline module), so
    # ``feature``, ``feature_name``, ``cut_command`` and ``outfiles``
    # must keep these names.
    per_feature_statement = (
        " cat %(infile)s"
        " | python %(scriptsdir)s/bed2gff.py --as-gtf"
        " | python %(scriptsdir)s/gtf2table.py"
        " --counter=overlap --counter=length"
        " --log=%(outfile)s.log"
        " --filename-gff=%(geneset_dir)s/%(feature)s"
        " --genome-file=%(genome_dir)s/%(genome)s"
        " | %(cut_command)s"
        " | sed s/nover/%(feature_name)s_nover/g"
        " | sed s/pover/%(feature_name)s_pover/g"
        " | sed s/min/length/"
        " > %(outfile)s.%(feature_name)s")

    partial_tables = []
    for position, feature in enumerate(feature_list):
        stem = P.snip(os.path.basename(feature), ".gtf")
        feature_name = stem.replace(".", "_")
        partial_tables.append(" %s.%s " % (outfile, feature_name))

        cut_command = "cut -f4,5,6 " if position else "cut -f1,4,5,6,8 "

        statement = per_feature_statement
        P.run()

    # Paste output together
    outfiles = "".join(partial_tables)
    statement = "paste %(outfiles)s > %(outfile)s"
    P.run()
def annotateGenesetOverlap(infile, outfile):
    '''Classify intervals according to their base pair overlap with
    respect to different genomic features (genes, TSS,
    upstream/downstream flanks).

    One table ``<outfile>.<feature_name>`` is computed per entry of
    ``PARAMS["geneset_feature_list"]``; the tables are then pasted side
    by side into *outfile*.  Only the first table keeps the leading
    columns (``cut -f1,4,5,6,8``), later ones contribute columns 4-6.
    '''
    to_cluster = True
    feature_list = P.asList(PARAMS["geneset_feature_list"])

    # P.run() presumably fills the %(name)s placeholders from this
    # frame's locals and PARAMS (confirm in the Pipeline module), so
    # ``feature``, ``feature_name``, ``cut_command`` and ``outfiles``
    # must keep these names.
    per_feature_statement = (
        " cat %(infile)s"
        " | python %(scriptsdir)s/bed2gff.py --as-gtf"
        " | python %(scriptsdir)s/gtf2table.py"
        " --counter=overlap --counter=length"
        " --log=%(outfile)s.log"
        " --filename-gff=%(geneset_dir)s/%(feature)s"
        " --genome-file=%(genome_dir)s/%(genome)s"
        " | %(cut_command)s"
        " | sed s/nover/%(feature_name)s_nover/g"
        " | sed s/pover/%(feature_name)s_pover/g"
        " | sed s/min/length/"
        " > %(outfile)s.%(feature_name)s")

    partial_tables = []
    for position, feature in enumerate(feature_list):
        stem = P.snip(os.path.basename(feature), ".gtf")
        feature_name = stem.replace(".", "_")
        partial_tables.append(" %s.%s " % (outfile, feature_name))

        cut_command = "cut -f4,5,6 " if position else "cut -f1,4,5,6,8 "

        statement = per_feature_statement
        P.run()

    # Paste output together
    outfiles = "".join(partial_tables)
    statement = "paste %(outfiles)s > %(outfile)s"
    P.run()
def findGenes(infile, outfile):
    '''Add the filter "GENE_OF_INTEREST" to the FILTER column of the vcf
    if a variant is within a gene of interest as defined in the ini file
    (``PARAMS["annotation_genes_of_interest"]``).

    :param infile: vcf file passed to ``--variant``.
    :param outfile: vcf file written by VariantFiltration.
    '''
    to_cluster = USECLUSTER

    genes = P.asList(PARAMS["annotation_genes_of_interest"])
    # Placed between the quotes of SNPEFF_GENE_NAME=='...' below, this
    # yields one "==" test per gene, OR-ed together.
    expression = "'||SNPEFF_GENE_NAME=='".join(genes)

    # %%(...)s survives this formatting step as %(...)s; P.run()
    # presumably resolves those placeholders later (confirm in the
    # Pipeline module).
    template = (
        'GenomeAnalysisTK -T VariantFiltration'
        ' -R %%(bwa_index_dir)s/%%(genome)s.fa'
        ' --variant %(infile)s'
        ' --filterExpression "SNPEFF_GENE_NAME==\'%(expression)s\'"'
        ' --filterName "GENE_OF_INTEREST"'
        ' -o %(outfile)s')
    statement = template % {
        "infile": infile,
        "expression": expression,
        "outfile": outfile,
    }
    P.run()
def exportConservedGeneListPerSpecies(infile, outfile):
    '''Export list of conserved genes associated with feature for each species.

    For every species in ``PARAMS["species"]`` the distinct gene ids of
    ortholog groups with ``species_count=6`` whose schema starts with
    ``cgat_<species>`` are written, one tab-separated row per line, to
    ``<species>.conserved.export`` in the current working directory.
    *outfile* is touched afterwards.

    :param infile: not referenced by this function.
    :param outfile: file that is touched once all lists are written.
    '''
    species_list = P.asList(PARAMS["species"])

    query = (
        'SELECT distinct g.gene_id'
        ' FROM ortholog_groups g, ortholog_groups_with_feature f'
        ' WHERE f.set_id=g.set_id'
        ' AND f.species_count=6'
        ' AND g.schema LIKE "cgat_%(species)s%%"')

    # Get gene list from database
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        for species in species_list:
            cc = dbhandle.cursor()
            cc.execute(query % {"species": species})

            # Write to file; ``with`` closes it even if a write fails
            with open(species + ".conserved.export", "w") as outs:
                for result in cc:
                    outs.write("\t".join(str(r) for r in result) + "\n")
            cc.close()
    finally:
        dbhandle.close()

    # P.run() takes no arguments; it presumably picks up the local
    # ``statement`` from this frame (confirm in the Pipeline module).
    statement = "touch %s" % outfile
    P.run()
def buildLineCounts(infile, outfile):
    '''Compute line counts.

    Files are uncompressed before computing the number of lines.

    The file types to include come from ``PARAMS["<track>_suffixes"]``,
    falling back to ``PARAMS["suffixes"]``, where ``<track>`` is *infile*
    without its ``.log`` suffix.

    :raises ValueError: if no suffixes are configured.
    '''
    track = P.snip(infile, ".log")

    fallback = PARAMS["suffixes"]
    file_types = P.asList(PARAMS.get('%s_suffixes' % track, fallback))
    if not file_types:
        raise ValueError('no file types defined for test')

    # alternation of all suffixes in find's regex syntax, shell-quoted
    alternatives = r"\|".join(file_types)
    regex_pattern = pipes.quote(r".*\(%s\)" % alternatives)

    # ignore log files as time stamps will be different.
    # P.run() presumably fills the %(name)s placeholders from this
    # frame's locals and PARAMS (confirm in the Pipeline module).
    statement = (
        'find %(track)s.dir -type f'
        ' -not -regex ".*.log"'
        ' -regex %(regex_pattern)s'
        ' -exec %(scriptsdir)s/cgat_file_apply.sh {} wc -l \\;'
        ' | sort -k1,1 > %(outfile)s')
    P.run()
def runTomTom(infile, outfile):
    '''Compare ab-initio motifs against tomtom.

    Runs ``tomtom`` on *infile* against the databases listed in
    ``PARAMS["tomtom_databases"]``, moves the result directory to
    ``<exportdir>/tomtom/<outfile>`` and copies its ``tomtom.txt`` to
    *outfile*.  If *infile* is empty, *outfile* is only touched.

    :param infile: motif file passed to tomtom.
    :param outfile: destination of the ``tomtom.txt`` table.
    '''
    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    # Request the temporary directory only once it is actually needed, so
    # the early return above does not leave one behind.
    tmpdir = P.getTempDir(".")

    # P.run() presumably fills the %(name)s placeholders from this
    # frame's locals and PARAMS (confirm in the Pipeline module).
    statement = (
        " tomtom %(tomtom_options)s -oc %(tmpdir)s"
        " %(infile)s %(databases)s > %(outfile)s.log ")
    P.run()

    # copy over results
    target_dir = os.path.dirname(target_path)
    try:
        os.makedirs(target_dir)
    except OSError:
        # only "directory already exists" is harmless; re-raise the rest
        if not os.path.isdir(target_dir):
            raise

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def buildIndirectMaps(infile, outfile, track):
    '''Build a map between query and target, linking via
    intermediate targets.

    The chain of intermediate maps is read from ``PARAMS["<track>_path"]``;
    each entry ``part`` must exist as ``<part>.over.psl.gz``.  The first
    file is streamed through ``gunzip``, every further file is applied
    with ``pslMap`` and the result is gzipped into *outfile*.

    :raises ValueError: if one of the required psl files is missing.
    '''
    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])
    E.info("path=%s" % str(path))

    commands = []
    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) not exist." %
                (filename, outfile, stage))

        if stage:
            # later stages map the running result through the next file
            commands.append(
                " pslMap stdin <(gunzip < %s) stdout " % filename)
        else:
            # the first file starts the pipe
            commands.append("gunzip < %s" % filename)

    commands.append("gzip")

    # P.run() presumably reads ``statement`` from this frame (confirm in
    # the Pipeline module).
    statement = "%s > %s " % (" | ".join(commands), outfile)
    P.run()
def exportMotifDiscoverySequences(infile, outfile):
    '''Export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
       to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    If no sequence is written, a warning is logged and *outfile* is
    touched.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    # same keyword arguments the locals of this frame would supply
    p = P.substituteParameters(
        infile=infile,
        outfile=outfile,
        track=track,
        dbhandle=dbhandle)

    options = dict(
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track, outfile, dbhandle, **options)

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
def exportMotifDiscoverySequences(infile, outfile):
    '''Export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
       to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    If no sequence is written, a warning is logged and *outfile* is
    touched.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    # same keyword arguments the locals of this frame would supply
    p = P.substituteParameters(
        infile=infile,
        outfile=outfile,
        track=track,
        dbhandle=dbhandle)

    options = dict(
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track, outfile, dbhandle, **options)

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
def exportPairsScoreMatrix2(infile, outfile):
    '''Write the pairwise ortholog scores as a tab-separated matrix.

    One row is written per species in ``PARAMS["species"]``; each row
    holds the scores read from table ``pairwise_ortholog_stats`` for that
    species against every partner species, plus 1.0 for the species
    itself, ordered by partner name (descending).  The header row is
    taken from the partner names returned for the first species.

    :param infile: not referenced by this function.
    :param outfile: file to write the matrix to.
    '''
    species_list = P.asList(PARAMS["species"])

    # NOTE(review): the second branch selects ``score2`` although it
    # matches on ``species2``; ``score1`` may have been intended - confirm
    # against the table definition before changing.
    query = (
        ' SELECT species, score from'
        ' ( SELECT species2 as species, score2 as score'
        ' from pairwise_ortholog_stats where species1="%(species)s"'
        ' UNION'
        ' SELECT species1 as species, score2 as score'
        ' from pairwise_ortholog_stats where species2="%(species)s"'
        ' UNION'
        ' SELECT "%(species)s" as species, 1.0 as score)'
        ' ORDER BY species desc')

    # One connection for all species, released even when a query fails.
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        with open(outfile, "w") as outs:
            cc = dbhandle.cursor()
            for index, species in enumerate(species_list):
                cc.execute(query % {"species": species})
                rows = cc.fetchall()

                # If first write headers
                if index == 0:
                    outs.write("species")
                    outs.write("".join("\t%s" % row[0] for row in rows))
                    outs.write("\n")

                outs.write(species)
                outs.write("".join("\t%s" % row[1] for row in rows))
                outs.write("\n")
            cc.close()
    finally:
        dbhandle.close()
def findGenes(infile, outfile):
    '''Add the filter "GENE_OF_INTEREST" to the FILTER column of the vcf
    if a variant is within a gene of interest as defined in the ini file
    (``PARAMS["annotation_genes_of_interest"]``).

    :param infile: vcf file passed to ``--variant``.
    :param outfile: vcf file written by VariantFiltration.
    '''
    to_cluster = USECLUSTER

    genes = P.asList(PARAMS["annotation_genes_of_interest"])
    # Placed between the quotes of SNPEFF_GENE_NAME=='...' below, this
    # yields one "==" test per gene, OR-ed together.
    expression = "'||SNPEFF_GENE_NAME=='".join(genes)

    # %%(...)s survives this formatting step as %(...)s; P.run()
    # presumably resolves those placeholders later (confirm in the
    # Pipeline module).
    template = (
        'GenomeAnalysisTK -T VariantFiltration'
        ' -R %%(bwa_index_dir)s/%%(genome)s.fa'
        ' --variant %(infile)s'
        ' --filterExpression "SNPEFF_GENE_NAME==\'%(expression)s\'"'
        ' --filterName "GENE_OF_INTEREST"'
        ' -o %(outfile)s')
    statement = template % {
        "infile": infile,
        "expression": expression,
        "outfile": outfile,
    }
    P.run()
def mergeGeneLists(infiles, outfile):
    '''Merge gene lists into single table and load into SQLite.

    For every file in *infiles* the table ``<track>_genelist`` is joined
    with ``<species>.transcript_info`` of the attached annotation
    database (protein coding genes only); the union of all results
    replaces the table named after *outfile*, which is then indexed on
    ``gene_id`` and ``species``.  *outfile* is touched afterwards.

    :param infiles: ``*.genelist.load`` files; the first two characters
        of each file name are taken as the species.
    :param outfile: file that is touched when the table has been built.
    :raises ValueError: if *infiles* is empty.
    '''
    if not infiles:
        # fail before the existing table is dropped
        raise ValueError("no gene lists supplied for %s" % outfile)

    tablename = P.toTable(outfile)
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))
    genelist_id = PARAMS["genelist_id"]

    select_template = (
        'SELECT distinct t.gene_id, t.gene_name,'
        ' "%(species)s" AS species'
        ' FROM %(track)s_genelist g, %(species)s.transcript_info t'
        ' WHERE g.gene_id=t.%(genelist_id)s'
        " and t.gene_biotype='protein_coding' ")

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        for species, species_db in species_lookup.items():
            attach = "ATTACH DATABASE '%s' as %s" % (species_db, species)
            print(attach)
            cc.execute(attach)

        # Build union statement
        selects = []
        for f in infiles:
            track = P.snip(os.path.basename(f), ".genelist.load")
            track = track.replace("-", "_").replace(".", "_")
            selects.append(select_template % {
                "species": track[:2],
                "track": track,
                "genelist_id": genelist_id,
            })
        union = "CREATE TABLE %s AS " % tablename + " UNION ".join(selects)
        print(union)

        cc.execute("DROP TABLE IF EXISTS %s" % tablename)
        cc.execute(union)
        cc.execute(
            '''CREATE INDEX "glm_idx1" ON "%s" ("gene_id" ASC) ''' %
            tablename)
        cc.execute(
            '''CREATE INDEX "glm_idx2" ON "%s" ("species" ASC) ''' %
            tablename)
        cc.close()
        dbhandle.commit()
    finally:
        # always release the connection, even if a statement fails
        dbhandle.close()

    # P.run() takes no arguments; it presumably picks up the local
    # ``statement`` from this frame (confirm in the Pipeline module).
    statement = "touch %s" % outfile
    P.run()
def mergeGeneLists(infiles, outfile):
    '''Merge gene lists into single table and load into SQLite.

    For every file in *infiles* the table ``<track>_genelist`` is joined
    with ``<species>.transcript_info`` of the attached annotation
    database (protein coding genes only); the union of all results
    replaces the table named after *outfile*, which is then indexed on
    ``gene_id`` and ``species``.  *outfile* is touched afterwards.

    :param infiles: ``*.genelist.load`` files; the first two characters
        of each file name are taken as the species.
    :param outfile: file that is touched when the table has been built.
    :raises ValueError: if *infiles* is empty.
    '''
    if not infiles:
        # fail before the existing table is dropped
        raise ValueError("no gene lists supplied for %s" % outfile)

    tablename = P.toTable(outfile)
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))
    genelist_id = PARAMS["genelist_id"]

    select_template = (
        'SELECT distinct t.gene_id, t.gene_name,'
        ' "%(species)s" AS species'
        ' FROM %(track)s_genelist g, %(species)s.transcript_info t'
        ' WHERE g.gene_id=t.%(genelist_id)s'
        " and t.gene_biotype='protein_coding' ")

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        for species, species_db in species_lookup.items():
            attach = "ATTACH DATABASE '%s' as %s" % (species_db, species)
            print(attach)
            cc.execute(attach)

        # Build union statement
        selects = []
        for f in infiles:
            track = P.snip(os.path.basename(f), ".genelist.load")
            track = track.replace("-", "_").replace(".", "_")
            selects.append(select_template % {
                "species": track[:2],
                "track": track,
                "genelist_id": genelist_id,
            })
        union = "CREATE TABLE %s AS " % tablename + " UNION ".join(selects)
        print(union)

        cc.execute("DROP TABLE IF EXISTS %s" % tablename)
        cc.execute(union)
        cc.execute(
            '''CREATE INDEX "glm_idx1" ON "%s" ("gene_id" ASC) ''' %
            tablename)
        cc.execute(
            '''CREATE INDEX "glm_idx2" ON "%s" ("species" ASC) ''' %
            tablename)
        cc.close()
        dbhandle.commit()
    finally:
        # always release the connection, even if a statement fails
        dbhandle.close()

    # P.run() takes no arguments; it presumably picks up the local
    # ``statement`` from this frame (confirm in the Pipeline module).
    statement = "touch %s" % outfile
    P.run()
def buildAssemblyStats(infile, outfile):
    '''Write assembly stats from all of the assemblers that were used
    in the running of the contig assembly.

    For every assembler in ``PARAMS["assemblers"]`` the contig summary
    table ``<assembler>_contig_summary_tsv`` is read from the SQLite
    database *infile*; the percentage of mapped reads is then appended
    from the matching ``*_alignment_stats`` table.  The combined rows are
    written tab-separated to *outfile*.

    :param infile: path of the SQLite database to read.
    :param outfile: tab-separated table to write.
    '''
    assemblers = P.asList(PARAMS.get("assemblers"))

    result = {}
    alignment_stats_names = []

    # connect
    dbh = sqlite3.connect(infile)
    try:
        cc = dbh.cursor()
        for assembler in assemblers:
            tablename = "%s_contig_summary_tsv" % assembler
            # get the contig summaries
            for data in cc.execute(
                    """SELECT track, nscaffolds, median_length, """
                    """mean_length, max_length, scaffold_length, N50 """
                    """FROM %s""" % tablename).fetchall():
                track = "%s_" % assembler + data[0]
                result[track] = list(data[1:])
                alignment_stats_names.append(track + "_alignment_stats")

        # get the alignment statistics - % of reads
        # mapping to contigs
        for a in alignment_stats_names:
            a = P.toTable(a + ".load")
            for data in cc.execute(
                    """SELECT percent FROM %s """
                    """WHERE category == 'reads_mapped'""" % a).fetchall():
                # undo the renaming applied to the table name so that it
                # matches the key stored in ``result`` again
                track = a[:-len("_alignment_stats")].replace(
                    "_filtered_contigs", ".filtered.contigs").replace(
                    "sim_", "sim-").replace("BP_", "BP-")
                result[track].append(data[0])
        cc.close()
    finally:
        # always release the connection, even if a query fails
        dbh.close()

    with open(outfile, "w") as outf:
        outf.write(
            "assembler\ttrack\tncontigs\tmedian_length\tmean_length"
            "\tmax_length\ttotal_length\tN50\tpercent_mapped\n")
        for track, results in result.items():
            fields = track.split("_")
            assembler = fields[0]
            track = fields[1].replace("-R1.filtered.contigs", "")
            # list comprehension instead of map(): ``list + map(...)``
            # only works where map() returns a list
            outf.write(
                "\t".join([assembler, track] +
                          [str(x) for x in results]) + "\n")
def getInput(track):
    '''Return a list of input tracks associated with track.

    Associations can be defined in the .ini file in the section
    [input]. For example, the following snippet associates track
    track1 with the bamfiles :file:`input1.bam` and :file:`input2.bam`::

       [input]
       track1.bam=input1.bam,input2.bam

    Glob expressions are permitted.

    Default tracks can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

       [input]
       %=all.bam

    An explicit ``input_<name>`` entry in PARAMS takes precedence; the
    ``%`` patterns of the [input] section are only consulted when no
    such entry exists.

    :param track: object providing ``asFile()``; the lower-cased result
        is the name that is looked up.
    :returns: list of input files, empty if nothing is configured.
    '''
    input_files = []

    # configparser by default converts option names to lower case
    fn = track.asFile().lower()

    key = "input_%s" % fn
    if key in PARAMS:
        input_files.extend(P.asList(PARAMS[key]))
    else:
        for pattern, value in P.CONFIG.items("input"):
            if "%" in pattern:
                # Turn the placeholder into "one or more non-blanks".
                # Plain str.replace is used because "\S" inside an
                # re.sub() replacement template is rejected as a bad
                # escape by Python >= 3.7.
                p = pattern.replace("%", r"\S+")
                if re.search(p, fn):
                    input_files.extend(P.asList(value))

    return input_files
def runMEME(track, outfile, dbhandle):
    '''Run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on all
    intervals but only the top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak is used and not the
    complete interval.

    * Softmasked sequence is converted to hardmasked sequence to avoid
      the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    export_root = os.path.abspath(PARAMS["exportdir"])
    target_path = os.path.join(export_root, "meme", outfile)

    genome_path = os.path.join(PARAMS["genome_dir"], PARAMS["genome"])
    fasta = IndexedFasta.IndexedFasta(genome_path)

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    selection = dict(
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])
    nseq = writeSequencesForIntervals(track, tmpfasta, dbhandle, **selection)

    if nseq == 0:
        # nothing to search in - leave an empty marker and stop here
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # P.run() presumably fills the %(name)s placeholders from this
    # frame's locals and PARAMS (confirm in the Pipeline module).
    statement = (
        " meme %(tmpfasta)s -dna -revcomp"
        " -mod %(meme_model)s"
        " -nmotifs %(meme_nmotifs)s"
        " -oc %(tmpdir)s"
        " -maxsize %(meme_max_size)s"
        " %(meme_options)s"
        " > %(outfile)s.log ")
    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def copyEnsemblDb(infile, outfile):
    '''Copy tables from ensembl database to rnaseq database.

    Attaches ``PARAMS["ensembl_db"]`` as ``ensembl`` and copies every
    table listed in ``PARAMS["ensembl_tables"]`` into
    ``PARAMS["database"]`` under the same name; *outfile* is touched
    afterwards.

    :param infile: not referenced by this function.
    :param outfile: file that is touched once all tables are copied.
    '''
    table_list = P.asList(PARAMS["ensembl_tables"])

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        cc.execute("""ATTACH "%s" as ensembl;""" % PARAMS["ensembl_db"])

        for table in table_list:
            query = """CREATE TABLE %s AS SELECT * FROM ensembl.%s;""" % (
                table, table)
            print(query)
            cc.execute(query)
        cc.close()
        dbhandle.commit()
    finally:
        # always release the connection, even if a copy fails
        dbhandle.close()

    # P.run() presumably fills %(outfile)s from this frame's locals
    # (confirm in the Pipeline module).
    statement = """touch %(outfile)s;"""
    P.run()
def copyEnsemblDb(infile, outfile):
    """Copy tables from ensembl database to rnaseq database.

    Attaches ``PARAMS["ensembl_db"]`` as ``ensembl`` and copies every
    table listed in ``PARAMS["ensembl_tables"]`` into
    ``PARAMS["database"]`` under the same name; *outfile* is touched
    afterwards.

    :param infile: not referenced by this function.
    :param outfile: file that is touched once all tables are copied.
    """
    table_list = P.asList(PARAMS["ensembl_tables"])

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        cc.execute("""ATTACH "%s" as ensembl;""" % PARAMS["ensembl_db"])

        for table in table_list:
            query = """CREATE TABLE %s AS SELECT * FROM ensembl.%s;""" % (
                table, table)
            print(query)
            cc.execute(query)
        cc.close()
        dbhandle.commit()
    finally:
        # always release the connection, even if a copy fails
        dbhandle.close()

    # P.run() presumably fills %(outfile)s from this frame's locals
    # (confirm in the Pipeline module).
    statement = """touch %(outfile)s;"""
    P.run()
def runMEME(track, outfile, dbhandle):
    '''Run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on all
    intervals but only the top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak is used and not the
    complete interval.

    * Softmasked sequence is converted to hardmasked sequence to avoid
      the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    export_root = os.path.abspath(PARAMS["exportdir"])
    target_path = os.path.join(export_root, "meme", outfile)

    genome_path = os.path.join(PARAMS["genome_dir"], PARAMS["genome"])
    fasta = IndexedFasta.IndexedFasta(genome_path)

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    selection = dict(
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])
    nseq = writeSequencesForIntervals(track, tmpfasta, dbhandle, **selection)

    if nseq == 0:
        # nothing to search in - leave an empty marker and stop here
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # P.run() presumably fills the %(name)s placeholders from this
    # frame's locals and PARAMS (confirm in the Pipeline module).
    statement = (
        " meme %(tmpfasta)s -dna -revcomp"
        " -mod %(meme_model)s"
        " -nmotifs %(meme_nmotifs)s"
        " -oc %(tmpdir)s"
        " -maxsize %(meme_max_size)s"
        " %(meme_options)s"
        " > %(outfile)s.log ")
    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def buildAssemblyStats(infile, outfile):
    '''Write assembly stats from all of the assemblers that were used
    in the running of the contig assembly.

    For every assembler in ``PARAMS["assemblers"]`` the contig summary
    table ``<assembler>_contig_summary_tsv`` is read from the SQLite
    database *infile*; the percentage of mapped reads is then appended
    from the matching ``*_alignment_stats`` table.  The combined rows are
    written tab-separated to *outfile*.

    :param infile: path of the SQLite database to read.
    :param outfile: tab-separated table to write.
    '''
    assemblers = P.asList(PARAMS.get("assemblers"))

    result = {}
    alignment_stats_names = []

    # connect
    dbh = sqlite3.connect(infile)
    try:
        cc = dbh.cursor()
        for assembler in assemblers:
            tablename = "%s_contig_summary_tsv" % assembler
            # get the contig summaries
            for data in cc.execute(
                    """SELECT track, nscaffolds, median_length, """
                    """mean_length, max_length, scaffold_length, N50 """
                    """FROM %s""" % tablename).fetchall():
                track = "%s_" % assembler + data[0]
                result[track] = list(data[1:])
                alignment_stats_names.append(track + "_alignment_stats")

        # get the alignment statistics - % of reads
        # mapping to contigs
        for a in alignment_stats_names:
            a = P.toTable(a + ".load")
            for data in cc.execute(
                    """SELECT percent FROM %s """
                    """WHERE category == 'reads_mapped'""" % a).fetchall():
                # undo the renaming applied to the table name so that it
                # matches the key stored in ``result`` again
                track = a[:-len("_alignment_stats")].replace(
                    "_filtered_contigs", ".filtered.contigs").replace(
                    "sim_", "sim-").replace("BP_", "BP-")
                result[track].append(data[0])
        cc.close()
    finally:
        # always release the connection, even if a query fails
        dbh.close()

    with open(outfile, "w") as outf:
        outf.write(
            "assembler\ttrack\tncontigs\tmedian_length\tmean_length"
            "\tmax_length\ttotal_length\tN50\tpercent_mapped\n")
        for track, results in result.items():
            fields = track.split("_")
            assembler = fields[0]
            track = fields[1].replace("-R1.filtered.contigs", "")
            # list comprehension instead of map(): ``list + map(...)``
            # only works where map() returns a list
            outf.write(
                "\t".join([assembler, track] +
                          [str(x) for x in results]) + "\n")
def runTest(infile, outfile):
    '''Run a test.

    The test name is *outfile* without ``.log``.  The pipeline script
    defaults to ``pipeline_<name without the leading "test_">`` and the
    targets to ``full``; both can be overridden through
    ``PARAMS["<track>_pipeline"]`` and ``PARAMS["<track>_target"]``.
    The pipeline is started inside ``<track>.dir`` and all of its output
    is redirected to *outfile*.
    '''
    track = P.snip(outfile, ".log")

    default_pipeline = "pipeline_" + track[len("test_"):]
    pipeline_name = PARAMS.get("%s_pipeline" % track, default_pipeline)

    targets = P.asList(PARAMS.get("%s_target" % track, "full"))
    pipeline_targets = ' '.join(targets)

    # do not run on cluster, mirror that a pipeline is started from
    # the head node
    to_cluster = False

    # P.run() presumably fills the %(name)s placeholders from this
    # frame's locals and PARAMS (confirm in the Pipeline module).
    statement = (
        " (cd %(track)s.dir;"
        " python %(pipelinedir)s/%(pipeline_name)s.py"
        " %(pipeline_options)s make %(pipeline_targets)s)"
        " >& %(outfile)s ")
    P.run()
@transform(buildEdgeRStats, suffix(".tsv"), ".load")
def loadEdgeRStats(infile, outfile):
    '''Load the ``.tsv`` output of buildEdgeRStats via ``P.load``.'''
    P.load(infile, outfile)


# Empty target: it only groups the three load tasks named in the
# decorator (presumably so they can be requested together - confirm
# against the pipeline's target list).
@follows(loadCufflinks, loadCufflinksFPKM, loadGeneLevelReadCounts)
def expression():
    pass

# method name as used in PARAMS["methods"] -> task loading its statistics
mapToTargets = {
    'cuffdiff': loadCuffdiffStats,
    'deseq': loadDESeqStats,
    'edger': loadEdgeRStats,
}

# Tasks for the methods selected in the configuration; a method name
# missing from mapToTargets raises KeyError when this module is loaded.
TARGETS_DIFFEXPRESSION = [mapToTargets[x] for x in P.asList(PARAMS["methods"])]


# Empty target depending on every selected differential expression task.
@follows(*TARGETS_DIFFEXPRESSION)
def diff_expression():
    pass


@follows(diff_expression)
@merge("*_stats.tsv", "de_stats.load")
def loadDEStats(infiles, outfile):
    '''load DE stats into table.

    All ``*_stats.tsv`` files are concatenated and loaded in one go;
    ``regex_filename`` captures the part of each file name in front of
    ``_stats.tsv`` and ``missing_value=0`` is passed on to
    ``P.concatenateAndLoad``.
    '''
    P.concatenateAndLoad(infiles, outfile, missing_value=0, regex_filename="(.*)_stats.tsv")
def MergedGeneListStats(infile, outfile):
    '''Write per-species gene counts for the merged gene list.

    ``PARAMS["species"]`` and ``PARAMS["annotations_db"]`` are paired up
    position by position. For every species its annotation database is
    attached to ``PARAMS["database"]`` under the species name and four
    counts are collected:

    * protein-coding genes present in ``genelists_merged``
    * all protein-coding genes in ``<species>.transcript_info``
    * distinct ``set_id`` values in ``ortholog_groups``
    * protein-coding genes in ``genelists_merged`` that also appear in
      ``ortholog_groups``

    One row per species is written to *outfile* (tab-separated, with a
    header), including three percentages formatted to two decimals.

    infile
        Not read; only *outfile* is used.
    outfile
        Path of the table to write.

    Raises ``ZeroDivisionError`` if a species has no protein-coding
    genes or if ``ortholog_groups`` is empty.
    '''
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))

    header = "species\tgenes_with_feature\ttotal_genes\ttotal_conserved_genes\tconserved_genes_with_feature\tproportion_with_feature\tproportion_conserved\tproportion_conserved_with_feature"

    def fetch_count(handle, statement):
        # run a single-value count query; the cursor is always closed
        cc = handle.cursor()
        try:
            cc.execute(statement)
            return cc.fetchall()[0][0]
        finally:
            cc.close()

    with open(outfile, "w") as outs:
        outs.write("%s\n" % (header))

        # Connect to database and attach annotation databases
        dbhandle = sqlite3.connect(PARAMS["database"])
        try:
            for species, species_db in species_lookup.items():
                names = {"species": species, "species_db": species_db}

                statement = '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % names
                # kept from the original: echo the ATTACH statement
                print(statement)
                cc = dbhandle.cursor()
                try:
                    cc.execute(statement)
                finally:
                    cc.close()

                genes_with_feature = fetch_count(
                    dbhandle,
                    '''SELECT count(distinct t.gene_id) as genes
                       FROM genelists_merged g, %(species)s.transcript_info t
                       WHERE g.gene_id=t.gene_id
                       AND t.gene_biotype='protein_coding' ''' % names)

                total_genes = fetch_count(
                    dbhandle,
                    '''SELECT count(distinct gene_id) as genes
                       FROM %(species)s.transcript_info
                       where gene_biotype='protein_coding' ''' % names)

                total_conserved_genes = fetch_count(
                    dbhandle,
                    '''SELECT count(distinct set_id) as genes
                       FROM ortholog_groups''')

                conserved_genes_with_feature = fetch_count(
                    dbhandle,
                    '''SELECT count(distinct t.gene_id) as genes
                       FROM genelists_merged g, %(species)s.transcript_info t,
                       ortholog_groups o
                       WHERE g.gene_id=t.gene_id
                       and t.gene_biotype='protein_coding'
                       AND o.gene_id=t.gene_id''' % names)

                # percentages
                proportion_with_feature = (
                    float(genes_with_feature) / float(total_genes)) * 100
                proportion_conserved = (
                    float(total_conserved_genes) / float(total_genes)) * 100
                proportion_conserved_with_feature = (
                    float(conserved_genes_with_feature) /
                    float(total_conserved_genes)) * 100

                outs.write(
                    "%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n" %
                    (species, genes_with_feature, total_genes,
                     total_conserved_genes, conserved_genes_with_feature,
                     proportion_with_feature, proportion_conserved,
                     proportion_conserved_with_feature))
        finally:
            # release the database even if a query fails
            dbhandle.close()
| %(cmd-farm)s --split-at-regex="^chain" --chunksize=1000 --max-lines=1000000 --log=%(outfile)s.log " python %(scriptsdir)s/chain2psl.py --log=%(outfile)s.log | pslSwap stdin stdout " | gzip > %(outfile)s ''' P.run() ########################################################################## ########################################################################## ########################################################################## # extracting alignments from maf files ########################################################################## if "maf_dir" in PARAMS and "maf_tracks" in PARAMS: @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]), "%sTo%s.raw.psl.gz" % (PARAMS["%s_label" % track], PARAMS["maf_master"]), track) for track in P.asList(PARAMS["maf_tracks"])]) def extractPairwiseAlignmentSingleFile(infiles, outfile, track): '''build pairwise genomic aligment from maf files.''' try: os.remove(outfile) except OSError: pass genomefile = PARAMS["%s_genome" % track] to_cluster = True for infile in infiles: E.info("adding %s" % infile)
statement = ''' (cd %(test_name)s.dir; python %(pipelines_dir)s/%(pipeline_name)s.py %(pipeline_options)s make full) >& %(outfile)s ''' P.run() ################################################################### ################################################################### ################################################################### # general tests ################################################################### @files([(os.path.join(PARAMS["data_dir"], x + ".dir"), x + ".log") for x in P.asList(PARAMS["prerequisites"])]) def runPreparationTests(infile, outfile): '''run pre-requisite pipelines.''' runTest(infile, outfile) ################################################################### ################################################################### ################################################################### # run a test ################################################################### @follows(runPreparationTests) @files([(x, os.path.basename(P.snip(x, '.dir')) + ".log") for x in glob.glob( os.path.join(PARAMS["data_dir"], "pipeline_*.dir"))
P.run() statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; > %(outfile)s''' P.run() ################################################################### ################################################################### ################################################################### # do not run in parallel. run_weka.pl creates a $testfile # that is not unique. run_weka.pl and pph2arff.pl could either # be patched or the following jobs run in sequence. @jobs_limit(1) @files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x) for x in P.asList(PARAMS["polyphen_models"])]) def runPolyphen(infile, outfile, model): '''run POLYPHEN on feature tables to classify SNPs. ''' to_cluster = True # options # -f: feature set, default is F11 # -c: classifier, default is NBd (Naive Bayes with discretization) # -l: model name, default is HumDiv statement = ''' %(polyphen_home)s/bin/run_weka.pl -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model %(infile)s
"pipeline.ini" ] ) PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters( PARAMS["annotations_dir"], "pipeline_annotations.py" ) ################################################################### ################################################################### ################################################################### ## Helper functions mapping tracks to conditions, etc ################################################################### # load all tracks - exclude input/control tracks Sample = PipelineTracks.Sample3 METHODS = P.asList( PARAMS["methods" ] ) ################################################################### ################################################################### ################################################################### # if conf.py exists: execute to change the above assignmentsn if os.path.exists("pipeline_conf.py"): L.info( "reading additional configuration from pipeline_conf.py" ) execfile("pipeline_conf.py") ################################################################### ################################################################### ################################################################### def connect(): '''connect to database.
| gzip > %(outfile)s.log.gz''' P.run() statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; | gzip > %(outfile)s''' P.run() ################################################################### ################################################################### ################################################################### @files( [ (x, "%s_%s.output.gz" % (x[:-len(".features.gz")],y), y ) \ for x,y in itertools.product( glob.glob( "*.features.gz"), P.asList( PARAMS["polyphen_models"] ) ) ] ) def runPolyphen(infile, outfile, model): '''run POLYPHEN on feature tables to classify SNPs. ''' to_cluster = False # need to run in chunks for large feature files statement = """gunzip < %(infile)s | %(cmd-farm)s --split-at-lines=10000 --output-header "perl %(polyphen_home)s/bin/run_weka_cpp.pl -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model -p
def MergedGeneListStats(infile, outfile):
    '''Write per-species gene counts for the merged gene list.

    ``PARAMS["species"]`` and ``PARAMS["annotations_db"]`` are paired up
    position by position. For every species its annotation database is
    attached to ``PARAMS["database"]`` under the species name and four
    counts are collected:

    * protein-coding genes present in ``genelists_merged``
    * all protein-coding genes in ``<species>.transcript_info``
    * distinct ``set_id`` values in ``ortholog_groups``
    * protein-coding genes in ``genelists_merged`` that also appear in
      ``ortholog_groups``

    One row per species is written to *outfile* (tab-separated, with a
    header), including three percentages formatted to two decimals.

    infile
        Not read; only *outfile* is used.
    outfile
        Path of the table to write.

    Raises ``ZeroDivisionError`` if a species has no protein-coding
    genes or if ``ortholog_groups`` is empty.
    '''
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))

    header = "species\tgenes_with_feature\ttotal_genes\ttotal_conserved_genes\tconserved_genes_with_feature\tproportion_with_feature\tproportion_conserved\tproportion_conserved_with_feature"

    def fetch_count(handle, statement):
        # run a single-value count query; the cursor is always closed
        cc = handle.cursor()
        try:
            cc.execute(statement)
            return cc.fetchall()[0][0]
        finally:
            cc.close()

    with open(outfile, "w") as outs:
        outs.write("%s\n" % (header))

        # Connect to database and attach annotation databases
        dbhandle = sqlite3.connect(PARAMS["database"])
        try:
            for species, species_db in species_lookup.items():
                names = {"species": species, "species_db": species_db}

                statement = '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % names
                # kept from the original: echo the ATTACH statement
                print(statement)
                cc = dbhandle.cursor()
                try:
                    cc.execute(statement)
                finally:
                    cc.close()

                genes_with_feature = fetch_count(
                    dbhandle,
                    '''SELECT count(distinct t.gene_id) as genes
                       FROM genelists_merged g, %(species)s.transcript_info t
                       WHERE g.gene_id=t.gene_id
                       AND t.gene_biotype='protein_coding' ''' % names)

                total_genes = fetch_count(
                    dbhandle,
                    '''SELECT count(distinct gene_id) as genes
                       FROM %(species)s.transcript_info
                       where gene_biotype='protein_coding' ''' % names)

                total_conserved_genes = fetch_count(
                    dbhandle,
                    '''SELECT count(distinct set_id) as genes
                       FROM ortholog_groups''')

                conserved_genes_with_feature = fetch_count(
                    dbhandle,
                    '''SELECT count(distinct t.gene_id) as genes
                       FROM genelists_merged g, %(species)s.transcript_info t,
                       ortholog_groups o
                       WHERE g.gene_id=t.gene_id
                       and t.gene_biotype='protein_coding'
                       AND o.gene_id=t.gene_id''' % names)

                # percentages
                proportion_with_feature = (
                    float(genes_with_feature) / float(total_genes)) * 100
                proportion_conserved = (
                    float(total_conserved_genes) / float(total_genes)) * 100
                proportion_conserved_with_feature = (
                    float(conserved_genes_with_feature) /
                    float(total_conserved_genes)) * 100

                outs.write(
                    "%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n" %
                    (species, genes_with_feature, total_genes,
                     total_conserved_genes, conserved_genes_with_feature,
                     proportion_with_feature, proportion_conserved,
                     proportion_conserved_with_feature))
        finally:
            # release the database even if a query fails
            dbhandle.close()
--log=%(outfile)s.log --fdr=%(edger_fdr)f" | grep -v "warnings" | gzip > %(outfile)s ''' P.run() @follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation"))) @files([((data, design), "diff_methylation/%s_%s.deseq.gz" % (P.snip(os.path.basename(data), ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv"))) for data, design in itertools.product( glob.glob("diff_methylation/*.counts.tsv.gz"), P.asList(PARAMS["deseq_designs"]))]) def runDESeq(infiles, outfile): '''estimate differential expression using DESeq. The final output is a table. It is slightly edited such that it contains a similar output and similar fdr compared to cuffdiff. ''' runDE(infiles, outfile, "deseq") ######################################################################### ######################################################################### ######################################################################### @follows(aggregateTiledReadCounts, mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
> %(outfile)s.log.gz''' P.run() statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; | gzip > %(outfile)s''' P.run() ################################################################### ################################################################### ################################################################### @files([(x, "%s_%s.output.gz" % (x[:-len(".features.gz")], y), y) for x, y in itertools.product( glob.glob("*.features.gz"), P.asList(PARAMS["polyphen_models"]))]) def runPolyphen(infile, outfile, model): '''run POLYPHEN on feature tables to classify SNPs. ''' to_cluster = False # need to run in chunks for large feature files statement = """gunzip < %(infile)s | %(cmd-farm)s --split-at-lines=10000 --output-header "perl %(polyphen_home)s/bin/run_weka_cpp.pl -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model -p
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .ini file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    Default tracks and offsets can be specified using a placeholder
    ``%``. The following will associate all tracks with the same bam
    file::

       [bams]
       %=all.bam

    track
        Object providing ``asFile()``, which returns the file name
        stem of the track.

    Returns a tuple ``(bamfiles, offsets)`` of two lists of equal
    length.

    Raises ``ValueError`` if the number of BAM files differs from the
    number of offsets.
    '''
    fn = track.asFile()
    key = fn.lower()

    def wildcard_values(section):
        # yield the values of all options in `section` whose name
        # contains the placeholder "%" and matches the track name
        for pattern, value in P.CONFIG.items(section):
            if "%" in pattern:
                # plain str.replace: passing "\S+" as an re.sub
                # replacement template is rejected as a bad escape on
                # Python 3.7 and later
                regex = pattern.replace("%", r"\S+")
                if re.search(regex, fn, re.IGNORECASE):
                    yield value

    bamfiles = glob.glob("%s.bam" % fn)

    # consult the configuration only if no <track>.bam file exists
    if not bamfiles:
        if "bams_%s" % key in PARAMS:
            for ff in P.asList(PARAMS["bams_%s" % key]):
                bamfiles.extend(glob.glob(ff))
        else:
            for value in wildcard_values("bams"):
                bamfiles.extend(glob.glob(value))

    # build real lists (not map objects) so that the emptiness and
    # length checks below work on every Python version
    if "offsets_%s" % key in PARAMS:
        offsets = [int(x) for x in P.asList(PARAMS["offsets_%s" % key])]
    else:
        offsets = []
        for value in wildcard_values("offsets"):
            offsets.extend(int(x) for x in value.split(","))

    if not offsets:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError(
            "number of BAM files %s is not the same as number of offsets: %s" %
            (str(bamfiles), str(offsets)))

    return bamfiles, offsets
# collect tracks from both fastq naming schemes in the working
# directory; raw strings keep the regex backslash literal
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.gz"), r"(\S+).fastq.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.1.gz"), r"(\S+).fastq.1.gz"))

ALL = PipelineTracks.Sample3()
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

###################################################################
# Global flags
###################################################################
# AH: added default values for assemblers and coverage_mapper
# to allow import of pipeline script
ASSEMBLERS = P.asList(PARAMS.get("assemblers", ""))
MAPPER = PARAMS.get("coverage_mapper", 'bwa')
BOWTIE = MAPPER == "bowtie"
BOWTIE2 = MAPPER == "bowtie2"
BWA = MAPPER == "bwa"


def connect():
    '''connect to database.

    Opens the sqlite database named by ``PARAMS["database"]`` and
    returns the connection. No further databases are attached here.
    '''
    dbh = sqlite3.connect(PARAMS["database"])
    return dbh

###################################################################
> %(outfile)s ''' P.run() ################################################################################## ################################################################################## ################################################################################## ## extracting alignments from maf files ################################################################################## if "maf_dir" in PARAMS and "maf_tracks" in PARAMS: @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]), "%sTo%s.raw.psl.gz" % (PARAMS["%s_label" % track], PARAMS["maf_master"]), track) for track in P.asList(PARAMS["maf_tracks"])]) def extractPairwiseAlignmentSingleFile(infiles, outfile, track): '''build pairwise genomic aligment from maf files.''' try: os.remove(outfile) except OSError: pass genomefile = PARAMS["%s_genome" % track] to_cluster = True for infile in infiles: E.info("adding %s" % infile)
defaults={ 'annotations_dir': "", 'paired_end': False}) PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"], "pipeline_annotations.py", on_error_raise=__name__ == "__main__") ################################################################### ################################################################### ################################################################### # get options that are to be tested cufflinks_options = {} if "cufflinks_test_options" in PARAMS: options = P.asList(PARAMS["cufflinks_test_options"]) for option in options: if option == "--pre-mrna-fraction" \ or option == "--small-anchor-fraction" \ or option == "--max-multiread-fraction": cufflinks_options[option] = [0, 0.5, 0.75, 1] elif option == "--min-isoform-fraction": cufflinks_options[option] = [0.05, 0.1, 0.5, 1] elif option == "--junc-alpha": cufflinks_options[option] = [0.001, 0.01, 0.1] elif option == "--min-frags-per-transfrag": cufflinks_options[option] = [1, 5, 10] elif option == "--overhang-tolerance": cufflinks_options[option] = [0, 2, 5, 8] elif option == "--overlap-radius": cufflinks_options[option] = [50, 100, 200]
statement = ''' gunzip < %(repeatsfile)s | python %(scriptsdir)s/gff2bed.py -v 0 | coverageBed -a stdin -b %(tmpfilename)s | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}' |python %(scriptsdir)s/csv2db.py %(csv2db_options)s --table=%(table)s > %(outfile)s ''' P.run() os.unlink( tmpfilename ) ################################################################### ################################################################### @files( [ (PARAMS["%s_merge" % x], "%s.gtf.gz" % x) for x in P.asList(PARAMS["merge"])] +\ [ (EXPERIMENTAL_TRACKS, PARAMS["merged"] ) ] ) def buildMergedTracks( infiles, outfile ): '''merge tracks.''' infiles = " ".join(infiles) statement = ''' zcat %(infiles)s | python %(scriptsdir)s/gff2psl.py --log=%(outfile)s.log --is-gtf --allow-duplicates | python %(scriptsdir)s/psl2psl.py --log=%(outfile)s.log --method=rename-query --unique
P.getParameters( ["%s/pipeline.ini" % os.path.splitext(__file__)[0], "pipeline.ini"], defaults={"annotations_annotations_dir": "", "genesets_abinitio_coding": "pruned.gtf.gz", "genesets_abinitio_lncrna": "pruned.gtf.gz", "genesets_reference": "reference.gtf.gz", "genesets_refcoding": "refcoding.gtf.gz", "genesets_previous": ""}) PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters( PARAMS["annotations_annotations_dir"], "pipeline_annotations.py", on_error_raise=__name__ == "__main__") PREVIOUS = P.asList(PARAMS["genesets_previous"]) ######################################################################### ######################################################################### ######################################################################### def connect(): '''connect to database. This method also attaches to helper databases. ''' dbh = sqlite3.connect(PARAMS["database"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_annotations_database"])
def GeneListStats(infile, outfile):
    '''Write gene counts for the gene list of a single track.

    The track name is the basename of *infile* without
    ``.genelist.load`` and with ``-`` and ``.`` replaced by ``_``; its
    first two characters select the species. The annotation database
    configured for that species (``PARAMS["species"]`` paired with
    ``PARAMS["annotations_db"]``) is attached to ``PARAMS["database"]``
    and four counts are collected:

    * protein-coding genes present in ``<track>_genelist``
    * all protein-coding genes in ``<species>.transcript_info``
    * distinct ``set_id`` values in ``ortholog_groups``
    * protein-coding genes in ``<track>_genelist`` that also appear in
      ``ortholog_groups``

    A header and a single row are written to *outfile*
    (tab-separated), including three percentages formatted to two
    decimals.

    Raises ``KeyError`` if the species prefix is not configured and
    ``ZeroDivisionError`` if there are no protein-coding genes or
    ``ortholog_groups`` is empty.
    '''
    track = P.snip(os.path.basename(infile), ".genelist.load").replace(
        "-", "_").replace(".", "_")
    species = track[:2]

    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))
    species_db = species_lookup[species]

    names = {"track": track, "species": species, "species_db": species_db}

    def fetch_count(handle, statement):
        # run a single-value count query; the cursor is always closed
        cc = handle.cursor()
        try:
            cc.execute(statement)
            return cc.fetchall()[0][0]
        finally:
            cc.close()

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        try:
            cc.execute(
                '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % names)
        finally:
            cc.close()

        # NOTE(review): the gene list column gene_id is joined against
        # transcript_id here - confirm this is intended
        genes_with_feature = fetch_count(
            dbhandle,
            '''SELECT count(distinct t.gene_id) as genes
               FROM %(track)s_genelist g, %(species)s.transcript_info t
               WHERE g.gene_id=t.transcript_id
               and t.gene_biotype='protein_coding' ''' % names)

        total_genes = fetch_count(
            dbhandle,
            '''SELECT count(distinct gene_id) as genes
               FROM %(species)s.transcript_info
               where gene_biotype='protein_coding' ''' % names)

        total_conserved_genes = fetch_count(
            dbhandle,
            '''SELECT count(distinct set_id) as genes
               FROM ortholog_groups''')

        conserved_genes_with_feature = fetch_count(
            dbhandle,
            '''SELECT count(distinct t.gene_id) as genes
               FROM %(track)s_genelist g, %(species)s.transcript_info t,
               ortholog_groups o
               WHERE g.gene_id=t.transcript_id
               and t.gene_biotype='protein_coding'
               AND o.gene_id=t.gene_id''' % names)
    finally:
        # release the database even if a query fails
        dbhandle.close()

    # percentages
    proportion_with_feature = (
        float(genes_with_feature) / float(total_genes)) * 100
    proportion_conserved = (
        float(total_conserved_genes) / float(total_genes)) * 100
    proportion_conserved_with_feature = (
        float(conserved_genes_with_feature) /
        float(total_conserved_genes)) * 100

    # Write to file
    header = "genes_with_feature\ttotal_genes\ttotal_conserved_genes\tconserved_genes_with_feature\tproportion_with_feature\tproportion_conserved\tproportion_conserved_with_feature"
    with open(outfile, "w") as outs:
        outs.write("%s\n" % (header))
        outs.write("%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n" %
                   (genes_with_feature, total_genes, total_conserved_genes,
                    conserved_genes_with_feature, proportion_with_feature,
                    proportion_conserved,
                    proportion_conserved_with_feature))
suffix(".tsv"), ".load" ) def loadEdgeRStats( infile, outfile ): P.load( infile, outfile ) ################################################################### ################################################################### ################################################################### @follows( loadCufflinks, loadGeneLevelReadCounts ) def expression(): pass mapToTargets = { 'cuffdiff': loadCuffdiffStats, 'deseq': loadDESeqStats, 'edger': loadEdgeRStats, } TARGETS_DIFFEXPRESSION = [ mapToTargets[x] for x in P.asList( PARAMS["methods"] ) ] @follows( *TARGETS_DIFFEXPRESSION ) def diff_expression(): pass ################################################################### ################################################################### ################################################################### @jobs_limit(1,"R") @follows( mkdir("tagplots.dir"), aggregateFeatureCounts ) @files( [ (x, os.path.join( "tagplots.dir", y)) for x, y in TARGETS_DE ] ) def plotRNASEQTagData( infiles, outfile ): '''perform differential expression analysis using deseq.''' design_file = infiles[0] geneset_file = infiles[1]
# collect fastq.gz tracks TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory( glob.glob( "*.fastq.gz" ), "(\S+).fastq.gz" ) +\ PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory( glob.glob( "*.fastq.1.gz" ), "(\S+).fastq.1.gz" ) ALL = PipelineTracks.Sample3() EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue")) CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", )) TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", )) ################################################################### ## Global flags ################################################################### ASSEMBLERS = P.asList(PARAMS["general_assemblers"]) METAGENOME = "meta-velvet" in ASSEMBLERS or "ibda" in ASSEMBLERS or "cortex_var" in ASSEMBLERS ASSEMBLERS = P.asList(PARAMS["assemblers"]) MAPPER = PARAMS["coverage_mapper"] BOWTIE = MAPPER == "bowtie" BWA = MAPPER == "bwa" ################################################################### ################################################################### ################################################################### def connect(): '''connect to database. This method also attaches to helper databases.
P.run() statement = ''' (cd %(test_name)s.dir; python %(scriptsdir)s/%(pipeline_name)s.py %(pipeline_options)s make full) >& %(outfile)s ''' P.run() ################################################################### ################################################################### ################################################################### ## general tests ################################################################### @files( [ (os.path.join( PARAMS["data_dir"], x + ".dir"), x + ".log" ) for x in P.asList(PARAMS["prerequisites"]) ] ) def prepareTests( infile, outfile ): '''run pre-requisite pipelines.''' runTest( infile, outfile ) ################################################################### ################################################################### ################################################################### ## run a test ################################################################### @follows( prepareTests ) @files( [ (x, os.path.basename(x) + ".log" ) for x in \ glob.glob( os.path.join( PARAMS["data_dir"], "pipeline_*")) ] ) def runTests( infile, outfile ): '''run a pipeline with test data.''' runTest( infile, outfile )
def loadEdgeRStats(infile, outfile):
    '''Pass *infile* and *outfile* on to ``P.load``.

    Presumably this loads the table into the pipeline database -
    confirm against ``P.load``.
    '''
    P.load(infile, outfile)


@follows(loadCufflinks, loadCufflinksFPKM, loadGeneLevelReadCounts)
def expression():
    '''Aggregate target: depends on the expression loading tasks.'''
    pass

# maps a method name as given in PARAMS["methods"] to the task that
# loads that method's statistics
mapToTargets = {'cuffdiff': loadCuffdiffStats, 'deseq': loadDESeqStats, 'edger': loadEdgeRStats, }

# tasks for the configured methods only; a method name that is not a key
# of mapToTargets raises KeyError here, at import time
TARGETS_DIFFEXPRESSION = [mapToTargets[x] for x in P.asList(PARAMS["methods"])]


@follows(*TARGETS_DIFFEXPRESSION)
def diff_expression():
    '''Aggregate target: depends on every configured DE loading task.'''
    pass


@follows(diff_expression)
@merge("*_stats.tsv", "de_stats.load")
def loadDEStats(infiles, outfile):
    '''load DE stats into table.

    All files matching ``*_stats.tsv`` are handed to
    ``P.concatenateAndLoad`` together, with ``missing_value=0`` and the
    filename prefix captured by ``regex_filename``.
    '''
    P.concatenateAndLoad(infiles, outfile, missing_value=0, regex_filename="(.*)_stats.tsv")
def GeneListStats(infile, outfile):
    '''Write gene counts for the gene list of a single track.

    The track name is the basename of *infile* without
    ``.genelist.load`` and with ``-`` and ``.`` replaced by ``_``; its
    first two characters select the species. The annotation database
    configured for that species (``PARAMS["species"]`` paired with
    ``PARAMS["annotations_db"]``) is attached to ``PARAMS["database"]``
    and four counts are collected:

    * protein-coding genes present in ``<track>_genelist``
    * all protein-coding genes in ``<species>.transcript_info``
    * distinct ``set_id`` values in ``ortholog_groups``
    * protein-coding genes in ``<track>_genelist`` that also appear in
      ``ortholog_groups``

    A header and a single row are written to *outfile*
    (tab-separated), including three percentages formatted to two
    decimals.

    Raises ``KeyError`` if the species prefix is not configured and
    ``ZeroDivisionError`` if there are no protein-coding genes or
    ``ortholog_groups`` is empty.
    '''
    track = P.snip(os.path.basename(infile), ".genelist.load").replace(
        "-", "_").replace(".", "_")
    species = track[:2]

    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))
    species_db = species_lookup[species]

    names = {"track": track, "species": species, "species_db": species_db}

    def fetch_count(handle, statement):
        # run a single-value count query; the cursor is always closed
        cc = handle.cursor()
        try:
            cc.execute(statement)
            return cc.fetchall()[0][0]
        finally:
            cc.close()

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        try:
            cc.execute(
                '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % names)
        finally:
            cc.close()

        # NOTE(review): the gene list column gene_id is joined against
        # transcript_id here - confirm this is intended
        genes_with_feature = fetch_count(
            dbhandle,
            '''SELECT count(distinct t.gene_id) as genes
               FROM %(track)s_genelist g, %(species)s.transcript_info t
               WHERE g.gene_id=t.transcript_id
               and t.gene_biotype='protein_coding' ''' % names)

        total_genes = fetch_count(
            dbhandle,
            '''SELECT count(distinct gene_id) as genes
               FROM %(species)s.transcript_info
               where gene_biotype='protein_coding' ''' % names)

        total_conserved_genes = fetch_count(
            dbhandle,
            '''SELECT count(distinct set_id) as genes
               FROM ortholog_groups''')

        conserved_genes_with_feature = fetch_count(
            dbhandle,
            '''SELECT count(distinct t.gene_id) as genes
               FROM %(track)s_genelist g, %(species)s.transcript_info t,
               ortholog_groups o
               WHERE g.gene_id=t.transcript_id
               and t.gene_biotype='protein_coding'
               AND o.gene_id=t.gene_id''' % names)
    finally:
        # release the database even if a query fails
        dbhandle.close()

    # percentages
    proportion_with_feature = (
        float(genes_with_feature) / float(total_genes)) * 100
    proportion_conserved = (
        float(total_conserved_genes) / float(total_genes)) * 100
    proportion_conserved_with_feature = (
        float(conserved_genes_with_feature) /
        float(total_conserved_genes)) * 100

    # Write to file
    header = "genes_with_feature\ttotal_genes\ttotal_conserved_genes\tconserved_genes_with_feature\tproportion_with_feature\tproportion_conserved\tproportion_conserved_with_feature"
    with open(outfile, "w") as outs:
        outs.write("%s\n" % (header))
        outs.write("%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n" %
                   (genes_with_feature, total_genes, total_conserved_genes,
                    conserved_genes_with_feature, proportion_with_feature,
                    proportion_conserved,
                    proportion_conserved_with_feature))
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .ini file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets need
    to be defined in the [offsets] sections. If no offsets are
    defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    Default tracks and offsets can be specified using a placeholder
    ``%``. The following will associate all tracks with the same bam
    file::

       [bams]
       %=all.bam

    Explicit per-track entries take precedence over placeholder
    entries; placeholder entries are only consulted when no per-track
    entry exists.

    Arguments
    ---------
    track : object
        Track object providing an ``asFile()`` method that returns
        the file name stem of the track.

    Returns
    -------
    bamfiles : list
        BAM file names associated with the track.
    offsets : list
        One integer offset per BAM file.

    Raises
    ------
    ValueError
        If the number of offsets differs from the number of BAM files.
    '''

    fn = track.asFile()
    key = fn.lower()

    def wildcardValues(section):
        '''yield values of ``%`` entries in *section* matching the track.'''
        for pattern, value in P.CONFIG.items(section):
            if "%" in pattern:
                # str.replace with a raw string instead of
                # re.sub("%", "\S+", ...): an unknown escape in a
                # re.sub replacement template is an error in
                # Python 3.7+.
                p = pattern.replace("%", r"\S+")
                if re.search(p, fn, re.IGNORECASE):
                    yield value

    bamfiles = glob.glob("%s.bam" % fn)

    if not bamfiles:
        if "bams_%s" % key in PARAMS:
            for ff in P.asList(PARAMS["bams_%s" % key]):
                bamfiles.extend(glob.glob(ff))
        else:
            for value in wildcardValues("bams"):
                bamfiles.extend(glob.glob(value))

    # Build real lists (not map objects) so that the emptiness test and
    # len() below also work on Python 3.
    offsets = []
    if "offsets_%s" % key in PARAMS:
        offsets = [int(x) for x in P.asList(PARAMS["offsets_%s" % key])]
    else:
        for value in wildcardValues("offsets"):
            offsets.extend(int(x) for x in value.split(","))

    if not offsets:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError(
            "number of BAM files %s is not the same as number of offsets: %s"
            % (str(bamfiles), str(offsets)))

    return bamfiles, offsets
# do not run on cluster, mirror # that a pipeline is started from # the head node to_cluster = False statement = ''' (cd %(track)s.dir; python %(pipelinedir)s/%(pipeline_name)s.py %(pipeline_options)s make %(pipeline_targets)s) >& %(outfile)s ''' P.run() @follows(setupTests) @files([("%s.tgz" % x, "%s.log" % x) for x in P.asList(PARAMS.get("prerequisites", ""))]) def runPreparationTests(infile, outfile): '''run pre-requisite pipelines.''' runTest(infile, outfile) @follows(runPreparationTests) @files([("%s.tgz" % x, "%s.log" % x) for x in P.CONFIG.sections() if x.startswith("test") and x not in P.asList(PARAMS.get("prerequisites", ""))]) def runTests(infile, outfile): '''run a pipeline with test data.''' runTest(infile, outfile) @transform((runPreparationTests, runTests), suffix(".log"), ".md5")