def filterByCoverage(infiles, outfile):
    '''Filter contigs from a fasta file, keeping only those whose average
    coverage in the corresponding coverage table exceeds the threshold.'''
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"],
                     PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(
                os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id, ave FROM
                           (SELECT contig_id, AVG(coverage) as ave
                            FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()


def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)

    outf = open('dupstats.txt', 'w')

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()


def dedup_bams(infile, outfile):
    '''Use MarkDuplicates to mark duplicate reads.'''
    tempfile = P.snip(outfile, ".bam") + ".temp.bam"
    metrics = P.snip(outfile, ".bam") + ".metrics.tsv"
    temporary = PARAMS["tmpdir"]
    statement = '''MarkDuplicates
                       I=%(infile)s
                       O=%(tempfile)s
                       M=%(metrics)s
                       TMP_DIR=%(temporary)s
                   > %(outfile)s.log;
                   checkpoint;
                   samtools view -F 1024 -b %(tempfile)s > %(outfile)s;
                   checkpoint;
                   rm -r %(tempfile)s;
                   checkpoint;
                   samtools index %(outfile)s'''
    job_memory = "15G"
    P.run()


def runMutectReverse(infiles, outfile):
    '''Use control as tumour and vice versa to estimate the false positive
    rate.'''
    infile, normal_panel = infiles
    infile_tumour = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    basename = P.snip(outfile, ".mutect.reverse.snp.vcf")
    call_stats_out = basename + "_call_stats.reverse.out"
    coverage_wig_out = basename + "_coverage.reverse.wig"
    mutect_log = basename + ".reverse.log"

    (cosmic, dbsnp, quality, max_alt_qual, max_alt,
     max_fraction, tumor_LOD) = (
        PARAMS["mutect_cosmic"], PARAMS["gatk_dbsnp"],
        PARAMS["mutect_quality"], PARAMS["mutect_max_alt_qual"],
        PARAMS["mutect_max_alt"], PARAMS["mutect_max_fraction"],
        PARAMS["mutect_LOD"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    PipelineExome.mutectSNPCaller(infile, outfile, mutect_log, genome,
                                  cosmic, dbsnp, call_stats_out,
                                  PARAMS['mutect_memory'],
                                  PARAMS['mutect_threads'],
                                  quality, max_alt_qual,
                                  max_alt, max_fraction, tumor_LOD,
                                  normal_panel, infile_tumour)


def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''Pull out the multi-exonic and the single-exonic lincRNA transcripts
    from the lincrna.gtf.gz.'''

    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        if len(entry) > 1:
            for exon in entry:
                multi.write("\t".join(map(str, [exon.contig,
                                                exon.source,
                                                exon.feature,
                                                exon.start,
                                                exon.end,
                                                ".",
                                                exon.strand,
                                                "."])) +
                            "\t" + exon.attributes + "\n")
        elif len(entry) == 1:
            for exon in entry:
                single.write("\t".join(map(str, [exon.contig,
                                                 exon.source,
                                                 exon.feature,
                                                 exon.start,
                                                 exon.end,
                                                 ".",
                                                 exon.strand,
                                                 "."])) +
                             "\t" + exon.attributes + "\n")

    # close the output files so that they are flushed before any
    # re-compression below
    multi.close()
    single.close()

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()


def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf
    file using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)

    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()


def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """cgat combine_tables
                           --missing-value=0
                           %(filenames)s
                       | cgat csv2db
                             --header-names=%(column)s,%(header)s
                             --replace-header
                             --add-index=track
                             --table=%(tname)s
                       >> %(outfile)s"""
        P.run()

    os.unlink(tmpfilename)


def reMergeBamfiles(infiles, sentinel):
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinel)


def quantifyWithSalmon(infiles, outfile):
    '''Quantify existing samples against genesets'''
    job_threads = 2
    job_memory = "16G"
    infile, gtffile = infiles
    basefile = os.path.basename(infile)
    sample_name = basefile.split(os.extsep, 1)
    sorted_bam = "sorted_bams/" + sample_name[0] + "_sorted.bam"
    gtfbase = P.snip(os.path.basename(gtffile), ".gz")
    salmonIndex = "salmon_index/" + gtfbase + ".salmon.index"

    fastq1 = P.snip(outfile, "_agg-agg-agg") + ".1.fastq"
    fastq2 = P.snip(outfile, "_agg-agg-agg") + ".2.fastq"

    salmon_options = PARAMS["salmon_quantoptions"]

    statement = '''
        samtools sort -n %(infile)s -o %(sorted_bam)s;
        samtools fastq
            -1 %(fastq1)s
            -2 %(fastq2)s
            -0 /dev/null -s /dev/null -n -F 0x900
            %(sorted_bam)s;
        salmon quant
            -i %(salmonIndex)s
            --libType IU
            -1 %(fastq1)s
            -2 %(fastq2)s
            -o %(outfile)s
            %(salmon_options)s;
        mv %(outfile)s/quant.sf %(outfile)s.sf;
        rm %(fastq1)s; rm %(fastq2)s; rm %(sorted_bam)s
    '''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        filename = "temp_bams/%s" % basefile
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile, token, filename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        statement = "; ".join(
            ["mkdir %(filename)s",
             s,
             statement,
             "rm -r %(filename)s"])

    P.run(statement)


def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    IDR.mergeBams(infiles, outfile)

    P.touch(sentinel)


def splitPooledBamfiles(infile, sentinel):
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    P.touch(sentinel)


def runFIMO(motifs, database, outfile, exportdir, options={}):
    '''run fimo to look for occurrences of motifs supplied in a sequence
    database.

    :param:`motifs` is the path to a MEME formatted motif file.
    :param:`database` is a fasta file.
    :param:`outfile` is the text output from fimo.
    :param:`exportdir` specifies the directory to put exported files
    (html, gff).
    :param:`options` is a dictionary: {'option': 'value'} will be passed
    as --option=value and will overwrite options specified in PARAMS.
    '''

    # if the motifs file is empty, fimo will return an error;
    # this isn't very useful behaviour.
    inlines = IOTools.open_file(motifs).read()

    if not re.search("MOTIF", inlines):
        E.warning("No motifs found in %s" % motifs)
        P.touch(outfile)
        return
    else:
        E.debug("%s: %i motifs found" %
                (motifs, len(re.findall("MOTIF", inlines))))

    fimo_options = PARAMS.get("fimo_options", "")
    for option, value in options.items():
        fimo_options = re.sub(r"%s=\S+" % option, "", fimo_options)
        if value is None:
            fimo_options += " --%s" % option
        else:
            fimo_options += " --%s=%s" % (option, value)

    tmpout = P.get_temp_filename()

    track = os.path.basename(outfile)
    exportdir = os.path.abspath(exportdir)
    xmlout = P.snip(outfile, ".txt") + ".xml"
    logfile = P.snip(outfile, ".txt") + ".log"
    gffout = os.path.join(exportdir, track + ".gff")
    htmlout = os.path.join(exportdir, track + ".html")

    statement = '''
        fimo --oc %(tmpout)s %(fimo_options)s %(motifs)s %(database)s
             &> %(logfile)s;
        mv %(tmpout)s/fimo.txt %(outfile)s;
        mv %(tmpout)s/fimo.xml %(xmlout)s;
        mv %(tmpout)s/fimo.gff %(gffout)s;
        mv %(tmpout)s/fimo.html %(htmlout)s;
        rm -r %(tmpout)s
    '''
    P.run(statement)


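# A usage sketch for runFIMO (all paths and the option name below are
# hypothetical, not taken from the pipeline itself): entries in `options`
# override the same keys in PARAMS["fimo_options"], and the outfile must end
# in ".txt" because the .xml/.log names are derived from it.
#
# runFIMO("meme.dir/track.meme",
#         "fasta.dir/track.fasta",
#         "fimo.dir/track.fimo.txt",
#         "export/fimo",
#         options={"thresh": "1e-4"})

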
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]

    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)


def findNPeaksForIndividualReplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]

    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)


def linkBamToWorkingDirs(infiles, outfile):
    '''
    symlink the bam file and index to the working directories for
    execution of the transcript building pipeline
    '''
    bamfile = P.snip(infiles[0], ".bai")
    indexfile = infiles[0]
    directories = [P.snip(logfile, ".log") for logfile in infiles[1]]

    for directory in directories:
        os.symlink(os.path.abspath(bamfile),
                   os.path.join(directory, bamfile))
        os.symlink(os.path.abspath(indexfile),
                   os.path.join(directory, indexfile))
    updateFile(outfile)


def splitBamfiles(infile, sentinel):
    """
    For all tracks, split the filtered bamfile in two using pysam
    """
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    P.touch(sentinel)


def mergeEffects(infiles, outfile):
    '''load transcript effects into a single table.'''
    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "r").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load("effects.txt",
           outfile,
           options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            print(statfile)
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "r").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()
        tmpfilename = outf.name

        P.load(outf.name,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
                       "--allow-empty-file "
                       "--ignore-column=seq_na "
                       "--ignore-column=seq_aa")


def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContext` and load
    into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in suffix.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.
    """

    header = ",".join([P.snip(os.path.basename(x), suffix) for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=track")

    statement = """cgat combine_tables
                       --header-names=%(header)s
                       --missing-value=0
                       --skip-titles
                       %(filenames)s
                   | perl -p -e "s/bin/track/; s/\?/Q/g"
                   | cgat table2table --transpose
                   | %(load_statement)s
                   > %(outfile)s"""
    P.run(statement)


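# A usage sketch for loadSummarizedContextStats (filenames are hypothetical):
# the suffix is stripped from each input to form the track names used as
# column headers, and after the transpose each loaded row corresponds to one
# track.
#
# loadSummarizedContextStats(
#     ["sampleA.contextstats.tsv.gz", "sampleB.contextstats.tsv.gz"],
#     "context_stats.load")

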
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename)


def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.
    '''
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """cgat combine_tables
                       --header-names=%(header)s
                       --missing-value=0
                       --ignore-empty
                       %(filenames)s
                   | perl -p -e "s/bin/track/"
                   | cgat table2table --transpose
                   | cgat csv2db
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s"""
    P.run()


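# A usage sketch for mergeAndLoad (filenames are hypothetical): each input is
# a two-column table; the suffix is stripped to build the comma-separated
# header of track names, the tables are combined column-wise, transposed and
# loaded into a table named after the outfile.
#
# mergeAndLoad(["sampleA.counts.tsv.gz", "sampleB.counts.tsv.gz"],
#              "counts_summary.load",
#              suffix=".counts.tsv.gz")

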
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and adds read groups using
    SAMtools, realigns around indels and recalibrates base quality scores
    using GATK.'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir()
    job_memory = PARAMS["gatk_memory"]

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])

    IOTools.zap_file(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    IOTools.zap_file(outfile2)


def loadPicardCoverageStats(infiles, outfile):
    '''import coverage statistics into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.get_temp_file(".")
    first = True

    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()

    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)


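# A usage sketch for loadPicardCoverageStats (filenames are hypothetical):
# one Picard coverage metrics file per track, with the track name taken from
# the basename minus ".cov"; the table name is derived from the logfile.
#
# loadPicardCoverageStats(["sampleA.cov", "sampleB.cov"],
#                         "picard_coverage.load")

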
def find_utrons(infiles, outfiles):
    infile, reference, classfile = infiles

    job_threads = 2
    job_memory = "16G"

    all_out, part_out, novel_out = outfiles

    track = P.snip(all_out, ".all_utrons.bed.gz")

    current_file = __file__
    pipeline_path = os.path.abspath(current_file)
    pipeline_directory = os.path.dirname(pipeline_path)
    script_path = "pipeline_utrons/find_utrons.py"
    find_utrons_path = os.path.join(pipeline_directory, script_path)

    statement = '''cgat gtf2gtf
                       -I %(infile)s
                       --method=sort
                       --sort-order=gene+transcript
                       -L %(track)s.log
                   | python %(find_utrons_path)s
                       --reffile=%(reference)s
                       --class-file=%(classfile)s
                       --outfile %(all_out)s
                       --partfile=%(part_out)s
                       --novel-file=%(novel_out)s
                       -L %(track)s.log'''
    P.run(statement)


def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s
                           | grep -v "#"
                           | cgat gtf2gtf
                                 --method=sort --sort-order=gene
                                 --log=%(outfile)s.log
                           | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''zcat %(infile)s
                           | cgat gtf2gtf
                                 --method=renumber-genes
                                 --pattern-identifier=%(gene_pattern)s%%i
                           | cgat gtf2gtf
                                 --method=renumber-transcripts
                                 --pattern-identifier=%(transcript_pattern)s%%i
                           | cgat gtf2gtf
                                 --method=sort --sort-order=gene
                                 --log=%(outfile)s.log
                           | gzip > %(outfile)s'''

    P.run()


def run_test(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively.
    '''

    track = P.snip(outfile, ".log")
    pipeline_name = PARAMS.get("%s_pipeline" % track, track[len("test_"):])
    pipeline_targets = P.as_list(
        PARAMS.get("%s_target" % track, "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    # to_cluster = False

    template_statement = ("cd %%(track)s.dir; "
                          "xvfb-run -d cgatflow %%(pipeline_name)s "
                          "%%(pipeline_options)s "
                          "%%(workflow_options)s make %s "
                          "-L ../%%(outfile)s "
                          "-S ../%%(outfile)s.stdout "
                          "-E ../%%(outfile)s.stderr")

    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(statement, ignore_errors=True, job_memory="unlimited")
    else:
        statements = []
        for pipeline_target in pipeline_targets:
            statements.append(template_statement % pipeline_target)
        P.run(statements, ignore_errors=True, job_memory="unlimited")


def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard'''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics
                       INPUT=%(infile)s
                       REFERENCE_SEQUENCE=%%(samtools_genome)s
                       ASSUME_SORTED=true
                       OUTPUT=%(outfile)s
                       VALIDATION_STRINGENCY=SILENT''' % locals()
    P.run()


def makeRates(infiles, outfile):
    '''compute nucleotide substitution rates for transcripts from a gtf
    file - this applies only for transcripts mapped onto the reference
    genome.

    Sequences from the transcripts are mapped onto the rate genome.

    Softmasked sequence will be ignored unless track is in
    CONTROL_TRACKS.

    The longest contiguous block is selected ignoring matches to other
    parts of the genome.
    '''
    infile_sequences, infile_gtf, alignment = infiles

    track = P.snip(infile_sequences, ".fasta")

    if track in TRACKS_CONTROL:
        # when aligning repeats, do not mask lower case characters
        mask = ""
    else:
        mask = "--mask-lowercase"

    # locate target genome from ancestral repeats ini file
    target_genome = os.path.join(PARAMS["genome_dir"],
                                 PARAMS_ANCESTRAL_REPEATS["target"])

    statement = '''gunzip < %(infile_gtf)s
                   | cgat gtf2gtf --method=sort --sort-order=gene
                   | cgat gff2psl
                         --is-gtf
                         --genome-file=%(genome_dir)s/%(genome)s
                         --log=%(outfile)s.log
                   | pslMap stdin <(gunzip < %(alignment)s ) stdout
                   | sort -k10,10 -k14,14 -k9,9 -k12,12n
                   | cgat psl2psl
                         --method=merge
                         --log=%(outfile)s.log
                   | cgat psl2psl
                         --method=select-query
                         --select=most-nmatches
                         --log=%(outfile)s.log
                   | cgat psl2psl
                         --method=add-sequence
                         --target-psl-file=%(target_genome)s
                         --queries-tsv-file=%(infile_sequences)s
                         --log=%(outfile)s.log
                   | %(cmd-farm)s
                         --split-at-lines=10000
                         --output-header
                         --log=%(outfile)s.log
                     "cgat psl2table %(mask)s
                          --method=counts
                          --method=baseml
                          --baseml-model=REV"
                   | gzip > %(outfile)s'''
    P.run()


def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0] for x in cc.execute(
        "SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:

        tmpf = P.get_temp_file(".")

        for infile in infiles:
            table = P.to_table(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                       FROM %(table)s
                       WHERE motif = '%(motif)s' AND start IS NOT NULL""" %
                    locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run(statement)

        os.unlink(tmpf.name)


def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
       to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substitute_parameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.as_list(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)


def runSleuth(design, base_dir, model, contrasts, outfile, counts, tpm,
              fdr, lrt=False, reduced_model=None):
    '''run sleuth.

    Note: all samples in the design table must also have a directory
    with the same name in `base_dir` with kallisto results in a file
    called abundance.h5
    '''

    outfile_prefix = P.snip(outfile, ".tsv")

    Design = Expression.ExperimentalDesign(design)
    exp = Expression.DEExperiment_Sleuth()

    res = exp.run(Design, base_dir, model, contrasts, outfile_prefix,
                  counts, tpm, fdr, lrt, reduced_model)

    res.getResults(fdr)

    for contrast in set(res.table['contrast']):
        res.plotMA(contrast, outfile_prefix)
        res.plotVolcano(contrast, outfile_prefix)

    res.table.to_csv(outfile, sep="\t", index=False)
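

# Expected inputs for runSleuth, as implied by the docstring (sample, file and
# contrast names below are hypothetical): every sample listed in the design
# table needs a matching kallisto output directory under base_dir, e.g.
#
#   kallisto.dir/
#       sampleA/abundance.h5
#       sampleB/abundance.h5
#
# runSleuth("design.tsv", "kallisto.dir", "~condition", "conditionB",
#           "sleuth_results.tsv", "counts.tsv", "tpm.tsv", fdr=0.05)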