def assembleWithStringTie(infiles, outfile):
    '''filter alignments with portcullis and assemble transcripts
    with StringTie against a reference geneset.'''

    infile, reference = infiles
    basefile = os.path.basename(infile)

    job_threads = PARAMS["stringtie_threads"]
    job_memory = PARAMS["stringtie_memory"]

    tmpfile = P.get_temp_filename()
    if os.path.exists(tmpfile):
        os.unlink(tmpfile)

    statement = '''
    portcullis full
        -t 1
        -o portcullis/%(basefile)s/
        -r %(portcullis_bedref)s
        -b %(portcullis_fastaref)s
        %(infile)s &&
    mv portcullis/%(basefile)s/portcullis.filtered.bam %(tmpfile)s &&
    rm -r portcullis/%(basefile)s/ &&
    stringtie %(tmpfile)s
        -p %(stringtie_threads)s
        -G <(zcat %(reference)s)
        %(stringtie_options)s
        2> %(outfile)s.log
    | gzip > %(outfile)s &&
    rm %(tmpfile)s'''

    # remote (e.g. GDC) BAM files are staged locally first
    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile, token, tmpfilename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        statement = "; ".join(
            ["mkdir -p %(tmpfilename)s",
             s,
             statement,
             "rm -r %(tmpfilename)s"])

    P.run(statement)
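# Illustrative wiring only (not from the pipeline): a ruffus-style task that
# would feed this function a (bam, reference-geneset) tuple. The suffixes,
# directory name and "geneset.gtf.gz" are assumptions for the sketch.
from ruffus import transform, regex, add_inputs


@transform("*.bam",
           regex(r"(.*)\.bam"),
           add_inputs("geneset.gtf.gz"),
           r"stringtie.dir/\1.gtf.gz")
def assembleTranscripts(infiles, outfile):
    # infiles == ("<sample>.bam", "geneset.gtf.gz")
    assembleWithStringTie(infiles, outfile)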
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s
        -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
def quantifyWithSalmon(infiles, outfile):
    '''Quantify existing samples against genesets'''
    job_threads = 2
    job_memory = "16G"

    infile, gtffile = infiles
    basefile = os.path.basename(infile)
    sample_name = basefile.split(os.extsep, 1)
    sorted_bam = "sorted_bams/" + sample_name[0] + "_sorted.bam"
    gtfbase = P.snip(os.path.basename(gtffile), ".gz")
    salmonIndex = "salmon_index/" + gtfbase + ".salmon.index"
    fastq1 = P.snip(outfile, "_agg-agg-agg") + ".1.fastq"
    fastq2 = P.snip(outfile, "_agg-agg-agg") + ".2.fastq"
    salmon_options = PARAMS["salmon_quantoptions"]

    statement = '''
    samtools sort -n %(infile)s -o %(sorted_bam)s;
    samtools fastq
        -1 %(fastq1)s
        -2 %(fastq2)s
        -0 /dev/null -s /dev/null -n -F 0x900
        %(sorted_bam)s;
    salmon quant
        -i %(salmonIndex)s
        --libType IU
        -1 %(fastq1)s
        -2 %(fastq2)s
        -o %(outfile)s
        %(salmon_options)s;
    mv %(outfile)s/quant.sf %(outfile)s.sf;
    rm %(fastq1)s; rm %(fastq2)s; rm %(sorted_bam)s
    '''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        filename = "temp_bams/%s" % basefile
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile, token, filename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        statement = "; ".join(
            ["mkdir %(filename)s", s, statement, "rm -r %(filename)s"])

    P.run(statement)
def download(self, genes=None, fields=None, scope=None, species=None):
    '''
    download an up-to-date ontology file, parse the xml data into a
    Python "ElementTree" and delete the ontology file.
    '''
    ontologyfile = P.get_temp_filename(".")
    os.system("wget -O %s %s" % (ontologyfile, self.datasource))
    tree = ET.parse(ontologyfile)
    os.remove(ontologyfile)
    self.dataset = tree
def runFIMO(motifs, database, outfile, exportdir, options={}):
    '''run fimo to look for occurrences of motifs supplied in a sequence
    database.

    :param:`motifs` is the path to a MEME formatted motif file.
    :param:`database` is a fasta file.
    :param:`outfile` is the text output from fimo.
    :param:`exportdir` specifies the directory to put exported files
    (html, gff).
    :param:`options` is a dictionary: {'option': 'value'} will be passed
    as --option=value and will overwrite options specified in PARAMS.
    '''

    # if the motifs file is empty, fimo will return an error;
    # this isn't very useful behaviour.
    inlines = IOTools.open_file(motifs).read()

    if not re.search("MOTIF", inlines):
        E.warning("No motifs found in %s" % motifs)
        P.touch(outfile)
        return
    else:
        E.debug("%s: %i motifs found" %
                (motifs, len(re.findall("MOTIF", inlines))))

    fimo_options = PARAMS.get("fimo_options", "")
    for option, value in options.items():
        fimo_options = re.sub("%s=\S+" % option, "", fimo_options)
        if value is None:
            fimo_options += " --%s" % option
        else:
            fimo_options += " --%s=%s" % (option, value)

    tmpout = P.get_temp_filename()

    track = os.path.basename(outfile)
    exportdir = os.path.abspath(exportdir)
    xmlout = P.snip(outfile, ".txt") + ".xml"
    logfile = P.snip(outfile, ".txt") + ".log"
    gffout = os.path.join(exportdir, track + ".gff")
    htmlout = os.path.join(exportdir, track + ".html")

    statement = '''
    fimo --oc %(tmpout)s %(fimo_options)s %(motifs)s %(database)s
    &> %(logfile)s;
    mv %(tmpout)s/fimo.txt %(outfile)s;
    mv %(tmpout)s/fimo.xml %(xmlout)s;
    mv %(tmpout)s/fimo.gff %(gffout)s;
    mv %(tmpout)s/fimo.html %(htmlout)s;
    rm -r %(tmpout)s
    '''

    P.run(statement)
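# Illustrative sketch (not part of the pipeline): how the options dict is
# folded into the fimo_options string above - any existing option=value in
# PARAMS is blanked out before the new value is appended.
import re

fimo_options = "--max-stored-scores 100000 --thresh=1e-4"
options = {"thresh": "1e-6", "text": None}

for option, value in options.items():
    fimo_options = re.sub(r"%s=\S+" % option, "", fimo_options)
    if value is None:
        fimo_options += " --%s" % option
    else:
        fimo_options += " --%s=%s" % (option, value)

print(fimo_options)
# -> '--max-stored-scores 100000 -- --thresh=1e-6 --text'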
def buildRefFlat(infile, outfile):
    '''build flat geneset for Picard RnaSeqMetrics.'''

    tmpflat = P.get_temp_filename(".")

    statement = '''
    gtfToGenePred -genePredExt -geneNameAsName2 %(infile)s %(tmpflat)s;
    paste <(cut -f 12 %(tmpflat)s) <(cut -f 1-10 %(tmpflat)s)
    > %(outfile)s
    '''
    P.run(statement, job_memory=PARAMS["job_memory"])
    os.unlink(tmpflat)
def loadManualAnnotations(infile, outfile):
    '''load a manual annotation list into the database, tagging each
    gene_id with the annotation name derived from the filename.'''

    tmp = P.get_temp_filename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.open_file(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.open_file(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
def aggregateAdaptors(infiles, outfile):
    '''
    Collate fasta files into a single contaminants file for
    adapter removal.
    '''
    tempfile = P.get_temp_filename()
    infiles = " ".join(infiles)

    statement = """
    cat %(infiles)s | fastx_reverse_complement > %(tempfile)s;
    cat %(tempfile)s %(infiles)s | fastx_collapser > %(outfile)s;
    rm -f %(tempfile)s
    """
    P.run(statement)
def test_job_should_write_to_explicit_temp_and_not_clean_up(self):
    outfile = os.path.join(self.work_dir, "out")
    tmpfile = P.get_temp_filename(clear=True)
    P.run("hostname > {outfile}; "
          "echo {tmpfile} > {tmpfile}; "
          "cat {tmpfile} >> {outfile}".format(outfile=outfile,
                                              tmpfile=tmpfile),
          to_cluster=False)
    with IOTools.open_file(outfile) as outf:
        hostname = outf.readline().strip()
        tmpfile_read = outf.readline().strip()

    self.assertEqual(tmpfile, tmpfile_read)
    self.assertTrue(self.file_exists(tmpfile, hostname, expect=True))
    os.unlink(tmpfile)
def makeCytoscapeInputs(infiles, outfile):
    '''reshape an enrichment results table for import into Cytoscape,
    keeping only the terms listed in the matching cytoscape_* file.'''
    infile = infiles[1]
    T = P.get_temp_filename(".")
    statement = """
    awk -F "\\t" '{printf("%%%%s\\t%%%%s\\t%%%%s\\t%%%%s\\t+1\\n",\
    $1, $12, $8, $9)}' %(infile)s > %(T)s""" % locals()
    P.run(statement)

    typ = infile.split("_")[-3]
    keep = [line.strip() for line in
            IOTools.open_file(PARAMS['cytoscape_%s' % typ]).readlines()]

    tab = pd.read_csv(T, sep="\t")
    tab = tab[tab['term_id'].isin(keep)]
    tab.columns = ['ID', 'Description', 'pvalue', 'padj', 'Phenotype']
    tab.to_csv(outfile, sep="\t", index=None)
    os.remove(T)
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset,
                           fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''

    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.get_temp_filename()

    statement = '''
    cgat annotator2tsv \
    --method=fdr-table \
    --fdr-method=%(fdr_method)s \
    --log=%(outfile)s.log \
    --regex-identifier="(.*)%(suffix)s" \
    %(infile)s > %(tmpfilename)s
    '''
    P.run(statement)

    tmpfile = P.get_temp_file()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()

    tmpfilename2 = tmpfile.name

    statement = '''
    cgat csv2db %(csv2db_options)s \
    --table=%(table)s < %(tmpfilename2)s > %(outfile)s'''

    P.run(**dict(list(locals().items()) + list(P.get_params().items())))

    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
def makeSalmonIndex(infile, outfile):
    '''build a salmon index from the transcript sequences of a geneset.'''
    # Long transcripts cause indexing to use lots of memory?
    job_memory = "64G"
    job_threads = 1

    gtf_basename = P.snip(os.path.basename(infile), ".gtf.gz")
    transcript_fasta = "salmon_index/" + gtf_basename + "transcripts.fa"
    fastaref = PARAMS["portcullis_fastaref"]
    index_options = PARAMS["salmon_indexoptions"]
    tmpfile = P.get_temp_filename()

    statement = '''
    gunzip -c %(infile)s > %(tmpfile)s;
    gffread %(tmpfile)s -g %(fastaref)s -w %(transcript_fasta)s;
    salmon index -p %(job_threads)s %(index_options)s
        -t %(transcript_fasta)s -i %(outfile)s --perfectHash;
    rm %(tmpfile)s
    '''
    P.run(statement)
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.is_empty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if IOTools.is_empty(motiffile):
            E.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
               >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
               >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run(statement)

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within a file
    are merged) before intersection.

    Intervals are renumbered starting from 1.
    '''
    liver, testes = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")
    to_cluster = True

    statement = '''cat %(liver)s %(testes)s | mergeBed -i stdin | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}' > replicated_intervals/liver.testes.merge.bed;
                   echo "Total merged intervals" > %(outfile)s; cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s;
                   echo "Liver & testes" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -u | intersectBed -a stdin -b %(testes)s -u > replicated_intervals/liver.testes.shared.bed; cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s;
                   echo "Testes only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -v > replicated_intervals/%(testes_name)s.liver.testes.unique.bed; cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;
                   echo "Liver only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(testes)s -v > replicated_intervals/%(liver_name)s.liver.testes.unique.bed; cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;
                   sed -i '{N;s/\\n/\\t/g}' %(outfile)s; '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:
        if IOTools.is_empty(infiles[0]) or IOTools.is_empty(infiles[1]):
            IOTools.touch_file(outfile)
        else:
            statement = '''
            intersectBed -u -a %s -b %s
            | cut -f 1,2,3,4,5
            | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
            > %%(outfile)s ''' % (infiles[0], infiles[1])
            P.run(statement)

    else:
        tmpfile = P.get_temp_filename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.is_empty(infiles[0]):
            IOTools.touch_file(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run(statement)

        for fn in infiles[1:]:
            if IOTools.is_empty(fn):
                IOTools.touch_file(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s
            | intersectBed -u -a %(tmpfile)s -b stdin
            > %(tmpfile)s.tmp;
            mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run(statement)

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run(statement)

        os.unlink(tmpfile)
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
               for each bin of expression and fold-change

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
                    level of fdr and power.
        intervals_percent - percentage of intervals in observed data
              at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.

    '''

    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        IOTools.touch_file(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.get_temp_filename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run(statement)
    E.debug("outputting unspiked counts")

    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert xedges.all() == unspiked_xedges.all()

    tmpfile = IOTools.open_file(tmpfile_name, "w")
    tmpfile.write("\t".join(
        ("expression", "fold", "fdr", "counts", "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = IOTools.open_file(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to percentage of spike-ins per bin
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(map(
                str,
                (xedges[x], yedges[y],
                 fdr,
                 spiked_d2hist_fdr[x, y],
                 100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(map(
                str,
                (fdr, power,
                 power_counts.sum().sum(),
                 100.0 * power_counts.sum().sum() / unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.to_table(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)
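# Toy illustration (made-up numbers, not pipeline output) of the power
# masking above: bins whose spike-in recovery reaches the power threshold
# are selected, and the observed interval counts in those bins are summed.
import numpy

# fraction of spike-ins recovered below the FDR threshold, per 2D bin
spiked_d2hist_fdr_normed = numpy.array([[0.9, 0.3],
                                        [0.6, -1.0]])   # -1 marks empty bins
# observed (unspiked) interval counts in the same bins
unspiked_d2hist_counts = numpy.array([[100, 40],
                                      [60, 0]])

power = 0.5
power_take = spiked_d2hist_fdr_normed >= power   # detectable bins
power_counts = unspiked_d2hist_counts[power_take]
print(power_counts.sum())   # -> 160 intervals detectable at power >= 0.5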
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard',
                            filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for counting-based
    methods.

    This method is not appropriate for RNA-Seq.

    Optional steps include:

    For paired end data, pairs are merged and optionally filtered by
    insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set, remove reads with a quality score below given threshold.
    filtering_dedup : bool
        If True, deduplicate data.
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.

    '''
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.is_paired(bamfile)
    current_file = bamfile
    tmpdir = P.get_temp_filename()

    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b
        %(current_file)s
        2>> %%(bedfile)s.quality.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_nonunique:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''cat %(current_file)s
        | cgat bam2bam
        --method=filter
        --filter-method=unique,mapped
        --log=%%(bedfile)s.nonunique.log
        2> %%(bedfile)s.nonunique.err
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            statement.append('''samtools rmdup - - ''')

        elif filtering_dedup_method == 'picard':
            statement.append('''picard MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            >& %%(bedfile)s.markdup.log ''' % locals())

        nfiles += 1
        current_file = next_file

    if is_paired:
        statement.append('''cat %(current_file)s
        | cgat bam2bed
        --merge-pairs
        --min-insert-size=%(filtering_min_insert_size)i
        --max-insert-size=%(filtering_max_insert_size)i
        --log=%(bedfile)s.bam2bed.log
        -
        2> %(bedfile)s.bam2bed.err
        | cgat bed2bed
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.sanitize.log
        2> %(bedfile)s.sanitize.err
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip
        > %(bedfile)s''')
    else:
        statement.append('''cat %(current_file)s
        | cgat bam2bed
        --log=%(bedfile)s.bam2bed.log
        -
        2> %(bedfile)s.bam2bed.err
        | cgat bed2bed
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.sanitize.log
        2> %(bedfile)s.sanitize.err
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip
        > %(bedfile)s''')

    statement.append("tabix -p bed %(bedfile)s >& %(bedfile)s.tabix.log")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)

    P.run(statement, job_memory="8G")
def aggregateWindowsTagCounts(infiles,
                              outfile,
                              regex="(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features
      in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    This method autodetects the number of columns in the
    :term:`infiles` and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
    outfile : string
        Output filename in :term:`tsv` format.
    regex : string
        Regular expression used to extract the track name from the
        filename. The default removes any suffix.

    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join([
        """<( zcat %s |
        awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" %
        (x, column) for x in infiles])
    tmpfile = P.get_temp_filename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run(statement)

    # build track names
    tracks = [re.search(regex, os.path.basename(x)).groups()[0]
              for x in infiles]

    outf = IOTools.open_file(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    # filter for uniqueness - keys with the same value as the
    # previous line will be ignored.
    last_gene = None
    c = E.Counter()
    for line in open(tmpfile, "r"):
        c.input += 1
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]

        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        if genes[0] == last_gene:
            c.duplicates += 1
            continue
        c.output += 1
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))
        last_gene = genes[0]

    outf.close()

    os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)
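# Illustrative only: how one line of the pasted temporary file is split by
# the loop above into a single interval id plus one count per input file
# (two inputs, made-up values).
line = "chr1:0-1000\t7\tchr1:0-1000\t12\n"
data = line[:-1].split("\t")
genes = list(set(data[0::2]))           # interval ids (every other column)
values = [int(x) for x in data[1::2]]   # counts from the remaining columns
assert genes == ["chr1:0-1000"]
assert values == [7, 12]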
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase "pseudo".
      This is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    Arguments
    ---------
    infiles : list
        Filenames of ENSEMBL geneset in :term:`gtf` format and
        associated peptide sequences in :term:`fasta` format.
    outfile : filename
        Output in :term:`gtf` format with inferred or annotated
        pseudogenes.
    dbhandle : object
        Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        | awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run(statement)
        return

    tmpfile1 = P.get_temp_filename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | cgat gff2fasta
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''
    P.run(statement)

    if IOTools.is_empty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        IOTools.touch_file(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run(statement)

    os.unlink(tmpfile1)

    inf = IOTools.open_file("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)

    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([x[0] for x in cc.execute(
        """SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
        gene_biotype like '%pseudo%' """)])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" % (
               (len(new_pseudos),
                len(known_pseudos),
                len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.open_file(outfile, "w")
    inf = GTF.iterator(IOTools.open_file(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)

    outf.close()

    E.info("exons: %s" % str(c))
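# Toy illustration (made-up rows) of the best-link selection above: for each
# transcript, keep the highest-scoring peptide hit from the exonerate output.
rows = [("pepA", "tx1", 250), ("pepB", "tx1", 300), ("pepC", "tx2", 220)]
best_matches = {}
for peptide_id, transcript_id, score in rows:
    if transcript_id in best_matches and \
       best_matches[transcript_id][0] > score:
        continue
    best_matches[transcript_id] = (score, peptide_id)
print(best_matches)   # {'tx1': (300, 'pepB'), 'tx2': (220, 'pepC')}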
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against the genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.

    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        IOTools.touch_file(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" %
               PARAMS["numts_mitochrom"])
        IOTools.touch_file(outfile)
        return

    tmpfile_mito = P.get_temp_filename(".")

    statement = '''
    cgat index_fasta
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''
    P.run(statement)

    if IOTools.is_empty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        IOTools.touch_file(outfile)
        return

    format = ("qi", "qS", "qab", "qae",
              "ti", "tS", "tab", "tae",
              "s",
              "pi",
              "C")
    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run(statement)

    # convert to gtf
    inf = IOTools.open_file("%s.links.gz" % outfile)
    outf = IOTools.open_file(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end,
         target_contig, target_strand, target_start, target_end,
         score, pid, alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)
def summarizeTagsWithinContext(tagfile,
                               contextfile,
                               outfile,
                               min_overlap=0.5,
                               job_memory="15G"):
    '''count occurrences of tags in genomic context.

    Examines the genomic context to which tags align.

    A tag is assigned to the genomic context that it overlaps by at
    least 50%. Thus some reads mapping to several contexts might be
    dropped.

    Arguments
    ---------
    tagfile : string
        Filename with tags. The file can be :term:`bam` or
        :term:`bed` format.
    contextfile : string
        Filename of :term:`bed` formatted files with named intervals
        (BED4).
    outfile : string
        Output in :term:`tsv` format.
    min_overlap : float
        Minimum overlap (fraction) to count features as overlapping.
    job_memory : string
        Memory to reserve.
    '''

    tmpfile = P.get_temp_filename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(2)]

    statement = '''
    cgat bam_vs_bed
          --min-overlap=%(min_overlap)f
          --log=%(outfile)s.log
          %(tagfile)s %(contextfile)s
    > %(tmpfile)s_0
    '''
    P.run(statement)

    statement = '''
    printf "intergenic\\t" >> %(tmpfile)s_1'''
    P.run(statement)

    statement = '''
    bedtools intersect -a %(tagfile)s -b %(contextfile)s -bed -v
    | wc -l
    | xargs printf
    >> %(tmpfile)s_1
    '''
    P.run(statement)

    files = " ".join(tmpfiles)
    statement = '''
    sort --merge %(files)s | gzip > %(outfile)s
    '''
    P.run(statement)

    for x in tmpfiles:
        os.unlink(x)
def resetGTFAttributes(infile, genome, gene_ids, outfile):
    """set GTF attributes in :term:`gtf` formatted file so that they are
    compatible with cufflinks.

    This method runs cuffcompare with `infile` against itself to add
    attributes such as p_id and tss_id.

    Arguments
    ---------
    infile : string
        Filename of :term:`gtf`-formatted input file
    genome : string
       Filename (without extension) of indexed genome file
       in :term:`fasta` format.
    gene_ids : dict
       Dictionary mapping transcript ids to gene ids.
    outfile : string
       Output filename in :term:`gtf` format
    """

    tmpfile1 = P.get_temp_filename(".")
    tmpfile2 = P.get_temp_filename(".")

    #################################################
    E.info("adding tss_id and p_id")

    # The p_id attribute is set if the fasta sequence is given.
    # However, there might be some errors in cuffdiff downstream:
    #
    # cuffdiff: bundles.cpp:479: static void HitBundle::combine(const std::
    # vector<HitBundle*, std::allocator<HitBundle*> >&, HitBundle&): Assertion
    # `in_bundles[i]->ref_id() == in_bundles[i-1]->ref_id()' failed.
    #
    # I was not able to resolve this, it was a complex
    # bug dependent on both the read libraries and the input reference gtf
    # files
    job_memory = "5G"

    statement = '''
    cuffcompare -r <( gunzip < %(infile)s )
         -T
         -s %(genome)s.fa
         -o %(tmpfile1)s
         <( gunzip < %(infile)s )
         <( gunzip < %(infile)s )
    > %(outfile)s.log
    '''
    P.run(statement)

    #################################################
    E.info("resetting gene_id and transcript_id")

    # reset gene_id and transcript_id to ENSEMBL ids
    # cufflinks patch:
    # make tss_id and p_id unique for each gene id
    outf = IOTools.open_file(tmpfile2, "w")
    map_tss2gene, map_pid2gene = {}, {}
    inf = IOTools.open_file(tmpfile1 + ".combined.gtf")

    def _map(gtf, key, val, m):
        if val in m:
            while gene_id != m[val]:
                val += "a"
                if val not in m:
                    break
        m[val] = gene_id

        gtf.setAttribute(key, val)

    for gtf in GTF.iterator(inf):
        transcript_id = gtf.oId
        gene_id = gene_ids[transcript_id]
        gtf.setAttribute("transcript_id", transcript_id)
        gtf.setAttribute("gene_id", gene_id)

        # set tss_id
        try:
            tss_id = gtf.tss_id
        except AttributeError:
            tss_id = None
        try:
            p_id = gtf.p_id
        except AttributeError:
            p_id = None

        if tss_id:
            _map(gtf, "tss_id", tss_id, map_tss2gene)
        if p_id:
            _map(gtf, "p_id", p_id, map_pid2gene)

        outf.write(str(gtf) + "\n")

    outf.close()

    # sort gtf file
    PipelineGeneset.sortGTF(tmpfile2, outfile)

    # make sure tmpfile1 is NEVER empty
    assert tmpfile1
    for x in glob.glob(tmpfile1 + "*"):
        os.unlink(x)
    os.unlink(tmpfile2)
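# Toy illustration of _map() above: a tss_id already claimed by another gene
# gets "a" appended until it is unique, so each gene keeps its own id.
m = {"TSS1": "geneA"}     # TSS1 already assigned to geneA
gene_id = "geneB"
val = "TSS1"
if val in m:
    while gene_id != m[val]:
        val += "a"
        if val not in m:
            break
m[val] = gene_id
print(val)   # -> 'TSS1a'
print(m)     # -> {'TSS1': 'geneA', 'TSS1a': 'geneB'}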
def buildGenomicContext(infiles, outfile, distance=10):
    '''build a :term:`bed` formatted file with genomic context.

    The output is a bed formatted file, annotating genomic segments
    according to whether they are any of the ENSEMBL annotations.

    The function also adds the RNA and repeats annotations from the
    UCSC.

    The annotations can be partially or fully overlapping.

    Adjacent features (less than 10 bp apart) of the same type are
    merged.

    Arguments
    ---------
    infiles : list
       A list of input files to generate annotations from. The contents are

       1. ``repeats``, a :term:`gff` formatted file with repeat annotations

       2. ``rna``, a :term:`gff` formatted file with small, repetitive
          RNA annotations

       3. ``annotations``, a :term:`gtf` formatted file with genomic
          annotations, see :func:`annotateGenome`.

       4. ``geneset_flat``, a flattened gene set in :term:`gtf` format,
          see :func:`buildFlatGeneSet`.

       5. ``cpgisland_bed``, a :term:`bed` formatted file with CpG islands.

       6. ``go_tsv``, a :term:`tsv` formatted file mapping GO terms to
          gene ids, used to flag ribosomal protein coding genes.

    outfile : string
       Output filename in :term:`bed` format.
    distance : int
       Merge adjacent features of the same type within this distance.
    '''

    repeats_gff, rna_gff, annotations_gtf, geneset_flat_gff, \
        cpgisland_bed, go_tsv = infiles

    tmpfile = P.get_temp_filename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(6)]

    # add ENSEMBL annotations
    statement = """
    zcat %(annotations_gtf)s
    | cgat gtf2gtf
    --method=sort --sort-order=gene
    | cgat gtf2gtf
    --method=merge-exons
    --log=%(outfile)s.log
    | cgat gff2bed
    --set-name=gene_biotype
    --is-gtf
    --log=%(outfile)s.log
    | sort -k 1,1 -k2,2n
    | cgat bed2bed
    --method=merge
    --merge-by-name
    --merge-distance=%(distance)i
    --log=%(outfile)s.log
    > %(tmpfile)s_0
    """
    P.run(statement)

    # rna
    statement = '''
    zcat %(repeats_gff)s %(rna_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
      --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_1'''
    P.run(statement)

    # add aggregate intervals for repeats
    statement = '''
    zcat %(repeats_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | awk -v OFS="\\t" '{$4 = "repeats"; print}'
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
      --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_2'''
    P.run(statement)

    # add aggregate intervals for rna
    statement = '''
    zcat %(rna_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | awk -v OFS="\\t" '{$4 = "repetetive_rna"; print}'
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
      --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_3 '''
    P.run(statement)

    # add ribosomal protein coding genes
    goids = ("GO:0003735", )

    patterns = "-e %s" % ("-e ".join(goids))

    statement = '''
    zcat %(geneset_flat_gff)s
    | cgat gtf2gtf
    --map-tsv-file=<(zcat %(go_tsv)s | grep %(patterns)s | cut -f 2 | sort | uniq)
    --method=filter
    --filter-method=gene
    --log=%(outfile)s.log
    | cgat gff2bed
    --log=%(outfile)s.log
    | awk -v OFS="\\t" '{$4 = "ribosomal_coding"; print}'
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
      --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_4
    '''
    P.run(statement)

    # CpG islands
    statement = '''
    zcat %(cpgisland_bed)s
    | awk '{printf("%%s\\t%%i\\t%%i\\tcpgisland\\n", $1,$2,$3 )}'
    > %(tmpfile)s_5
    '''
    P.run(statement)

    # sort and merge
    # remove strand information as bedtools
    # complains if there are annotations with
    # different numbers of fields
    files = " ".join(tmpfiles)
    statement = '''
    sort --merge -k1,1 -k2,2n %(files)s
    | cut -f 1-4
    | gzip
    > %(outfile)s
    '''
    P.run(statement)

    for x in tmpfiles:
        os.unlink(x)
def buildGenomicContext(infiles, outfile, distance=10, job_memory="4G"):
    '''build a :term:`bed` formatted file with genomic context.

    The output is a bed formatted file, annotating genomic segments
    according to whether they are any of the ENSEMBL annotations.

    The function also adds the RNA and repeats annotations from the
    UCSC.

    The annotations can be partially or fully overlapping.

    Adjacent features (less than 10 bp apart) of the same type are
    merged.

    Arguments
    ---------
    infiles : list
       A list of input files to generate annotations from. The contents are

       1. ``repeats``, a :term:`gff` formatted file with repeat annotations

       2. ``rna``, a :term:`gff` formatted file with small, repetitive
          RNA annotations

       3. ``annotations``, a :term:`gtf` formatted file with genomic
          annotations, see :func:`annotateGenome`.

       4. ``utr``, a :term:`gtf` formatted file with UTR regions.

       5. ``intron``, a :term:`gtf` formatted file with intronic regions.

    outfile : string
       Output filename in :term:`bed` format.
    distance : int
       Merge adjacent features of the same type within this distance.
    '''

    repeats_gff, rna_gff, annotations_gtf, utr_gtf, intron_gtf = infiles

    tmpfile = P.get_temp_filename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(4)]

    # add ENSEMBL annotations
    statement = """
    zcat %(annotations_gtf)s
    | cgat gtf2gtf
    --method=sort --sort-order=gene
    | cgat gtf2gtf
    --method=merge-exons
    --log=%(outfile)s.log
    | cgat gff2bed
    --set-name=gene_biotype
    --is-gtf
    --log=%(outfile)s.log
    | sort -k 1,1 -k2,2n
    | cgat bed2bed
    --method=merge
    --merge-by-name
    --merge-distance=%(distance)i
    --log=%(outfile)s.log
    > %(tmpfile)s_0
    """
    P.run(statement, job_memory=job_memory)

    # rna
    statement = '''
    zcat %(repeats_gff)s %(rna_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
      --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_1'''
    P.run(statement, job_memory=job_memory)

    # utr
    statement = '''zcat %(utr_gtf)s
    | cgat gff2bed --is-gtf --set-name=feature
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
      --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_2'''
    P.run(statement, job_memory=job_memory)

    # intron
    statement = '''zcat %(intron_gtf)s
    | cgat gff2bed --is-gtf --set-name=feature
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
      --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_3'''
    P.run(statement, job_memory=job_memory)

    # sort and merge
    # remove strand information as bedtools
    # complains if there are annotations with
    # different numbers of fields
    files = " ".join(tmpfiles)
    statement = '''
    sort --merge -k1,1 -k2,2n %(files)s
    | cut -f 1-4
    | gzip
    > %(outfile)s
    '''
    P.run(statement, job_memory=job_memory)

    for x in tmpfiles:
        os.unlink(x)