def loadTranscriptStats(infile, outfile):
    '''compute and load transcript properties into database.

    The method calls :doc:`gtf2table` with the following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--map=gene_id:str")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2table
        --log=%(outfile)s.log
        --genome=%(genome_dir)s/%(genome)s
        --reporter=transcripts
        --counter=position
        --counter=length
        --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
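# Illustration (an assumption, not the cgatcore implementation): P.to_table
# derives the table name from the ".load" target by stripping the suffix and
# sanitising punctuation. A minimal stand-in for documentation purposes:
def _to_table_sketch(outfile):
    import os
    name = os.path.basename(outfile)
    assert name.endswith(".load")
    # replace characters that are not valid in SQL table names
    return name[:-len(".load")].replace(".", "_").replace("-", "_")

assert _to_table_sketch("geneset.transcript-stats.load") == \
    "geneset_transcript_stats"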
def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The view is built from the following tables:

    * bam_stats
    * context_stats
    '''

    dbh = connect()

    tablename = P.to_table(outfile)
    view_type = "TABLE"

    tables = (("bam_stats", "track"),
              ("context_stats", "track"))

    # do not use ("picard_stats_alignment_summary_metrics", "track"),
    # as there are multiple rows per track for paired-ended data.

    P.create_view(dbh, tables, tablename, outfile, view_type)
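# Example of querying the resulting view (a sketch; the SQLite database file
# "csvdb" and the view name "view_mapping" are hypothetical names here):
def _query_view_mapping(db="csvdb", view="view_mapping"):
    import sqlite3
    with sqlite3.connect(db) as conn:
        # each row combines bam_stats and context_stats columns per track
        return conn.execute("SELECT * FROM %s LIMIT 5" % view).fetchall()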
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0] for x in
              cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:

        tmpf = P.get_temp_file(".")

        for infile in infiles:
            table = P.to_table(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                    FROM %(table)s
                    WHERE motif = '%(motif)s' AND start IS NOT NULL""" %
                    locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms
        | gzip > %(outfile)s'''
        P.run(statement)

        os.unlink(tmpf.name)
def loadGeneStats(infile, outfile):
    """compute and load gene statistics to database.

    Gene statistics are computed by :doc:`gtf2table` with the
    following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Parameters
    ----------
    infile : string
        A :term:`gtf` file which is output from :meth:`buildGenes`
    outfile : string
        A log file. The table name is derived from `outfile`,
        e.g. ``gene_stats.load``.
    """

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=gene_id "
        "--map=gene_name:str")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2table
        --log=%(outfile)s.log
        --genome=%(genome_dir)s/%(genome)s
        --counter=position
        --counter=length
        --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContext` and load
    into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in `suffix`.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.
    """

    header = ",".join([P.snip(os.path.basename(x), suffix)
                       for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=track")

    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """
    P.run(statement)
def loadPeptideSequences(infile, outfile):
    '''load ENSEMBL peptide file into database.

    This method removes empty sequences (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372).

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        filename with logging information. The tablename is derived
        from ``outfile``.
    '''

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    statement = '''gunzip < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*//};'
    | cgat fasta2fasta --method=filter --filter-method=min-length=1
    | cgat fasta2table --section=length --section=sequence
    | perl -p -e 's/id/protein_id/'
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
def loadRepeats(infile, outfile):
    """load genomic locations of repeats into database.

    This method loads the genomic coordinates (contig, start, end)
    and the repeat name into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from `outfile`.
    """

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run(statement, job_memory=PARAMS["job_memory"])
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats",
                        tablename=False):
    '''extract a histogram from a picard output file and load it into
    database.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """cgat combine_tables
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """

    P.run(statement)
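# Sketch of what the --regex-start extraction above does, in plain Python
# (an assumption about the picard layout: the histogram block starts at a
# line beginning "## HISTOGRAM" and runs until the next blank line):
def _read_picard_histogram(filename):
    with open(filename) as inf:
        lines = inf.readlines()
    for n, line in enumerate(lines):
        if line.startswith("## HISTOGRAM"):
            block = []
            for nextline in lines[n + 1:]:
                if not nextline.strip():
                    break
                block.append(nextline.rstrip("\n").split("\t"))
            return block
    return []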
def loadMutectFilteringSummary(infile, outfile):
    '''load mutect extended output into database.'''

    dbh = connect()
    tablename = P.to_table(outfile)

    statement = '''cat %(infile)s
    | cgat csv2db
    --table %(tablename)s --retry --ignore-empty
    > %(outfile)s'''
    P.run(statement)
def loadGeneInformation(infile, outfile, only_proteincoding=False,
                        job_memory="4G"):
    '''load gene-related attributes from :term:`gtf` file into database.

    This method takes transcript-associated features from an
    :term:`gtf` file and collects the gene-related attributes in the
    9th column of the gtf file, ignoring exon_id, transcript_id,
    transcript_name, protein_id and exon_number.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Output filename, contains logging information. The table name
        is derived from the filename of `outfile`.
    only_proteincoding : bool
        If True, only consider protein coding genes.
    '''

    table = P.to_table(outfile)

    if only_proteincoding:
        filter_cmd = """cgat gtf2gtf
        --method=filter --filter-method=proteincoding"""
    else:
        filter_cmd = "cat"

    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id "
        "--add-index=gene_name "
        "--map=gene_name:str")

    statement = '''
    zcat %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | cgat gtf2gtf --method=sort --sort-order=gene+transcript
    | cgat gtf2tsv --attributes-as-columns --output-only-attributes -v 0
    | cgat csv-cut --remove exon_id transcript_id transcript_name protein_id exon_number
    | (read h; echo "$h"; sort)
    | uniq
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement, job_memory=job_memory)
def loadVCFstats(infiles, outfile):
    '''import variant statistics into SQLite.'''

    filenames = " ".join(infiles)
    tablename = P.to_table(outfile)
    csv2db_options = PARAMS["csv2db_options"]

    E.info("loading vcf stats...")

    statement = '''cgat vcfstats2db %(filenames)s >> %(outfile)s; '''
    statement += '''cat vcfstats.txt
    | cgat csv2db %(csv2db_options)s
    --allow-empty-file --add-index=track --table=vcf_stats
    >> %(outfile)s; '''
    P.run(statement)
def exportPeakLocations(infile, outfile):
    '''export peak locations.'''

    dbh = connect()

    outf = IOTools.open_file(outfile, "w")

    cc = dbh.cursor()
    table = P.to_table(infile)
    for x in cc.execute(
            """SELECT contig, peakcenter, peakcenter+1, interval_id, peakval
            FROM %(table)s """ % locals()):
        outf.write("\t".join(map(str, x)) + "\n")
    outf.close()
def loadCountReads(infiles, outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.

    Arguments
    ---------
    infiles : list
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")
    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.open_file(filename, "r").readlines()

        for line in lines:
            # strip the trailing newline so it is not written twice
            count = line.split("\t")[1].strip()
            outf.write("%s\t%s\n" % (track, count))

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)
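# The expected input format (inferred from the parsing above) is one or more
# tab-separated lines with the count in the second column, e.g.
# "nreads<TAB>12345678". A self-contained round trip for illustration:
def _nreads_roundtrip():
    import os
    import tempfile
    with tempfile.NamedTemporaryFile(
            "w", suffix=".nreads", delete=False) as tmp:
        tmp.write("nreads\t12345678\n")
        name = tmp.name
    with open(name) as inf:
        count = inf.readline().split("\t")[1].strip()
    os.unlink(name)
    assert count == "12345678"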
def loadMotifSequenceComposition(infile, outfile):
    '''compute sequence composition of sequences used for ab-initio
    search.'''

    load_statement = P.build_load_statement(
        P.to_table(outfile))

    statement = '''
    cgat fasta2table
        --section=na
        --log=%(outfile)s
    < %(infile)s
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
def loadMotifSequenceComposition(infile, outfile):
    '''compute sequence composition of sequences used for ab-initio
    search.'''

    tablename = P.to_table(outfile)

    statement = '''
    cgat fasta2table
        --section=na
        --log=%(outfile)s
    < %(infile)s
    | cgat csv2db
        %(csv2db_options)s
        --table=%(tablename)s
    > %(outfile)s'''

    P.run(statement)
def loadTomTom(infile, outfile):
    '''load tomtom results.'''

    tablename = P.to_table(outfile)

    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    # getiterator() was removed in Python 3.9; iter() is the replacement
    for motif in motifs.iter("motif"):
        name = motif.get("id")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.get_temp_file(".")

    # parse the text file
    for line in IOTools.open_file(infile):
        if line.startswith("#Query"):
            tmpfile.write('\t'.join(
                ("target_name", "query_id", "target_id",
                 "optimal_offset", "pvalue", "evalue", "qvalue",
                 "Overlap", "query_consensus",
                 "target_consensus", "orientation")) + "\n")
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)

    os.unlink(tmpfile.name)
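# Minimal self-contained illustration of the id -> alt mapping built above,
# with a made-up motif element (the attribute values are invented):
def _name2alt_sketch():
    import xml.etree.ElementTree as ET
    root = ET.fromstring(
        '<targets><motif id="m1" alt="MA0001.1"/></targets>')
    name2alt = {m.get("id"): m.get("alt") for m in root.iter("motif")}
    assert name2alt == {"m1": "MA0001.1"}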
def loadProteinStats(infile, outfile):
    '''compute and load protein sequence properties into database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
        Filename of ENSEMBL peptide file in :term:`fasta` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    statement = '''
    gunzip < %(infile)s
    | cgat fasta2fasta
        --method=filter
        --filter-method=min-length=1
    | awk 'match($0, /(>[a-zA-Z]+[0-9]+)(\.[0-9])*(.*)/, a) {print a[1], a[3]} !/^>/ {print}'
    | cgat fasta2table
        --log=%(outfile)s
        --sequence-type=aa
        --section=length
        --section=hid
        --section=aa
        --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
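# The gawk step above drops a trailing ".<version>" from FASTA identifiers
# such as ">ENSMUSP00000118372.1". An approximate Python equivalent for
# illustration only (the regex approximates the awk pattern):
def _strip_version(line):
    import re
    match = re.match(r"(>[A-Za-z]+\d+)(\.\d+)?(.*)", line)
    if match:
        return match.group(1) + match.group(3)
    return line

assert _strip_version(">ENSMUSP00000118372.1 pep") == ">ENSMUSP00000118372 pep"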
def loadGeneCoordinates(infile, outfile):
    '''merge transcripts to generate the genomic coordinates per gene
    and load.'''

    # TS. remove transcript_id column as this is now meaningless
    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=gene_id "
        "--ignore-column=transcript_id "
        "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf --method=merge-transcripts
    | cgat gtf2tsv
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
def loadTranscript2Gene(infile, outfile):
    '''build a map of transcript to gene from gtf file and load into
    database.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2tsv --output-map=transcript2gene -v 0
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement)
def loadIntervals(infile, outfile):
    '''load intervals from :term:`bed` formatted files into database.'''

    bedfile = infile

    track = Sample(filename=P.snip(infile, ".bed.gz"))
    bamfiles, offsets = getAssociatedBAMFiles(track)
    control = ""

    if bamfiles:
        E.info("%s: associated bamfiles = %s" % (track, bamfiles))
    else:
        E.info("%s: no bamfiles associated" % (track))

    assert (len(bamfiles) == 1)
    bamfile = bamfiles[0]
    offset = offsets[0]

    tablename = P.to_table(outfile)

    statement = '''zcat %(bedfile)s
    | awk '{printf("%%s\\t%%i\\t%%i\\t%%i\\n", $1,$2,$3,++a)}'
    | cgat bed2table
    --counter=peaks
    --bam-file=%(bamfile)s
    --offset=%(offset)i
    --bed-header=contig,start,end,interval_id
    %(control)s
    --output-all-fields
    --log=%(outfile)s
    | cgat csv2db %(csv2db_options)s
    --add-index=contig,start
    --add-index=interval_id
    --table=%(tablename)s
    --allow-empty-file
    > %(outfile)s'''

    P.run(statement)
def loadStrandSpecificity(infiles, outfile,
                          suffix="strand",
                          tablename=None):
    '''merge per-sample strand specificity tables and load into
    database.'''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")

    table_count = 0
    table_join = None

    for infile in infiles:
        name = P.snip(os.path.basename(infile), ".strand")
        table = pd.read_csv(infile, sep="\t", comment="#")
        table["track"] = name

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(
                table_join,
                on=["MSR", "ISR", "OSR", "ISF",
                    "MSF", "OSF", "SF", "SR", "track"],
                how="outer")

    table_join.to_csv(outf, sep="\t", index=False)
    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)
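# The outer merge on every shared column above behaves like row-wise
# concatenation of the per-sample tables. Toy illustration (column names
# are taken from the code; the values are invented):
def _strand_merge_sketch():
    import pandas as pd
    cols = ["MSR", "ISR", "OSR", "ISF", "MSF", "OSF", "SF", "SR", "track"]
    a = pd.DataFrame([[1, 2, 3, 4, 5, 6, 7, 8, "s1"]], columns=cols)
    b = pd.DataFrame([[8, 7, 6, 5, 4, 3, 2, 1, "s2"]], columns=cols)
    merged = a.merge(b, on=cols, how="outer")
    assert len(merged) == 2  # one row per track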
def loadmiRNATranscripts(infile, outfile):
    '''load transcripts from a GFF3 file into the database.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gff3` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--allow-empty-file "
        "--header-names=feature,Name")

    statement = '''
    export LANG=en_GB.UTF-8 &&
    zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement, job_memory=PARAMS["job_memory"])
def loadTranscripts(infile, outfile):
    '''load transcripts from a GTF file into the database.

    The table will be indexed on ``gene_id`` and ``transcript_id``.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2tsv
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement)
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same table.

    Add columns for the control data as well.
    '''

    tablename = P.to_table(outfile)

    tmpfile = P.get_temp_file(".")

    tmpfile.write(MAST.Match().header +
                  "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = IOTools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        # use a real file, as the MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            motif, part = re.match(
                r":: motif = (\S+) - (\S+) ::",
                lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError(
                "parsing error in line '%s'" % lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split background match id

        has three parts: track _ id _ pos

        track might contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs,
                                 match.length, match.start, match.end)

        for match in mast_fg.matches:

            # remove track and pos
            track, match.id = splitId(match.id, "fg")

            # move to genomic coordinates
            contig, start, end = re.match(
                r"(\S+):(\d+)..(\d+)", match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id
            if id not in controls:
                E.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (float(PARAMS["mast_evalue"]),
                                     1, 0, 0, 0, 0)
            if "r" not in controls[id]:
                controls[id]["r"] = (float(PARAMS["mast_evalue"]),
                                     1, 0, 0, 0, 0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(str(match) +
                          "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
                          (motif_fg,
                           contig,
                           "\t".join(map(str, controls[id]["l"])),
                           "\t".join(map(str, controls[id]["r"])),
                           str(min_evalue),
                           str(min_pvalue),
                           str(max_nmatches)) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
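# Sketch of the id convention handled by splitId above (an assumption drawn
# from its docstring: background ids are "track_id_pos" and foreground ids
# are "track_id", where the track itself may contain underscores):
def _split_id_sketch():
    bg = "liver_rep1_peak12_l"
    parts = bg.split("_")
    track, id_, pos = "_".join(parts[:-2]), parts[-2], parts[-1]
    assert (track, id_, pos) == ("liver_rep1", "peak12", "l")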
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
        for each bin of expression and fold-change.

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
            level of fdr and power.
        intervals_percent - percentage of intervals in observed data
            at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.
    '''

    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        IOTools.touch_file(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.get_temp_filename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run(statement)
    E.debug("outputting unspiked counts")

    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert xedges.all() == unspiked_xedges.all()

    tmpfile = IOTools.open_file(tmpfile_name, "w")
    tmpfile.write("\t".join(
        ("expression", "fold", "fdr", "counts", "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = IOTools.open_file(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to percentage of spike-ins per bin
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(map(
                str, (xedges[x], yedges[y],
                      fdr,
                      spiked_d2hist_fdr[x, y],
                      100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(map(
                str, (fdr, power,
                      power_counts.sum().sum(),
                      100.0 * power_counts.sum().sum() /
                      unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.to_table(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)
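# Sketch of the 2D binning used above: intervals are counted in bins of
# expression level versus fold change (the values below are invented):
def _histogram2d_sketch():
    import numpy
    expression = numpy.array([1.0, 2.5, 2.6, 4.0])
    fold = numpy.array([-1.0, 0.5, 0.6, 2.0])
    counts, xedges, yedges = numpy.histogram2d(
        expression, fold, bins=(2, 2))
    # counts[i, j] = number of intervals in expression bin i, fold bin j
    assert counts.sum() == 4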
def loadBAMStats(infiles, outfile):
    '''load output of :func:`buildBAMStats` into database.

    Arguments
    ---------
    infiles : list
        Input files, output from :func:`buildBAMStats`.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    header = ",".join([P.snip(os.path.basename(x), ".readstats")
                       for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.to_table(outfile)

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --allow-empty-file")

    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --ignore-empty
    %(filenames)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s"""
    P.run(statement)

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options="--allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """
        P.run(statement)

    # load mapping qualities, there are two columns per row
    # 'all_reads' and 'filtered_reads'
    # Here, only filtered_reads are used (--take=3)
    for suffix in ("mapq",):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options=" --allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        --take=3
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """
        P.run(statement)
def loadEnsemblTranscriptInformation(ensembl_gtf, geneset_gtf, outfile,
                                     csvdb, set_biotype=None,
                                     set_transcript_support=None):
    '''parse and annotate a geneset_gtf using the original Ensembl
    GTF attributes.

    The Ensembl GTF structure is not static, so this needs to maintain
    backwards compatibility. Attributes used downstream may only be
    present in later versions; these are set to default/missing values
    if they are not natively present. Specifically, gene_biotype is
    taken from the "feature" field if it is not present, and
    transcript_support is set to NA if missing.

    Arguments
    ---------
    ensembl_gtf: string
        PATH to ensembl gtf containing all annotation information
        and attributes
    geneset_gtf: string
        PATH to the geneset GTF to annotate with ensembl attributes
    outfile: string
        PATH to the output GTF, filtered, annotated and sorted by
        gene position
    csvdb: string
        PATH to the SQLite database to upload transcript information
        table
    set_biotype: string
        should the gene_ and transcript_biotype columns be set to a
        default value. If False, and not present, the default is to
        use the "feature" attribute
    set_transcript_support: int
        should the transcript_support_level be set to a default
        value; if not, it will be set to NA
    '''

    table = P.to_table(outfile)

    gtf_file = IOTools.open_file(geneset_gtf, "rb")
    gtf_iterator = GTF.transcript_iterator(GTF.iterator(gtf_file))

    ensembl_file = IOTools.open_file(ensembl_gtf, "rb")
    ensembl_iterator = GTF.transcript_iterator(GTF.iterator(ensembl_file))

    # parse the two gtfs, creating keys from the GTF entries
    parse_ensembl = {}
    for ens_gtf in ensembl_iterator:
        for ens_trans in ens_gtf:
            ens_att = ens_trans.asDict()
            ens_vals = dict(zip(ens_trans.keys(),
                                [ens_trans[x] for x in ens_trans.keys()]))
            ens_att.update(ens_vals)
            parse_ensembl[ens_trans.transcript_id] = ens_att
    ensembl_file.close()

    parse_gtf = {}
    for gtf in gtf_iterator:
        for trans in gtf:
            trans_atts = trans.asDict()
            trans_vals = dict(zip(trans.keys(),
                                  [trans[g] for g in trans.keys()]))
            trans_atts.update(trans_vals)
            parse_gtf[trans.transcript_id] = trans_atts
    gtf_file.close()

    # convert to dataframe for easier merging, annotating
    # and ultimately SQL database insertion
    # these are large dictionaries to parse, so might
    # be quite memory and compute heavy
    ensembl_df = pd.DataFrame(parse_ensembl).T
    gtf_df = pd.DataFrame(parse_gtf).T

    # check for presence of gene_biotype and
    # transcript_support_level
    merged_df = pd.merge(gtf_df, ensembl_df,
                         left_on=[cx for cx in gtf_df.columns],
                         right_on=[rx for rx in gtf_df.columns],
                         how='left')

    try:
        merged_df["transcript_support_level"]
        E.info("transcript_support_level is present")
    except KeyError:
        E.info("transcript_support_level is not present")
        if set_transcript_support:
            merged_df["transcript_support_level"] = set_transcript_support
        else:
            merged_df["transcript_support_level"] = "NA"

    try:
        merged_df["gene_biotype"]
        E.info("gene biotype is present")
        try:
            merged_df["transcript_biotype"]
            E.info("transcript biotype is present")
        except KeyError:
            E.info("transcript biotype is not present")
            if set_biotype:
                merged_df["transcript_biotype"] = set_biotype
            else:
                merged_df["transcript_biotype"] = "NA"
    except KeyError:
        E.info("gene biotype is not present")
        if set_biotype:
            merged_df["gene_biotype"] = set_biotype
            merged_df["transcript_biotype"] = set_biotype
        else:
            merged_df["gene_biotype"] = "NA"
            merged_df["transcript_biotype"] = "NA"

    # sort on gene then transcript id
    # remove exon_number and exon_id to maintain
    # compatibility with previous code
    try:
        merged_df.drop(["exon_id", "exon_number"], axis=1, inplace=True)
    except KeyError:
        try:
            merged_df.drop(["exon_id"], axis=1, inplace=True)
        except KeyError:
            try:
                merged_df.drop(["exon_number"], axis=1, inplace=True)
            except KeyError:
                pass

    # sort the output and load into the csvdb
    # add a multindex to use multiple SQL indices
    merged_df.sort_values(by=["gene_id", "transcript_id"], inplace=True)
    merged_df.set_index(
        ["gene_id", "gene_name", "protein_id", "transcript_id"],
        inplace=True, drop=True)

    merged_df.to_sql(name=table,
                     con=sqlite3.connect(csvdb),
                     if_exists='replace',
                     index_label=["gene_id", "gene_name",
                                  "protein_id", "transcript_id"])

    return 1
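# Example query against the table written by to_sql above (a sketch; the
# database file "csvdb" and the table name "transcript_info" are
# hypothetical names here):
def _query_transcript_info(csvdb="csvdb", table="transcript_info"):
    import sqlite3
    with sqlite3.connect(csvdb) as conn:
        return conn.execute(
            "SELECT gene_id, transcript_id, transcript_support_level "
            "FROM %s LIMIT 5" % table).fetchall()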
def loadPicardMetrics(infiles, outfile, suffix,
                      pipeline_suffix=".picard_stats",
                      tablename=None):
    '''load picard metrics.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")

    filenames = ["%s.%s" % (x, suffix) for x in infiles]

    first = True
    for filename in filenames:
        track = P.snip(os.path.basename(filename),
                       "%s.%s" % (pipeline_suffix, suffix))

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.open_file(filename, "r").readlines()

        # extract metrics part
        rx_start = re.compile("## METRICS CLASS")
        for n, line in enumerate(lines):
            if rx_start.search(line):
                lines = lines[n + 1:]
                break

        for n, line in enumerate(lines):
            if not line.strip():
                lines = lines[:n]
                break

        if len(lines) == 0:
            E.warn("no lines in %s: %s" % (track, filename))
            continue

        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            fields = lines[0][:-1].split("\t")
        else:
            f = lines[0][:-1].split("\t")
            if f != fields:
                raise ValueError(
                    "file %s has different fields: expected %s, got %s" %
                    (filename, fields, f))

        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()

    P.load(outf.name,
           outfile,
           tablename=tablename,
           options="--add-index=track --allow-empty-file")

    os.unlink(outf.name)