def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.

    One column header is derived per input file by stripping *suffix*
    from the filename and quoting the result as an SQL identifier.
    '''
    header = ",".join([P.quote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        # compressed inputs: expose decompressed first two columns via
        # bash process substitution so downstream tools see plain files
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(["<( cat %s | cut -f 1,2 )" % x for x in infiles])
    tablename = P.toTable(outfile)
    # NOTE: P.run() interpolates the %(...)s placeholders from this
    # frame's locals (header, filenames, tablename, outfile) plus
    # pipeline globals such as scriptsdir - do not rename these locals.
    statement = """python %(scriptsdir)s/combine_tables.py --headers=%(header)s --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --index=track --table=%(tablename)s > %(outfile)s """
    P.run()
def mergeAndLoad(infiles, outfile, suffix):
    """Load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise. Column headers are the
    input filenames with *suffix* removed, quoted as SQL identifiers.
    """
    # one quoted column header per input file
    header = ",".join(P.quote(P.snip(f, suffix)) for f in infiles)
    # pick the reader matching the compression and expose the first two
    # columns of every input through bash process substitution
    reader = "zcat" if suffix.endswith(".gz") else "cat"
    filenames = " ".join(
        "<( %s %s | cut -f 1,2 )" % (reader, f) for f in infiles)
    tablename = P.toTable(outfile)
    # P.run() fills the %(...)s placeholders from this frame's locals;
    # header/filenames/tablename/outfile must keep exactly these names.
    statement = """python %(scriptsdir)s/combine_tables.py --headers=%(header)s --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --index=track --table=%(tablename)s > %(outfile)s """
    P.run()
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets.

    For each ``.bed`` input file the column *field* is read from the
    corresponding ``<track>_intervals`` database table and indexed by
    genomic position.  For every interval in *reference* the maximum
    *field* value of overlapping intervals in each set is written to
    *outfile* (one column per track; empty when there is no overlap).
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed")
        tablename = "%s_intervals" % P.quote(track)
        # NOTE(review): field/table names are interpolated directly into
        # the SQL; acceptable only because both derive from
        # pipeline-controlled names, not user input.
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals(
        )
        cc = dbhandle.cursor()
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        # close each per-track cursor explicitly (previously left to GC)
        cc.close()
        idx.append(ix)
        tracks.append(track)
    # all rows have been fetched into the indexes - release the connection
    dbhandle.close()

    outs = IOTools.openFile(outfile, "w")
    try:
        outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")
        # close the reference file deterministically (was previously
        # opened inline and never closed)
        with open(reference, "r") as ref:
            for bed in Bed.iterator(infile=ref):
                row = []
                for ix in idx:
                    try:
                        intervals = list(
                            ix.get(bed.contig, bed.start, bed.end))
                    except KeyError:
                        # contig absent from this interval set
                        row.append("")
                        continue
                    if len(intervals) == 0:
                        peakval = ""
                    else:
                        # report the strongest overlapping interval
                        peakval = str(max(x[2] for x in intervals))
                    row.append(peakval)
                outs.write(str(bed) + "\t" + "\t".join(row) + "\n")
    finally:
        outs.close()
def loadBAMStats(infiles, outfile):
    """import bam statisticis.

    Loads the per-file summary table (transposed so that each track
    becomes a row) and then, for each of the ``nm`` and ``nh``
    suffixes, the corresponding per-file tables into separate
    database tables.
    """
    # one quoted column header per input file, ".readstats" stripped
    header = ",".join([P.quote(P.snip(x, ".readstats")) for x in infiles])
    # first two columns of each file, via bash process substitution
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)
    E.info("loading bam stats - summary")
    # NOTE: P.run() interpolates %(...)s placeholders from this frame's
    # locals (header, filenames, tablename, outfile) - do not rename.
    statement = """python %(scriptsdir)s/combine_tables.py --headers=%(header)s --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | perl -p -e "s/unique/unique_alignments/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --index=track --table=%(tablename)s > %(outfile)s """
    P.run()
    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        # the per-suffix data lives in sibling files "<infile>.<suffix>"
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
        # each suffix is loaded into its own table "<tablename>_<suffix>"
        tname = "%s_%s" % (tablename, suffix)
        statement = """python %(scriptsdir)s/combine_tables.py --header=%(header)s --skip-titles --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/%(suffix)s/" | python %(scriptsdir)s/csv2db.py --table=%(tname)s >> %(outfile)s """
        P.run()
def loadBAMStats(infiles, outfile):
    """import bam statisticis.

    First combines, transposes and loads the per-file summary tables;
    then appends one database table per extra suffix ("nm", "nh").
    """
    tablename = P.toTable(outfile)
    # one quoted header per input file with ".readstats" removed
    header = ",".join(P.quote(P.snip(f, ".readstats")) for f in infiles)
    # expose the first two columns of each file via process substitution
    filenames = " ".join("<( cut -f 1,2 < %s)" % f for f in infiles)

    E.info("loading bam stats - summary")
    # P.run() interpolates %(...)s placeholders from this frame's locals,
    # so header/filenames/tablename/outfile must keep these exact names.
    statement = """python %(scriptsdir)s/combine_tables.py --headers=%(header)s --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | perl -p -e "s/unique/unique_alignments/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --index=track --table=%(tablename)s > %(outfile)s """
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        # sibling files "<infile>.<suffix>" go into table "<table>_<suffix>"
        filenames = " ".join("%s.%s" % (f, suffix) for f in infiles)
        tname = "%s_%s" % (tablename, suffix)
        statement = """python %(scriptsdir)s/combine_tables.py --header=%(header)s --skip-titles --missing=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/%(suffix)s/" | python %(scriptsdir)s/csv2db.py --table=%(tname)s >> %(outfile)s """
        P.run()
def loadTranscriptomeValidation(infiles, outfile):
    """load transcriptome validation data into database."""
    # run the loading step on the cluster (pipeline framework reads this
    # local to decide job placement)
    to_cluster = USECLUSTER
    # one quoted column header per input, ".accepted.bam" stripped
    headers = ",".join([P.quote(P.snip(x, ".accepted.bam")) for x in infiles])
    # the validation data itself lives in the matching ".log" files;
    # infiles is rebound to the space-separated file list on purpose
    # because %(infiles)s is interpolated into the statement below
    infiles = " ".join(["%s.log" % x for x in infiles])
    tablename = P.toTable(outfile)
    # NOTE: P.run() interpolates %(...)s from this frame's locals
    # (headers, infiles, tablename, outfile) - do not rename them.
    statement = """ python %(scriptsdir)s/combine_tables.py --headers=%(headers)s %(infiles)s | python %(scriptsdir)s/table2table.py --transpose | perl -p -e "s/bin/track/" | python %(scriptsdir)s/csv2db.py --table=%(tablename)s > %(outfile)s """
    P.run()
def loadTranscriptomeValidation(infiles, outfile):
    '''load transcriptome validation data into database.'''
    # pipeline framework reads this local to decide job placement
    to_cluster = USECLUSTER
    tablename = P.toTable(outfile)
    # one quoted column header per track (".accepted.bam" removed)
    headers = ",".join(
        P.quote(P.snip(f, ".accepted.bam")) for f in infiles)
    # the actual inputs are the matching ".log" files; rebinding the
    # name `infiles` is deliberate - %(infiles)s is interpolated below
    infiles = " ".join("%s.log" % f for f in infiles)
    # P.run() fills %(...)s placeholders from this frame's locals, so
    # headers/infiles/tablename/outfile must keep these exact names.
    statement = ''' python %(scriptsdir)s/combine_tables.py --headers=%(headers)s %(infiles)s | python %(scriptsdir)s/table2table.py --transpose | perl -p -e "s/bin/track/" | python %(scriptsdir)s/csv2db.py --table=%(tablename)s > %(outfile)s '''
    P.run()
def writeSequencesForIntervals(track, filename, dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               masker=[],
                               proportion=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the database
    *dbhandle* and save to *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with the
    shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise only
    the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to create jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score' (peak
    score, descending order)

    If *shift* is set, intervals will be shifted. ``leftright`` creates
    two intervals on the left and right of the actual interval. The
    intervals will be centered around the mid-point and truncated the
    same way as the main intervals.
    '''
    # NOTE: the mutable default masker=[] is never mutated here, so the
    # shared-default pitfall does not apply; kept for interface
    # compatibility with existing callers.
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.quote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter FROM %(tablename)s ''' % locals() + orderby
    cc = dbhandle.cursor()
    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    # decide how many intervals to keep
    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)
    L.info("writeSequencesForIntervals %s: using at most %i sequences for pattern finding" % (
        track, cutoff))
    data = data[:cutoff]
    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))
    # (a second, redundant IndexedFasta construction was removed here)

    # modify the ranges
    if shift:
        if shift == "leftright":
            # build same-width flanking intervals on both sides
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter in data]
            new_data.extend([(contig, end, end + (end - start),
                              str(interval_id) + "_right", peakcenter)
                             for contig, start, end, interval_id, peakcenter in data])
            data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth,
                 peakcenter + halfwidth, interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        # apply offset and clip to the contig boundaries
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            # BUG FIX: this message previously formatted the builtin
            # ``id`` function (printing "<built-in function id>")
            # instead of the interval identifier.
            L.info("writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" % (
                track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info("writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" % (
                track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1
    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = IOTools.openFile(filename, "w")
    # apply each requested masker in turn; the loop variable is renamed
    # so it no longer clobbers the *masker* parameter
    for mask_method in masker:
        if mask_method not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, mask_method)
    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, interval_id, contig = d
        # fasta header: "<track>_<id> <contig>:<start>-<end>"
        header = "%s_%s %s:%i-%i" % (
            track, str(interval_id), contig, start, end)
        outs.write(">%s\n%s\n" % (header, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)
    return c.output
def writeSequencesForIntervals(track, filename, dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               masker=[],
                               proportion=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the database
    *dbhandle* and save to *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with the
    shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise only
    the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to create jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score' (peak
    score, descending order)

    If *shift* is set, intervals will be shifted. ``leftright`` creates
    two intervals on the left and right of the actual interval. The
    intervals will be centered around the mid-point and truncated the
    same way as the main intervals.
    '''
    # NOTE: the mutable default masker=[] is never mutated in this
    # function, so sharing it across calls is harmless; kept to preserve
    # the existing call interface.
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.quote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter FROM %(tablename)s ''' % locals() + orderby
    cc = dbhandle.cursor()
    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    # number of intervals to keep after sorting
    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)
    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences for pattern finding" % (track, cutoff))
    data = data[:cutoff]
    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))
    # (a second, redundant IndexedFasta construction was removed here)

    # modify the ranges
    if shift:
        if shift == "leftright":
            # same-width flanking intervals to the left and right
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter in data]
            new_data.extend([(contig, end, end + (end - start),
                              str(interval_id) + "_right", peakcenter)
                             for contig, start, end, interval_id, peakcenter in data])
            data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth,
                 peakcenter + halfwidth, interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        # apply offset and clip to contig boundaries
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            # BUG FIX: previously interpolated the builtin ``id``
            # function here instead of the interval identifier.
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" % (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" % (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1
    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = IOTools.openFile(filename, "w")
    # apply each requested masker; loop variable renamed so the
    # *masker* parameter is no longer clobbered
    for mask_method in masker:
        if mask_method not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, mask_method)
    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, interval_id, contig = d
        # fasta header: "<track>_<id> <contig>:<start>-<end>"
        header = "%s_%s %s:%i-%i" % (
            track, str(interval_id), contig, start, end)
        outs.write(">%s\n%s\n" % (header, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)
    return c.output
def writeSequencesForIntervals( track, filename, dbhandle, full = False, halfwidth = None, maxsize = None, proportion = None, masker = [], offset = 0, shuffled = False, min_sequences = None ):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the database
    *dbhandle* and save to *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with the
    shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise only
    the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions
    '''
    fasta = IndexedFasta.IndexedFasta( os.path.join( PARAMS["genome_dir"], PARAMS["genome"] ) )

    cc = dbhandle.cursor()
    # the sort criterion comes from the pipeline configuration, not a
    # function parameter (unlike the newer variant of this function)
    if PARAMS["score"] == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif PARAMS["score"] == "max":
        orderby = " ORDER BY score DESC"
    elif PARAMS["score"] == "min":
        orderby = " ORDER BY score ASC"
    else:
        raise ValueError("Unknown value passed as score parameter, check your ini file")

    tablename = "%s_intervals" % P.quote( track )
    # %(...)s placeholders are filled from locals(): tablename, halfwidth
    if full:
        # full intervals as stored in the table
        statement = '''SELECT start, end, interval_id, contig FROM %(tablename)s ''' % locals() + orderby
    elif halfwidth:
        # window of +/- halfwidth around the peak center
        statement = '''SELECT peakcenter - %(halfwidth)s, peakcenter + %(halfwidth)s, interval_id, contig FROM %(tablename)s ''' % locals() + orderby
    else:
        raise ValueError("either specify full or halfwidth" )
    cc.execute( statement )
    data = cc.fetchall()
    cc.close()

    # number of intervals to keep after sorting
    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max( cutoff, min_sequences )
    else:
        cutoff = len(data)
    L.info( "writeSequencesForIntervals %s: using at most %i sequences for pattern finding" % (track, cutoff) )
    L.info( "writeSequencesForIntervals %s: masker=%s" % (track,str(masker)))

    # NOTE(review): this re-creates the IndexedFasta opened above -
    # appears redundant; confirm IndexedFasta construction is side-effect
    # free before removing.
    fasta = IndexedFasta.IndexedFasta( os.path.join( PARAMS["genome_dir"], PARAMS["genome"]) )

    # get the sequences - stop once *maxsize* characters are collected
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    # NOTE: the loop variable ``id`` shadows the builtin of the same name
    for start, end, id, contig in data[:cutoff]:
        lcontig = fasta.getLength( contig )
        # apply offset and clip to the contig boundaries
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            L.info( "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" % (track, id, start, end, offset))
            continue
        seq = fasta.getSequence( contig, "+", start, end )
        sequences.append( seq )
        new_data.append( (start, end, id, contig) )
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info( "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" % (track, maxsize, nseq, len(data) - nseq ) )
            break
        nseq += 1
    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [ list(x) for x in sequences ]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences( ["".join(x) for x in sequences ], masker )

    c = E.Counter()
    outs = IOTools.openFile(filename, "w" )
    # NOTE(review): the loop variable clobbers the *masker* parameter;
    # harmless only because masker is not used after this loop
    for masker in masker:
        sequences = maskSequences( sequences, masker )
    for sequence, d in zip( sequences, data ):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, id, contig = d
        # fasta header: "<track>_<id> <contig>:<start>..<end>"
        id = "%s_%s %s:%i..%i" % (track, str(id), contig, start, end)
        outs.write( ">%s\n%s\n" % (id, sequence ) )
        c.output += 1
    outs.close()
    E.info("%s" % c )
    return c.output