def applyThreshold(infile, fasta, threshold, max_distance=0):
    '''apply threshold to a wig file writing a bed-formatted file as
    output.'''

    c = E.Counter()

    for contig, size in list(
            fasta.getContigSizes(with_synonyms=False).items()):
        c.contigs += 1

        E.debug("processing %s" % contig)

        last_start, last_end = -1, 0

        for start, end, value in block_iterator(infile, contig, size):
            d = start - last_end
            if (d > 0 or value < threshold):
                # gap or value below threshold: close any open interval
                if last_start >= 0:
                    yield contig, last_start, last_end
                    c.intervals += 1
                last_start = -1
            elif last_start < 0 and value >= threshold:
                # start a new interval
                last_start = start

            last_end = end

        # flush interval still open at the end of the contig
        if last_start >= 0:
            yield contig, last_start, end
            c.intervals += 1

        c.output += 1

    E.info(str(c))
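# Below is a minimal, self-contained sketch (not part of the original module;
# the helper name and the in-memory block list are illustrative only) that
# mirrors the merging logic of applyThreshold on plain (start, end, value)
# blocks, without needing an indexed fasta or a wig file.
def _threshold_blocks_sketch(blocks, threshold):
    """yield (start, end) intervals of adjacent blocks with value >= threshold."""
    last_start, last_end = -1, 0
    for start, end, value in blocks:
        gap = start - last_end
        if gap > 0 or value < threshold:
            if last_start >= 0:
                yield last_start, last_end
            last_start = -1
        elif last_start < 0 and value >= threshold:
            last_start = start
        last_end = end
    if last_start >= 0:
        yield last_start, last_end

# two above-threshold runs separated by a low-value block give two intervals
assert list(_threshold_blocks_sketch(
    [(0, 10, 5.0), (10, 20, 5.0), (20, 30, 1.0), (30, 40, 6.0)],
    2.0)) == [(0, 20), (30, 40)]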
def checkRequirementsFromAllModules():

    all_modules = sys.modules
    counter = E.Counter()
    results = []
    for module in list(sys.modules.keys()):
        if all_modules[module] is not None:
            results.extend(
                checkRequirementsFromModule(all_modules[module], counter))

    return counter, results
def ReadGene2GOFromFile(infile, synonyms={}, obsolete={}):
    """reads GO mappings for all go_types from a file.

    If *synonyms* is given, goids in *synonyms* will be translated.
    Terms in *obsolete* will be discarded.

    returns two maps: gene2go maps genes to go categories
    and go2info maps go categories to information.
    """

    gene2gos = {}
    go2infos = {}
    c = E.Counter()

    for line in infile:
        if line[0] == "#":
            continue
        try:
            go_type, gene_id, goid, description, evidence = \
                line[:-1].split("\t")
        except ValueError as msg:
            raise ValueError("parsing error in line '%s': %s" %
                             (line[:-1], msg))

        if go_type == "go_type":
            continue

        c.input += 1

        if goid in synonyms:
            c.synonyms += 1
            goid = synonyms[goid]

        if goid in obsolete:
            c.obsolete += 1
            continue

        gm = GOMatch(goid, go_type, description, evidence)
        gi = GOInfo(goid, go_type, description)
        if go_type not in gene2gos:
            gene2gos[go_type] = {}
            go2infos[go_type] = {}

        gene2go = gene2gos[go_type]
        go2info = go2infos[go_type]

        if gene_id not in gene2go:
            gene2go[gene_id] = []

        gene2go[gene_id].append(gm)
        go2info[goid] = gi
        c.output += 1

    E.debug("read gene2go assignments: %s" % str(c))

    return gene2gos, go2infos
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("script", "module"),
                      help="type of tests to create [%default].")

    parser.set_defaults(method="script")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 0:
        raise ValueError(
            "setup_test.py requires one or more command line arguments")

    targetdir = os.path.dirname(__file__)

    counter = E.Counter()

    for arg in args:
        counter.input += 1
        script_dirname, basename = os.path.split(arg)

        dirname = os.path.join(targetdir, basename)

        if os.path.exists(dirname):
            E.warn("%s already exists - skipping" % basename)
            counter.skipped += 1
            continue

        os.mkdir(dirname)

        with open(os.path.join(dirname, "tests.yaml"), "w") as outf:
            outf.write(YAML_TEMPLATE)

        counter.created += 1

    E.info("%s" % str(counter))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-fastq-file", dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option("-m", "--method", dest="methods", action="append",
                      type="choice",
                      choices=("length", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (options, args) = E.start(parser, argv)

    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if options.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(options.input_fastq_file) as inf:
        for read in inf:
            counter.input += 1
            options.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")
            counter.output += 1

    E.info(counter)
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n", "--dry-run", dest="dry_run", action="store_true",
                      help="dry run, do not delete any files [%default]")

    parser.set_defaults(dry_run=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filenames = args

    c = E.Counter()
    for filename in filenames:
        c.checked += 1
        if os.path.exists(filename + ".log"):
            if IOTools.isComplete(filename + ".log"):
                c.complete += 1
                continue

        if IOTools.isComplete(filename):
            c.complete += 1
            continue

        c.incomplete += 1
        E.info('deleting %s' % filename)
        if options.dry_run:
            continue
        os.unlink(filename)
        c.deleted += 1

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.
    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks',
              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
              'st_mode', 'st_mtime', 'st_nlink', 'st_rdev',
              'st_size', 'st_uid')

    dry_run = PARAMS.get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = IOTools.open_file(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = IOTools.open_file(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = IOTools.zap_file(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write("%s\t%s\t%s\t%s\n" % (
                    fn,
                    time.asctime(time.localtime(time.time())),
                    linkdest,
                    "\t".join([str(getattr(stat, x)) for x in fields])))

    get_logger().info("zapped: %s" % (c))

    # the logfile is only opened outside of a dry run
    if not dry_run:
        outfile.close()

    return c
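# For reference, a minimal stdlib-only sketch of what "zapping" a file means
# here: record its stat fields, then truncate it to zero bytes. This is an
# illustration only and is not the IOTools.zap_file implementation used above
# (which additionally handles symbolic links).
import os

def _zap_file_sketch(path):
    """return the original os.stat_result of *path* after truncating it to size 0."""
    before = os.stat(path)
    os.truncate(path, 0)
    return before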
def pair_iterator(test_vcf, truth_vcf, contig):

    counter = E.Counter()

    test_iter = test_vcf.fetch(contig)
    truth_iter = truth_vcf.fetch(contig)

    try:
        # the initial next() calls sit inside the try block so that an
        # empty contig simply terminates the generator
        test_record = next(test_iter)
        truth_record = next(truth_iter)
        while 1:
            if test_record.pos < truth_record.pos:
                test_record = next(test_iter)
                continue
            elif test_record.pos > truth_record.pos:
                truth_record = next(truth_iter)
                continue
            elif len(test_record.alts) > 1:
                counter.skip_test_truth += 1
                test_record = next(test_iter)
                continue
            elif len(truth_record.alts) > 1:
                counter.skip_multiallelic_truth += 1
                truth_record = next(truth_iter)
                continue
            elif test_record.alts != truth_record.alts:
                counter.skip_genotype_difference += 1
                test_record = next(test_iter)
                truth_record = next(truth_iter)
                continue

            if test_record.ref != truth_record.ref:
                # todo: deal with indels
                raise ValueError(
                    "mismatching reference bases at position "
                    "{}:{}".format(test_record.chrom, test_record.pos))

            yield test_record, truth_record
            test_record = next(test_iter)
            truth_record = next(truth_iter)

    except StopIteration:
        pass

    E.debug(str(counter))
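# Hedged usage sketch for pair_iterator (not from the original script): count
# how many biallelic sites two indexed VCFs agree on for a single contig.
# The file and contig names passed to the helper are placeholders chosen by
# the caller.
def _count_shared_sites_sketch(test_path, truth_path, contig):
    """return the number of record pairs yielded by pair_iterator on *contig*."""
    test_vcf = pysam.VariantFile(test_path)
    truth_vcf = pysam.VariantFile(truth_path)
    try:
        return sum(1 for _ in pair_iterator(test_vcf, truth_vcf, contig))
    finally:
        test_vcf.close()
        truth_vcf.close()

# e.g. _count_shared_sites_sketch("test.vcf.gz", "truth.vcf.gz", "chr1")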
def read_and_randomize_rows(infile, options):
    """read table from stdin and randomize rows, keeping header."""

    c = E.Counter()
    if options.has_headers:
        keep_header = 1
    else:
        keep_header = 0

    for x in range(keep_header):
        c.header += 1
        options.stdout.write(infile.readline())

    lines = infile.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    options.stdout.write("".join(lines))
    c.lines_output = len(lines)

    E.info(c)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k", "--keep-header", dest="keep_header", type="int",
                      help="randomize, but keep header in place [%default]")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    inf = options.stdin
    outf = options.stdout
    c = E.Counter()
    for x in range(options.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    for line in lines:
        outf.write(line)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.stop()
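# The two shuffling helpers above are intentionally non-deterministic. A
# stdlib-only sketch of a reproducible variant (the fixed seed and helper
# name are assumptions for illustration, not part of the original scripts)
# that keeps the first *keep_header* lines in place:
import random

def _shuffle_lines_sketch(lines, keep_header=1, seed=42):
    """return header lines unchanged, followed by the remaining lines shuffled."""
    header, body = list(lines[:keep_header]), list(lines[keep_header:])
    random.Random(seed).shuffle(body)
    return header + body

assert _shuffle_lines_sketch(["h\n", "a\n", "b\n", "c\n"], keep_header=1)[0] == "h\n"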
def read_vcf_positions_into_dataframe(filename, filters=None):

    vcf_in = pysam.VariantFile(filename)

    if filters is None:
        filters = []

    pass_filter = False
    snp_filter = False
    for f in filters:
        if f == "PASS":
            pass_filter = True
        elif f == "SNP":
            snp_filter = True

    records = []
    c = E.Counter()
    for record in vcf_in:
        c.input += 1
        f = record.filter.keys()
        if pass_filter and "PASS" not in f and "." not in f:
            c.removed_pass_filter += 1
            continue
        if snp_filter:
            is_snp = (len(record.ref) == 1 and
                      len(record.alts) == 1 and
                      len(record.alts[0]) == 1)
            if not is_snp:
                c.removed_snp_filter += 1
                continue
        c.output += 1
        records.append((record.chrom, record.pos))

    df = pandas.DataFrame.from_records(records, columns=["chrom", "pos"])
    E.info("{}: {}".format(filename, c))
    return df
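# Possible downstream use, sketched (not from the original script): load the
# positions of two call sets and count shared sites with a pandas merge. The
# file names and the helper name are placeholders.
def _shared_positions_sketch(test_vcf, truth_vcf):
    """return the number of (chrom, pos) pairs present in both VCF files."""
    df_test = read_vcf_positions_into_dataframe(test_vcf, filters=["PASS", "SNP"])
    df_truth = read_vcf_positions_into_dataframe(truth_vcf, filters=["PASS", "SNP"])
    shared = df_test.merge(df_truth, on=["chrom", "pos"], how="inner")
    return len(shared)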
def buildMRBed(infile, outfile):
    '''output bed6 file with methylated regions.

    All regions are output, even the insignificant ones.

    The score is the log fold change.
    '''

    outf = IOTools.openFile(outfile, "w")
    c = E.Counter()
    for row in csv.DictReader(IOTools.openFile(infile),
                              dialect="excel-tab"):
        c.input += 1

        contig, start, end = re.match(
            r"(.*):(\d+)-(\d+)", row["interval_id"]).groups()
        c.output += 1
        outf.write("\t".join((contig, start, end,
                              str(c.input), row["lfold"])) + "\n")

    outf.close()
    E.info("%s" % str(c))
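# The interval_id parsing above expects identifiers of the form
# "contig:start-end". A quick self-contained check of that regular expression:
import re

assert re.match(r"(.*):(\d+)-(\d+)", "chr1:1000-2000").groups() == \
    ("chr1", "1000", "2000")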
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="bioconductor database to use [default=%default].")

    # the mapping is stored separately from the database name
    parser.add_option("-m", "--mapping", dest="mapping", type="string",
                      help="bioconductor mapping to use [default=%default].")

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with the gene set in gtf format [default=%default].")

    parser.set_defaults(
        database="mouse4302.db",
        mapping="ENSEMBL",
        filename_gtf=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    prefix = options.database[:-len(".db")]

    mapping_probeset2gene = prefix + options.mapping
    mapping_probeset2loc = prefix + "CHRLOC"

    probeset2gene = getProbeset2Gene(database=options.database)

    probeset2location = getProbeset2Location(database=options.database)

    # gtf = GTF.readAndIndex(
    #     GTF.iterator(IOTools.open_file(options.filename_gtf)))

    counts = E.Counter()

    outfile_notfound = IOTools.open_file("notfound.table", "w")

    options.stdout.write("probeset_id\tgene_id\tngenes\n")

    for probeset, locations in probeset2location.items():
        counts.probesets += 1
        gene_ids = probeset2gene[probeset]
        if len(gene_ids) == 0:
            counts.notfound += 1
            continue

        for gene_id in gene_ids:
            options.stdout.write(
                "%s\t%s\t%i\n" % (probeset, gene_id, len(gene_ids)))
            counts.output += 1

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.stop()
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE): """annotate a genome given by the indexed *fasta* file and an iterator over gtf annotations. """ annotations = {} contig_sizes = fasta.getContigSizes(with_synonyms=False) E.info("allocating memory for %i contigs and %i bytes" % (len(contig_sizes), sum(contig_sizes.values()) * array.array("B").itemsize)) # AString.AString( "a").itemsize )) for contig, size in list(contig_sizes.items()): E.debug("allocating %s: %i bases" % (contig, size)) # annotations[contig] = AString.AString( default_code * size ) # annotations[contig] = array.array("", default_code * size) # Go to list for py3 compatibility, patch annotations[contig] = [default_code] * size E.info("allocated memory for %i contigs" % len(fasta)) counter = E.Counter() # output splice junctions outfile_junctions = E.openOutputFile("junctions") outfile_junctions.write( "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n") for gtfs in iterator: counter.input += 1 if counter.input % options.report_step == 0: E.info("iteration %i" % counter.input) try: contig = fasta.getToken(gtfs[0].contig) except KeyError as msg: E.warn("contig %s not found - annotation ignored" % gtfs[0].contig) counter.skipped_contig += 1 continue lcontig = fasta.getLength(contig) # make sure that exons are sorted by coordinate gtfs.sort(key=lambda x: x.start) is_positive = Genomics.IsPositiveStrand(gtfs[0].strand) source = gtfs[0].source # process non-coding data if source in MAP_ENSEMBL: code = MAP_ENSEMBL[source] intervals = [(x.start, x.end) for x in gtfs] addSegments(annotations[contig], intervals, is_positive, code) elif source == "protein_coding": # collect exons for utr exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"] cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"] if len(cds) == 0: counter.skipped_transcripts += 1 E.warn("protein-coding transcript %s without CDS - skipped" % gtfs[0].transcript_id) continue exons = Intervals.truncate(exons, cds) start, end = cds[0][0], cds[-1][1] UTR5 = [x for x in exons if x[1] < start] UTR3 = [x for x in exons if x[0] >= end] if not is_positive: UTR5, UTR3 = UTR3, UTR5 splice_code = "S" else: splice_code = "s" addSegments(annotations[contig], UTR5, is_positive, "u") addIntrons(annotations[contig], UTR5, is_positive, options.max_frameshift_length) addSegments(annotations[contig], UTR3, is_positive, "v") addIntrons(annotations[contig], UTR3, is_positive, options.max_frameshift_length) # output CDS according to frame addCDS(annotations[contig], [x for x in gtfs if x.feature == "CDS"], is_positive) # add introns between CDS addIntrons(annotations[contig], cds, is_positive, options.max_frameshift_length) # output splice junctions cds = [x for x in gtfs if x.feature == "CDS"] # apply corrections for 1-past end coordinates # to point between residues within CDS if is_positive: ender = lambda x: x.end - 1 starter = lambda x: x.start out_positive = "+" else: ender = lambda x: lcontig - x.start - 1 starter = lambda x: lcontig - x.end out_positive = "-" cds.reverse() end = ender(cds[0]) for c in cds[1:]: start = starter(c) outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % ( contig, out_positive, end, start, c.frame, c.gene_id, c.transcript_id, )) end = ender(c) E.info("finished reading genes: %s" % str(counter)) outfile_junctions.close() E.info("started counting") outfile = E.openOutputFile("counts") outputCounts(outfile, annotations) outfile.close() E.info("started output") for k in sorted(annotations.keys()): # 
options.stdout.write(">%s\n%s\n" % (k, annotations[k].tostring())) options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
def loadMACS(infile, outfile, bamfile, tablename=None): '''load MACS results in *tablename* This method loads only positive peaks. It filters peaks by p-value, q-value and fold change and loads the diagnostic data and re-calculates peakcenter, peakval, ... using the supplied bamfile. If *tablename* is not given, it will be :file:`<track>_intervals` where track is derived from ``infile`` and assumed to end in :file:`.macs`. This method creates two optional additional files: * if the file :file:`<track>_diag.xls` is present, load MACS diagnostic data into the table :file:`<track>_macsdiag`. * if the file :file:`<track>_model.r` is present, call R to create a MACS peak-shift plot and save it as :file:`<track>_model.pdf` in the :file:`export/MACS` directory. This method creates :file:`<outfile>.tsv.gz` with the results of the filtering. ''' track = P.snip(os.path.basename(infile), ".macs") folder = os.path.dirname(infile) if len(folder) > 0: infilename = folder + "/" + track + "_peaks.xls" filename_diag = folder + "/" + track + "_diag.xls" filename_r = folder + "/" + track + "_model.r" filename_rlog = folder + "/" + track + ".r.log" filename_pdf = track + "_model.pdf" else: infilename = track + "_peaks.xls" filename_diag = track + "_diag.xls" filename_r = track + "_model.r" filename_rlog = track + ".r.log" filename_pdf = track + "_model.pdf" if not os.path.exists(infilename): E.warn("could not find %s" % infilename) P.touch(outfile) return # create plot by calling R if os.path.exists(filename_r): if len(folder) > 0: statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; mv %(filename_pdf)s %(folder)s/%(filename_pdf)s; ''' else: statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; ''' P.run() # filter peaks shift = getPeakShiftFromMacs(infile) assert shift is not None, "could not determine peak shift from MACS file %s" % infile E.info("%s: found peak shift of %i" % (track, shift)) samfiles = [pysam.Samfile(bamfile, "rb")] offsets = [shift / 2] outtemp = P.getTempFile(".") tmpfilename = outtemp.name outtemp.write("\t".join(( "interval_id", "contig", "start", "end", "npeaks", "peakcenter", "length", "avgval", "peakval", "nprobes", "pvalue", "fold", "qvalue", "macs_summit", "macs_nprobes", )) + "\n") id = 0 # get thresholds max_qvalue = float(PARAMS["macs_max_qvalue"]) # min, as it is -10log10 min_pvalue = float(PARAMS["macs_min_pvalue"]) counter = E.Counter() with IOTools.openFile(infilename, "r") as ins: for peak in WrapperMACS.iteratePeaks(ins): if peak.fdr > max_qvalue: counter.removed_qvalue += 1 continue elif peak.pvalue < min_pvalue: counter.removed_pvalue += 1 continue assert peak.start < peak.end npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks( peak.contig, peak.start, peak.end, samfiles, offsets) outtemp.write("\t".join(map(str, ( id, peak.contig, peak.start, peak.end, npeaks, peakcenter, length, avgval, peakval, nreads, peak.pvalue, peak.fold, peak.fdr, peak.start + peak.summit - 1, peak.tags))) + "\n") id += 1 counter.output += 1 outtemp.close() # output filtering summary outf = IOTools.openFile("%s.tsv.gz" % outfile, "w") outf.write("category\tcounts\n") outf.write("%s\n" % counter.asTable()) outf.close() E.info("%s filtering: %s" % (track, str(counter))) if counter.output == 0: E.warn("%s: no peaks found" % track) # load data into table if tablename is None: tablename = "%s_macs_intervals" % track statement = '''cgat csv2db %(csv2db_options)s --allow-empty-file --add-index=interval_id --add-index=contig,start --table=%(tablename)s < 
%(tmpfilename)s > %(outfile)s ''' P.run() os.unlink(tmpfilename) # load diagnostic data if os.path.exists(filename_diag): tablename = "%s_macsdiag" % track statement = ''' cat %(filename_diag)s | sed "s/FC range.*/fc\\tnpeaks\\tp90\\tp80\\tp70\\tp60\\tp50\\tp40\\tp30\\tp20/" | cgat csv2db %(csv2db_options)s --map=fc:str --table=%(tablename)s >> %(outfile)s ''' P.run()
def loadIntervalsFromBed(bedfile, track, outfile,
                         bamfiles, offsets):
    '''load intervals from :term:`bed` formatted files into database.

    Re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned. In particular, the meaning of the
    columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval
    '''

    tmpfile = P.getTempFile(".")

    headers = ("AvgVal", "DisttoStart", "GeneList", "Length",
               "PeakCenter", "PeakVal", "Position", "interval_id",
               "nCpGs", "nGenes", "nPeaks", "nProbes", "nPromoters",
               "contig", "start", "end")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist, length,
     peakcenter, peakval, position, start, interval_id,
     ncpgs, ngenes, npeaks, nprobes, npromoters) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    mlength = int(PARAMS["calling_merge_min_interval_length"])

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(IOTools.openFile(bedfile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        # remove very short intervals
        if bed.end - bed.start < mlength:
            c.skipped_length += 1
            continue

        if replicates:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelineChipseq.countPeaks(
                    bed.contig, bed.start, bed.end, samfiles, offsets)

            # nreads can be 0 if the intervals overlap only slightly
            # and due to the binning, no reads are actually in the
            # overlap region. However, most of these intervals should
            # be small and have already be deleted via the
            # merge_min_interval_length cutoff. do not output
            # intervals without reads.
            if nprobes == 0:
                c.skipped_reads += 1

        else:
            npeaks, peakcenter, length, avgval, peakval, nprobes = (
                1,
                bed.start + (bed.end - bed.start) // 2,
                bed.end - bed.start,
                1,
                1,
                1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             ncpgs, ngenes, npeaks, nprobes, npromoters,
             bed.contig, bed.start, bed.end))) + "\n")

    if c.output == 0:
        E.warn("%s - no intervals" % track)

    tmpfile.close()

    tmpfilename = tmpfile.name
    tablename = "%s_intervals" % track.asTable()

    statement = '''
    cgat csv2db %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
def loadZinba(infile, outfile, bamfile, tablename=None, controlfile=None): '''load Zinba results in *tablename* This method loads only positive peaks. It filters peaks by p-value, q-value and fold change and loads the diagnostic data and re-calculates peakcenter, peakval, ... using the supplied bamfile. If *tablename* is not given, it will be :file:`<track>_intervals` where track is derived from ``infile`` and assumed to end in :file:`.zinba`. If no peaks were predicted, an empty table is created. This method creates :file:`<outfile>.tsv.gz` with the results of the filtering. This method uses the refined peak locations. Zinba peaks can be overlapping. This method does not merge overlapping intervals. Zinba calls peaks in regions where there are many reads inside the control. Thus this method applies a filtering step removing all intervals in which there is a peak of more than readlength / 2 height in the control. .. note: Zinba calls peaks that are overlapping. ''' track = P.snip(os.path.basename(infile), ".zinba") folder = os.path.dirname(infile) infilename = infile + ".peaks" outtemp = P.getTempFile(".") tmpfilename = outtemp.name outtemp.write("\t".join(( "interval_id", "contig", "start", "end", "npeaks", "peakcenter", "length", "avgval", "peakval", "nprobes", "pvalue", "fold", "qvalue", "macs_summit", "macs_nprobes", )) + "\n") counter = E.Counter() if not os.path.exists(infilename): E.warn("could not find %s" % infilename) elif IOTools.isEmpty(infilename): E.warn("no data in %s" % infilename) else: # filter peaks shift = getPeakShiftFromZinba(infile) assert shift is not None, \ "could not determine peak shift from Zinba file %s" % infile E.info("%s: found peak shift of %i" % (track, shift)) samfiles = [pysam.Samfile(bamfile, "rb")] offsets = [shift / 2] if controlfile: controlfiles = [pysam.Samfile(controlfile, "rb")] readlength = BamTools.estimateTagSize(controlfile) control_max_peakval = readlength // 2 E.info("removing intervals in which control has peak higher than %i reads" % control_max_peakval) else: controlfiles = None id = 0 # get thresholds max_qvalue = float(PARAMS["zinba_fdr_threshold"]) with IOTools.openFile(infilename, "r") as ins: for peak in WrapperZinba.iteratePeaks(ins): # filter by qvalue if peak.fdr > max_qvalue: counter.removed_qvalue += 1 continue assert peak.refined_start < peak.refined_end # filter by control if controlfiles: npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig, peak.refined_start, peak.refined_end, controlfiles, offsets) if peakval > control_max_peakval: counter.removed_control += 1 continue # output peak npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig, peak.refined_start, peak.refined_end, samfiles, offsets) outtemp.write("\t".join(map(str, ( id, peak.contig, peak.refined_start, peak.refined_end, npeaks, peakcenter, length, avgval, peakval, nreads, 1.0 - peak.posterior, 1.0, peak.fdr, peak.refined_start + peak.summit - 1, peak.height))) + "\n") id += 1 counter.output += 1 outtemp.close() # output filtering summary outf = IOTools.openFile("%s.tsv.gz" % outfile, "w") outf.write("category\tcounts\n") outf.write("%s\n" % counter.asTable()) outf.close() E.info("%s filtering: %s" % (track, str(counter))) if counter.output == 0: E.warn("%s: no peaks found" % track) # load data into table if tablename is None: tablename = "%s_intervals" % track statement = ''' cgat csv2db %(csv2db_options)s --allow-empty-file --add-index=interval_id --add-index=contig,start --table=%(tablename)s < %(tmpfilename)s > 
%(outfile)s ''' P.run() os.unlink(tmpfilename)
def aggregateWindowsTagCounts(infiles,
                              outfile,
                              regex=r"(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    This method autodetects the number of columns in the :term:`infiles`
    and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
    outfile : string
        Output filename in :term:`tsv` format.
    regex : string
        Regular expression used to extract the track name from the
        filename. The default removes any suffix.
    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join([
        """<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" %
        (x, column) for x in infiles])

    tmpfile = P.get_temp_filename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run(statement)

    # build track names
    tracks = [re.search(regex, os.path.basename(x)).groups()[0]
              for x in infiles]

    outf = IOTools.open_file(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    # filter for uniqueness - keys with the same value as the
    # previous line will be ignored.
    last_gene = None
    c = E.Counter()
    for line in open(tmpfile, "r"):
        c.input += 1
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]

        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        if genes[0] == last_gene:
            c.duplicates += 1
            continue
        c.output += 1
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))
        last_gene = genes[0]

    outf.close()

    os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)
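# The awk column arithmetic above picks the overlap-count column that
# ``bedtools coverage`` appends after the original BED columns, so the input
# files already carry four extra columns. A small sketch of the same rule
# (the helper name is illustrative only):
def _coverage_count_column_sketch(output_columns):
    """1-based column of the overlap count in a ``bedtools coverage`` output file."""
    return output_columns - 4 + 1

# bed4, bed6 and bed12 inputs give 8-, 10- and 16-column coverage outputs
assert [_coverage_count_column_sketch(n) for n in (8, 10, 16)] == [5, 7, 13]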
def main(argv=None): """script main. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-o", "--output-format", dest="output_format", type="choice", choices=("bedgraph", "wiggle", "bigbed", "bigwig", "bed"), help="output format [default=%default]") parser.add_option("-s", "--shift-size", dest="shift", type="int", help="shift reads by a certain amount (ChIP-Seq) " "[%default]") parser.add_option("-e", "--extend", dest="extend", type="int", help="extend reads by a certain amount " "(ChIP-Seq) [%default]") parser.add_option("-p", "--wiggle-span", dest="span", type="int", help="span of a window in wiggle tracks " "[%default]") parser.add_option("-m", "--merge-pairs", dest="merge_pairs", action="store_true", help="merge paired-ended reads into a single " "bed interval [default=%default].") parser.add_option("--scale-base", dest="scale_base", type="float", help="number of reads/pairs to scale bigwig file to. " "The default is to scale to 1M reads " "[default=%default]") parser.add_option("--scale-method", dest="scale_method", type="choice", choices=( "none", "reads", ), help="scale bigwig output. 'reads' will normalize by " "the total number reads in the bam file that are used " "to construct the bigwig file. If --merge-pairs is used " "the number of pairs output will be used for " "normalization. 'none' will not scale the bigwig file" "[default=%default]") parser.add_option("--max-insert-size", dest="max_insert_size", type="int", help="only merge if insert size less that " "# bases. 0 turns of this filter " "[default=%default].") parser.add_option("--min-insert-size", dest="min_insert_size", type="int", help="only merge paired-end reads if they are " "at least # bases apart. " "0 turns of this filter. [default=%default]") parser.set_defaults( samfile=None, output_format="wiggle", shift=0, extend=0, span=1, merge_pairs=None, min_insert_size=0, max_insert_size=0, scale_method='none', scale_base=1000000, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if len(args) >= 1: options.samfile = args[0] if len(args) == 2: options.output_filename_pattern = args[1] if not options.samfile: raise ValueError("please provide a bam file") # Read BAM file using Pysam samfile = pysam.AlignmentFile(options.samfile, "rb") # Create temporary files / folders tmpdir = tempfile.mkdtemp() E.debug("temporary files are in %s" % tmpdir) tmpfile_wig = os.path.join(tmpdir, "wig") tmpfile_sizes = os.path.join(tmpdir, "sizes") # Create dictionary of contig sizes contig_sizes = dict(list(zip(samfile.references, samfile.lengths))) # write contig sizes outfile_size = IOTools.open_file(tmpfile_sizes, "w") for contig, size in sorted(contig_sizes.items()): outfile_size.write("%s\t%s\n" % (contig, size)) outfile_size.close() # Shift and extend only available for bigwig format if options.shift or options.extend: if options.output_format != "bigwig": raise ValueError( "shift and extend only available for bigwig output") # Output filename required for bigwig / bigbed computation if options.output_format == "bigwig": if not options.output_filename_pattern: raise ValueError( "please specify an output file for bigwig computation.") # Define executable to use for binary conversion if options.output_format == "bigwig": executable_name = "wigToBigWig" else: raise ValueError("unknown output format `%s`" % options.output_format) # check required executable file is in the path executable = IOTools.which(executable_name) if not executable: raise OSError("could not find %s in path." % executable_name) # Open outout file outfile = IOTools.open_file(tmpfile_wig, "w") E.info("starting output to %s" % tmpfile_wig) else: outfile = IOTools.open_file(tmpfile_wig, "w") E.info("starting output to stdout") # Set up output write functions if options.output_format in ("wiggle", "bigwig"): # wiggle is one-based, so add 1, also step-size is 1, so need # to output all bases if options.span == 1: outf = lambda outfile, contig, start, end, val: \ outfile.write( "".join(["%i\t%i\n" % (x, val) for x in range(start + 1, end + 1)])) else: outf = SpanWriter(options.span) elif options.output_format == "bedgraph": # bed is 0-based, open-closed outf = lambda outfile, contig, start, end, val: \ outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val)) # initialise counters ninput, nskipped, ncontigs = 0, 0, 0 # set output file name output_filename_pattern = options.output_filename_pattern if output_filename_pattern: output_filename = os.path.abspath(output_filename_pattern) # shift and extend or merge pairs. Output temporay bed file if options.shift > 0 or options.extend > 0 or options.merge_pairs: # Workflow 1: convert to bed intervals and use bedtools # genomecov to build a coverage file. 
# Convert to bigwig with UCSC tools bedGraph2BigWig if options.merge_pairs: # merge pairs using bam2bed E.info("merging pairs to temporary file") counter = _bam2bed.merge_pairs( samfile, outfile, min_insert_size=options.min_insert_size, max_insert_size=options.max_insert_size, bed_format=3) E.info("merging results: {}".format(counter)) if counter.output == 0: raise ValueError("no pairs output after merging") else: # create bed file with shifted/extended tags shift, extend = options.shift, options.extend shift_extend = shift + extend counter = E.Counter() for contig in samfile.references: E.debug("output for %s" % contig) lcontig = contig_sizes[contig] for read in samfile.fetch(contig): pos = read.pos if read.is_reverse: start = max(0, read.pos + read.alen - shift_extend) else: start = max(0, read.pos + shift) # intervals extending beyond contig are removed if start >= lcontig: continue end = min(lcontig, start + extend) outfile.write("%s\t%i\t%i\n" % (contig, start, end)) counter.output += 1 outfile.close() if options.scale_method == "reads": scale_factor = float(options.scale_base) / counter.output E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" % (options.scale_method, counter.output, scale_factor)) scale = "-scale %f" % scale_factor else: scale = "" # Convert bed file to coverage file (bedgraph) tmpfile_bed = os.path.join(tmpdir, "bed") E.info("computing coverage") # calculate coverage - format is bedgraph statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals() E.run(statement) # Convert bedgraph to bigwig E.info("converting to bigwig") tmpfile_sorted = os.path.join(tmpdir, "sorted") statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;" "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s " "%(output_filename_pattern)s" % locals()) E.run(statement) else: # Workflow 2: use pysam column iterator to build a # wig file. Then convert to bigwig of bedgraph file # with UCSC tools. def column_iter(iterator): start = None end = 0 n = None for t in iterator: if t.pos - end > 1 or n != t.n: if start is not None: yield start, end, n start = t.pos end = t.pos n = t.n end = t.pos yield start, end, n if options.scale_method != "none": raise NotImplementedError( "scaling not implemented for pileup method") # Bedgraph track definition if options.output_format == "bedgraph": outfile.write("track type=bedGraph\n") for contig in samfile.references: # if contig != "chrX": continue E.debug("output for %s" % contig) lcontig = contig_sizes[contig] # Write wiggle header if options.output_format in ("wiggle", "bigwig"): outfile.write("variableStep chrom=%s span=%i\n" % (contig, options.span)) # Generate pileup per contig using pysam and iterate over columns for start, end, val in column_iter(samfile.pileup(contig)): # patch: there was a problem with bam files and reads # overextending at the end. These are usually Ns, but # need to check as otherwise wigToBigWig fails. 
if lcontig <= end: E.warn("read extending beyond contig: %s: %i > %i" % (contig, end, lcontig)) end = lcontig if start >= end: continue if val > 0: outf(outfile, contig, start, end, val) ncontigs += 1 # Close output file if type(outf) == type(SpanWriter): outf.flush(outfile) else: outfile.flush() E.info("finished output") # Report counters E.info("ninput=%i, ncontigs=%i, nskipped=%i" % (ninput, ncontigs, nskipped)) # Convert to binary formats if options.output_format == "bigwig": outfile.close() E.info("starting %s conversion" % executable) try: retcode = subprocess.call(" ".join( (executable, tmpfile_wig, tmpfile_sizes, output_filename_pattern)), shell=True) if retcode != 0: E.warn("%s terminated with signal: %i" % (executable, -retcode)) return -retcode except OSError as msg: E.warn("Error while executing bigwig: %s" % msg) return 1 E.info("finished bigwig conversion") else: with open(tmpfile_wig) as inf: sys.stdout.write(inf.read()) # Cleanup temp files shutil.rmtree(tmpdir) E.stop()
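# The nested column_iter in the main() above collapses per-base pileup columns
# into runs of equal depth before they are written as wiggle/bedgraph
# intervals. A standalone sketch of the same run-length idea over plain,
# sorted (position, depth) pairs (the helper is illustrative only; the end
# coordinate here is inclusive):
def _depth_runs_sketch(columns):
    """yield (start, end, depth) runs from sorted (pos, depth) pairs."""
    start = end = depth = None
    for pos, n in columns:
        if start is not None and (pos - end > 1 or n != depth):
            yield start, end, depth
            start = None
        if start is None:
            start, depth = pos, n
        end = pos
    if start is not None:
        yield start, end, depth

assert list(_depth_runs_sketch([(0, 2), (1, 2), (2, 3), (5, 1)])) == \
    [(0, 1, 2), (2, 2, 3), (5, 5, 1)]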
def annotateCpGIslands(infiles, outfile):
    '''annotate transcript by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex(IOTools.openFile(cpgfile))

    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - \
                extension_upstream, tss.start + extension_downstream
        else:
            start, end = tss.end - \
                extension_downstream, tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find(start, end))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match

            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start
            else:
                relative_start = tss.end - genome_end

            relative_end = relative_start + l

            outf.write("\t".join(map(str, (
                tss.name, tss.strand,
                genome_start, genome_end,
                relative_start, relative_end))) + "\n")
            c.matches_output += 1

    outf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
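# The relative coordinates written above are measured from the TSS in the
# direction of transcription. A small strand-aware sketch of the same
# computation (the helper name and the example coordinates are illustrative
# only):
def _relative_interval_sketch(tss_start, tss_end, strand, match_start, match_end):
    """return (relative_start, relative_end) of a match with respect to a TSS."""
    length = match_end - match_start
    if strand == "+":
        relative_start = match_start - tss_start
    else:
        relative_start = tss_end - match_end
    return relative_start, relative_start + length

# a 100 bp match lying 100 bp downstream of a minus-strand TSS at [5000, 5001)
assert _relative_interval_sketch(5000, 5001, "-", 4801, 4901) == (100, 200)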
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-i", "--input-bam", dest="input_bam_file", type="string", help="input bam file") parser.add_option( "-f", "--reference-bam", dest="reference_bam_file", type="string", help="reference BAM file [%default]") parser.add_option( "-q", "--query-name-regex", dest="query_name_regex", type="string", help="regular expression to apply on query name. " "Potentially required to match samtools sort order and should " "evaluate to an integer [%default]") parser.set_defaults( input_bam_file=None, reference_bam_file=None, query_name_regex=None, ) (options, args) = E.start(parser, argv, add_output_options=True) if len(args) == 2: options.input_bam_file = args[0] options.reference_bam_file = args[1] if options.input_bam_file is None: raise ValueError("please supply a BAM file as input") if options.reference_bam_file is None: raise ValueError("please supply a BAM file as reference") # update paths to absolute options.input_bam_file = os.path.abspath(options.input_bam_file) options.reference_bam_file = os.path.abspath(options.reference_bam_file) if not os.path.exists(options.input_bam_file): raise OSError("input bam file {} does not exist".format( options.input_bam_file)) if not os.path.exists(options.reference_bam_file): raise OSError("reference bam file {} does not exist".format( options.reference_bam_file)) bam_in = pysam.AlignmentFile(options.input_bam_file) ref_in = pysam.AlignmentFile(options.reference_bam_file) outf_mapped = E.open_output_file("mapped") outf_mapped.write("\t".join( ["read", "length", "status", "overlap", "comp_contig", "comp_start", "comp_end", "ref_contig", "ref_start", "ref_end", "shared_misaligned", "shared_aligned", "shared_insertion", "shared_deletion", "comp_aligned", "comp_insertion", "comp_deletion", "ref_aligned", "ref_insertion", "ref_deletion"]) + "\n") outf_missing = E.open_output_file("missing") outf_missing.write("\t".join( ["read", "length", "status", "aligned", "insertion", "deletion"]) + "\n") counter = E.Counter() if options.query_name_regex: rx = re.compile(options.query_name_regex) def extract_query(x): return int(rx.search(x).groups()[0]) qname_fn = None if options.query_name_regex: qname_fn = extract_query for reads_cmp, read_ref in group_pairs(iterate_read_pairs( bam_in.fetch(until_eof=True), ref_in.fetch(until_eof=True), qname_fn=qname_fn)): if len(reads_cmp) == 0: counter.missing += 1 pairs_ref = set(read_ref.get_aligned_pairs()) outf_missing.write("\t".join( map(str, ( read_ref.query_name, read_ref.query_length, "missing") + count_pairs(pairs_ref))) + "\n") continue if len(reads_cmp) > 1: # multiple matches counter.multi_mapping += 1 prefix = "multi_" else: counter.unique_mapping += 1 prefix = "unique_" is_mapped = False for read_cmp in reads_cmp: counter.paired += 1 if read_cmp.is_unmapped: counter.unmapped += 1 pairs_ref = set(read_ref.get_aligned_pairs()) outf_missing.write("\t".join( map(str, ( read_ref.query_name, read_ref.query_length, "unmapped") + count_pairs(pairs_ref))) + "\n") continue overlap = max(0, (min(read_cmp.reference_end, read_ref.reference_end) - max(read_cmp.reference_start, read_ref.reference_start))) pairs_cmp = set(read_cmp.get_aligned_pairs()) pairs_ref = set(read_ref.get_aligned_pairs()) shared_cmp = pairs_cmp.intersection(pairs_ref) unique_cmp = pairs_cmp.difference(pairs_ref) missaligned = len([x for x, y in unique_cmp if x is not None and y is not None]) if read_cmp.reference_name != 
read_ref.reference_name or \ overlap == 0: status = "mismapped" else: counter.overlap += 1 status = "mapped" is_mapped = True outf_mapped.write("\t".join( map(str, (read_cmp.query_name, read_cmp.query_length, prefix + status, overlap, read_cmp.reference_name, read_cmp.reference_start, read_cmp.reference_end, read_ref.reference_name, read_ref.reference_start, read_ref.reference_end, missaligned) + count_pairs(shared_cmp) + count_pairs(pairs_cmp) + count_pairs(pairs_ref))) + "\n") else: if is_mapped: status = "mapped" else: status = "mismapped" counter[prefix + status] += 1 with E.open_output_file("summary") as outf: outf.write("category\tcounts\n") outf.write(counter.asTable() + "\n") E.stop()
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None,
                               stranded=False):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and save to
    *filename* in :term:`fasta` format.

    If *shuffled* is set, shuffled copies are created as well with
    the shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*.

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to avoid jobs that take too long.

    If *proportion* is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score'
    (peak score, descending order).

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual interval.
    The intervals will be centered around the mid-point and truncated
    the same way as the main intervals.
    '''
    cc = dbhandle.cursor()

    orderby = ""
    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    elif order != "random":
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, score, strand,
    peakcenter FROM %(tablename)s ''' % locals() + orderby

    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    E.debug("Got %s intervals for track %s" % (len(data), track))
    if len(data) == 0:
        P.touch(filename)
        return

    data = truncateList(data, track,
                        proportion, min_sequences, num_sequences,
                        order == "random")

    beds = bedsFromList(data)

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # At the moment the pipeline retrieves the bed regions from the DB and
    # they will always be on the positive strand, but if this were to change,
    # regions retrieved from the negative strand would be counted from the
    # end of the chromosome and not the beginning without this converter.
    # This should be tested.
    fasta.setConverter(IndexedFasta.getConverter("zero-single-open"))

    # modify the ranges
    if shift == "leftright":
        beds = shitfBeds(beds)

    if halfwidth and not full:
        beds = centreAndCrop(beds, halfwidth)

    sequences = getFASTAFromBed(beds, fasta, stranded, offset, maxsize)

    if shuffled:
        sequences = shuffleFasta(sequences)

    c = E.Counter()
    for masker in masker:
        if masker not in ("unmasked", "none", None):
            ids, sequences = zip(*[(x.title, x.sequence) for x in sequences])
            sequences = maskSequences(sequences, masker)
            sequences = (FastaRecord(id, seq)
                         for id, seq in zip(ids, sequences))

    with IOTools.open_file(filename, "w") as outs:
        for sequence in sequences:
            c.input += 1
            if len(sequence.sequence) == 0:
                c.empty += 1
                continue
            if len(sequence.sequence) < 0:
                c.too_short += 1
                continue

            outs.write(">%s\n%s\n" % (sequence.title, sequence.sequence))
            c.output += 1

    E.info("%s" % c)

    return c.output
def mergeAndFilterGTF(infile, outfile, logfile,
                      genome,
                      max_intron_size=None,
                      remove_contigs=None,
                      rna_file=None):
    '''sanitize transcripts file for cufflinks analysis.

    Merge exons separated by small introns (< 5bp).

    Transcripts will be ignored that

    * have very long introns (`max_intron_size`) (otherwise,
      cufflinks complains)
    * are located on contigs to be ignored (usually: chrM, _random, ...)

    Optionally remove transcripts based on repetitive sequences by
    supplying a repetitive `rna_file`.

    This method preserves all features in a gtf file (exon, CDS, ...).

    Arguments
    ---------
    infile : string
        Input filename in :term:`gtf` format
    outfile : string
        Output filename in :term:`gtf` format
    logfile : string
        Output filename for logging information.
    genome : string
        Filename (without extension) of indexed genome file
        in :term:`fasta` format.
    max_intron_size : int
        Remove transcripts with introns larger than this value.
    remove_contigs : string
        Remove transcripts on contigs matching this regular
        expression string.
    rna_file : string
        Filename of :term:`gff` formatted file with repetitive
        sequences. If given, all transcripts overlapping any regions
        in this file will be removed.

    Returns
    -------
    kept_genes : dict
        a dictionary of all gene_ids that have been kept.
    '''

    c = E.Counter()

    # open in text mode so that gtf lines can be written as strings
    outf = gzip.open(outfile, "wt")

    E.info("filtering by contig and removing long introns")
    contigs = set(IndexedFasta.IndexedFasta(genome).getContigs())

    rx_contigs = None
    if remove_contigs is not None:
        rx_contigs = re.compile(remove_contigs)
        E.info("removing contigs %s" % remove_contigs)

    rna_index = None
    if rna_file is not None:
        if not os.path.exists(rna_file):
            E.warn("file '%s' to remove repetitive rna does not exist" %
                   rna_file)
        else:
            rna_index = GTF.readAndIndex(
                GTF.iterator(IOTools.open_file(rna_file, "r")))
            E.info("removing ribosomal RNA in %s" % rna_file)

    gene_ids = {}

    logf = IOTools.open_file(logfile, "w")
    logf.write("gene_id\ttranscript_id\treason\n")

    for all_exons in GTF.transcript_iterator(
            GTF.iterator(IOTools.open_file(infile))):

        c.input += 1

        e = all_exons[0]
        # filtering
        if e.contig not in contigs:
            c.missing_contig += 1
            logf.write(
                "\t".join((e.gene_id, e.transcript_id,
                           "missing_contig")) + "\n")
            continue

        if rx_contigs and rx_contigs.search(e.contig):
            c.remove_contig += 1
            logf.write(
                "\t".join((e.gene_id, e.transcript_id,
                           "remove_contig")) + "\n")
            continue

        if rna_index and all_exons[0].source != 'protein_coding':
            found = False
            for exon in all_exons:
                if rna_index.contains(e.contig, e.start, e.end):
                    found = True
                    break
            if found:
                logf.write(
                    "\t".join((e.gene_id, e.transcript_id,
                               "overlap_rna")) + "\n")
                c.overlap_rna += 1
                continue

        is_ok = True

        # keep exons and cds separate by grouping by feature
        all_exons.sort(key=lambda x: x.feature)
        new_exons = []

        for feature, exons in itertools.groupby(
                all_exons, lambda x: x.feature):

            tmp = sorted(list(exons), key=lambda x: x.start)

            gene_ids[tmp[0].transcript_id] = tmp[0].gene_id

            l, n = tmp[0], []

            for e in tmp[1:]:
                d = e.start - l.end
                if max_intron_size and d > max_intron_size:
                    is_ok = False
                    break
                elif d < 5:
                    # merge exons separated by a very small intron
                    l.end = max(e.end, l.end)
                    c.merged += 1
                    continue

                n.append(l)
                l = e

            n.append(l)
            new_exons.extend(n)

            if not is_ok:
                break

        if not is_ok:
            logf.write(
                "\t".join((e.gene_id, e.transcript_id,
                           "bad_transcript")) + "\n")
            c.skipped += 1
            continue

        new_exons.sort(key=lambda x: (x.start, x.gene_id, x.transcript_id))

        for e in new_exons:
            outf.write("%s\n" % str(e))
            c.exons += 1

        c.output += 1

    outf.close()
    logf.close()

    L.info("%s" % str(c))

    return gene_ids
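# The merge rule above joins exons separated by introns shorter than 5 bp.
# A standalone sketch of the same rule over plain, sorted (start, end) tuples
# (the helper name is illustrative only):
def _merge_small_introns_sketch(exons, min_intron_size=5):
    """merge sorted (start, end) exons separated by less than min_intron_size."""
    merged = [list(exons[0])]
    for start, end in exons[1:]:
        if start - merged[-1][1] < min_intron_size:
            merged[-1][1] = max(end, merged[-1][1])
        else:
            merged.append([start, end])
    return [tuple(x) for x in merged]

# a 3 bp intron is merged away, a 100 bp intron is kept
assert _merge_small_introns_sketch([(0, 100), (103, 200), (300, 400)]) == \
    [(0, 200), (300, 400)]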
def readSegments(infile, indexed_workspace,
                 truncate=False,
                 format="gtf",
                 keep_ambiguous=False,
                 remove_overhangs=False):
    """read segments from infile.

    segments not overlapping with indexed_workspace are removed.

    If :attr:`truncate` is given, segments extending beyond the
    workspace are truncated.

    returns a list of segments for each contig in a dictionary
    """
    counter = E.Counter()
    segments = collections.defaultdict(list)

    def addSegment(contig, start, end, counter):
        if contig in indexed_workspace:
            r = indexed_workspace[contig].find(start, end)
            if not r:
                counter.nskipped += 1
                return
            if len(r) > 1:
                counter.nambiguous += 1
                if not keep_ambiguous:
                    return
            if truncate:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.ntruncated += 1
                    segments[contig].append((rstart, rend))
                    counter.added += 1
            elif remove_overhangs:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.overhangs += 1
                        break
                    else:
                        segments[contig].append((start, end))
            else:
                segments[contig].append((start, end))
                counter.added += 1

            counter.nkept += 1

    if format == "gtf":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(infile))

        for gene in gtf_iterator:
            # get start and end ignoring introns
            # contig, start, end = gene[0].contig, min([x.start for x in gene]), max([x.end for x in gene])

            contig, coords = gene[0].contig, [(x.start, x.end) for x in gene]
            counter.ninput += 1
            for start, end in coords:
                addSegment(contig, start, end, counter)

    elif format == "bed":
        bed_iterator = Bed.iterator(infile)
        for bed in bed_iterator:
            counter.ninput += 1
            addSegment(bed.contig, bed.start, bed.end, counter)

    E.info("read segments: %s" % str(counter))

    return segments
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-s", "--summarise", dest="summarise", type="choice", choices=("level-counts", "taxa-counts", "individual"), help="summarise the taxa counts - no. phyla etc") parser.add_option("--output-map", dest="output_map", action="store_true", help="ouput map of taxonomy") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) if options.output_map: found = [] options.stdout.write("""Domain\t \ kingdom\t \ phylum\t \ class\t \ order\t \ family\t \ genus\t \ species\n""") # only output the mapping file - do not continue # summarise regardless of the specified options for lca in LCA.iterate(options.stdin): # if bacteria or archaea the kingdom will # be the domain if lca.domain == "Bacteria" or lca.domain == "Archaea": kingdom = lca.domain else: kingdom = lca.kingdom hierarchy = [ lca.domain, kingdom, lca.phylum, lca._class, lca.order, lca.family, lca.genus, lca.species ] if hierarchy in found: continue else: found.append(hierarchy) options.stdout.write("\t".join(hierarchy) + "\n") return if options.summarise == "level-counts": level_counts = collections.defaultdict(set) total = 0 nreads_domain = 0 nreads_kingdom = 0 nreads_kingdom_plus = 0 nreads_phylum = 0 nreads_phylum_plus = 0 nreads_class = 0 nreads_class_plus = 0 nreads_order = 0 nreads_order_plus = 0 nreads_family = 0 nreads_family_plus = 0 nreads_genus = 0 nreads_genus_plus = 0 nreads_species = 0 nreads_species_plus = 0 nreads_subspecies = 0 nreads_subspecies_plus = 0 c = E.Counter() for lca in LCA.iterate(options.stdin): total += 1 if lca.domain != "NA": nreads_domain += 1 level_counts["domain"].add(lca.domain) else: c.kingdom_unmapped += 1 if lca.kingdom != "NA": nreads_kingdom += 1 level_counts["kingdom"].add(lca.kingdom) else: c.kingdom_unmapped += 1 if lca.kingdom_plus != "NA": nreads_kingdom_plus += 1 level_counts["kingdom+"].add(lca.kingdom_plus) else: c.kingdom_plus_unmapped += 1 if lca.phylum != "NA": nreads_phylum += 1 level_counts["phylum"].add(lca.phylum) else: c.phylum_unmapped += 1 if lca.phylum_plus != "NA": nreads_phylum_plus += 1 level_counts["phylum+"].add(lca.phylum_plus) else: c.phylum_plus_unmapped += 1 if lca._class != "NA": nreads_class += 1 level_counts["class"].add(lca._class) else: c.class_unmapped += 1 if lca._class_plus != "NA": nreads_class_plus += 1 level_counts["class+"].add(lca._class_plus) else: c.class_plus_unmapped += 1 if lca.order != "NA": nreads_order += 1 level_counts["order"].add(lca.order) else: c.order_unmapped += 1 if lca.order_plus != "NA": nreads_order_plus += 1 level_counts["order+"].add(lca.order_plus) else: c.order_plus_unmapped += 1 if lca.family != "NA": nreads_family += 1 level_counts["family"].add(lca.family) else: c.family_unmapped += 1 if lca.family != "NA": nreads_family_plus == 1 level_counts["family+"].add(lca.family_plus) else: c.family_plus_unmapped += 1 if lca.genus != "NA": nreads_genus += 1 level_counts["genus"].add(lca.genus) else: c.genus_unmapped += 1 if lca.genus_plus != "NA": nreads_genus_plus == 1 level_counts["genus+"].add(lca.genus_plus) else: c.genus_plus_unmapped += 1 if lca.species != "NA": nreads_species += 1 level_counts["species"].add(lca.species) else: c.species_unmapped += 1 if lca.species_plus != "NA": nreads_species_plus += 1 
level_counts["species+"].add(lca.species_plus) else: c.species_plus_unmapped += 1 # removed subspecies mapping for the time # being # if lca.subspecies != "NA": # nreads_subspecies += 1 # level_counts["subspecies"].add(lca.subspecies) # else: # c.subspecies_unmapped += 1 # if lca.subspecies_plus != "NA": # nreads_subspecies_plus += 1 # level_counts["subspecies+"].add(lca.subspecies_plus) # else: # c.subspecies_plus_unmapped += 1 options.stdout.write("\t".join([ "ndomain", "nkingdom", "nkingdom+", "nphylum", "nphylum+", "nclass", "nclass+", "norder", "norder+", "nfamily", "nfamily+", "ngenus", "ngenus+", "nspecies", "nspecies+", "nseqkingdom", "nseqkingdom+", "nseqphylum", "nseqphylum+", "nseqclass", "nseqclass+", "nseqorder", "nseqorder+", "nseqfamily", "nseqfamily+", "nseqgenus", "nseqgenus+", "nseqspecies", "nseqspecies+" ]) + "\n") options.stdout.write("\t".join( map(str, [ len(level_counts["domain"]), len(level_counts["kingdom"]), len(level_counts["kingdom+"]), len(level_counts["phylum"]), len(level_counts["phylum+"]), len(level_counts["class"]), len(level_counts["class+"]), len(level_counts["order"]), len(level_counts["order+"]), len(level_counts["family"]), len(level_counts["family+"]), len(level_counts["genus"]), len(level_counts["genus+"]), len(level_counts["species"]), len(level_counts["species+"]), nreads_domain, nreads_kingdom, nreads_phylum, nreads_phylum_plus, nreads_class, nreads_class_plus, nreads_order, nreads_order_plus, nreads_family, nreads_family_plus, nreads_genus, nreads_genus_plus, nreads_species, nreads_species_plus ])) + "\n") elif options.summarise == "taxa-counts": unmapped = collections.defaultdict(int) total = 0 taxa_counts = { "domain": collections.defaultdict(int), "kingdom": collections.defaultdict(int), "kingdom+": collections.defaultdict(int), "phylum": collections.defaultdict(int), "phylum+": collections.defaultdict(int), "class": collections.defaultdict(int), "class+": collections.defaultdict(int), "order": collections.defaultdict(int), "order+": collections.defaultdict(int), "family": collections.defaultdict(int), "family+": collections.defaultdict(int), "genus": collections.defaultdict(int), "genus+": collections.defaultdict(int), "species": collections.defaultdict(int), "species+": collections.defaultdict(int) } c = E.Counter() for lca in LCA.iterate(options.stdin): total += 1 if lca.domain != "NA": taxa_counts["domain"][lca.domain] += 1 else: c.kingdom_unmapped += 1 unmapped["domain"] += 1 if lca.kingdom != "NA": taxa_counts["kingdom"][lca.kingdom] += 1 else: c.kingdom_unmapped += 1 unmapped["kingdom"] += 1 if lca.kingdom_plus != "NA": taxa_counts["kingdom+"][lca.kingdom_plus] += 1 else: c.kingdom_plus_unmapped += 1 unmapped["kingdom+"] += 1 if lca.phylum != "NA": taxa_counts["phylum"][lca.phylum] += 1 else: c.phylum_unmapped += 1 unmapped["phylum"] += 1 if lca.phylum_plus != "NA": taxa_counts["phylum+"][lca.phylum_plus] += 1 else: c.phylum_plus_unmapped += 1 unmapped["phylum+"] += 1 if lca._class != "NA": taxa_counts["class"][lca._class] += 1 else: c.class_unmapped += 1 unmapped["class"] += 1 if lca._class_plus != "NA": taxa_counts["class+"][lca._class_plus] += 1 else: c.class_plus_unmapped += 1 unmapped["class+"] += 1 if lca.order != "NA": taxa_counts["order"][lca.order] += 1 else: c.order_unmapped += 1 unmapped["order"] += 1 if lca.order_plus != "NA": taxa_counts["order+"][lca.order_plus] += 1 else: c.order_plus_unmapped += 1 unmapped["order+"] += 1 if lca.family != "NA": taxa_counts["family"][lca.family] += 1 else: c.family_unmapped += 1 
unmapped["family"] += 1 if lca.family_plus != "NA": taxa_counts["family+"][lca.family_plus] += 1 else: c.family_plus_unmapped += 1 unmapped["family+"] += 1 if lca.genus != "NA": taxa_counts["genus"][lca.genus] += 1 else: c.genus_unmapped += 1 unmapped["genus"] += 1 if lca.genus_plus != "NA": taxa_counts["genus+"][lca.genus_plus] += 1 else: c.genus_plus_unmapped += 1 unmapped["genus+"] += 1 if lca.species != "NA": taxa_counts["species"][lca.species] += 1 else: c.species_unmapped += 1 unmapped["species"] += 1 if lca.species_plus != "NA": taxa_counts["species+"][lca.species_plus] += 1 else: c.species_plus_unmapped += 1 unmapped["species+"] += 1 options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n") for level, taxa_count in sorted(taxa_counts.items()): total_level = total - unmapped[level] for taxa, count in sorted(taxa_count.items()): options.stdout.write("\t".join([ level, taxa, str(count), "{:.8}".format(float(count) / total_level), "{:.8}". format(float(count) / (float(total_level) / 1000000)) ]) + "\n") E.info(c) elif options.summarise == "individual": # each read is output with its respective # taxon assignments options.stdout.write("\t".join([ "id", "domain", "kingdom", "kingdom+", "phylum", "phylum+", "class", "class+", "order", "order+", "family", "family+", "genus", "genus+", "species", "species+" ]) + "\n") for lca in LCA.iterate(options.stdin): options.stdout.write("\t".join([ lca.identifier, lca.domain, lca.kingdom, lca.kingdom_plus, lca.phylum, lca.phylum_plus, lca._class, lca._class_plus, lca.order, lca.order_plus, lca.family, lca.family_plus, lca.genus, lca.genus_plus, lca.species, lca.species_plus ]) + "\n") # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--method", dest="method", type="choice", choices=('reconcile', ), help="method to apply [default=%default].") parser.add_option("-c", "--chop-identifier", dest="chop", action="store_true", help="whether or not to trim last character of the " "sequence name. For example sometimes ids in the first " "file in the pair will end with \1 and the second " "with \2. If --chop-identifier is not specified " "then the results will be wrong [default=%default].") parser.add_option("-u", "--unpaired", dest="unpaired", action="store_true", help="whether or not to write out unpaired reads " "to a separate file") parser.add_option("--id-pattern-1", dest="id_pattern_1", help="If specified will use the first group from the" "pattern to determine the ID for the first read", default=None) parser.add_option("--id-pattern-2", dest="id_pattern_2", help="As above but for read 2", default=None) parser.add_option("-o", "--output-filename-pattern", dest="output_pattern", type="string", help="pattern for output files [default=%default].") parser.set_defaults( method="reconcile", chop=False, unpaired=False, output_pattern="%s.fastq.gz", ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) if len(args) != 2: raise ValueError( "please supply at least two fastq files on the commandline") fn1, fn2 = args c = E.Counter() if options.id_pattern_1: id1_getter = PatternGetter(options.id_pattern_1) else: id1_getter = plain_getter if options.id_pattern_2: id2_getter = PatternGetter(options.id_pattern_2) else: id2_getter = plain_getter if options.method == "reconcile": # IMS: switching to no store second set of read names and only use # lazily. Since generators don't have a size must keep track id_lengths = {fn1: 0, fn2: 0} def getIds(infile, id_getter=plain_getter): '''return ids in infile.''' aread = infile.readline while True: l = [aread().rstrip("\r\n") for i in range(4)] if not l[0]: break r = id_getter(l[0].split()[0]) # decide if to chop read number off id_lengths[infile.name] += 1 if options.chop: yield r[:-1] else: yield r def write(outfile, infile, take, unpaired_file=None, id_getter=plain_getter): '''filter fastq files with ids in take.''' aread = infile.readline while True: l = [aread().rstrip("\r\n") for i in range(4)] if not l[0]: break r = id_getter(l[0].split()[0]) if options.chop: r = r[:-1] if r not in take: if unpaired_file is None: continue else: unpaired_file.write("\n".join(l) + "\n") else: outfile.write("\n".join(l) + "\n") E.info("reading first in pair") inf1 = IOTools.open_file(fn1) ids1 = set(getIds(inf1, id1_getter)) E.info("reading second in pair") inf2 = IOTools.open_file(fn2) # IMS: No longer keep as a set, but lazily evaluate into intersection # leads to large memory saving for large inf2, particularly if # inf1 is small. 
ids2 = getIds(inf2, id2_getter) take = ids1.intersection(ids2) E.info("first pair: %i reads, second pair: %i reads, " "shared: %i reads" % (id_lengths[fn1], id_lengths[fn2], len(take))) if options.unpaired: unpaired_filename = IOTools.open_file( options.output_pattern % "unpaired", "w") else: unpaired_filename = None with IOTools.open_file(options.output_pattern % "1", "w") as outf: inf = IOTools.open_file(fn1) E.info("writing first in pair") write(outf, inf, take, unpaired_filename, id1_getter) with IOTools.open_file(options.output_pattern % "2", "w") as outf: inf = IOTools.open_file(fn2) E.info("writing second in pair") write(outf, inf, take, unpaired_filename, id2_getter) if options.unpaired: unpaired_filename.close() # write footer and output benchmark information. E.info("%s" % str(c)) E.stop()
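# The reconcile method above relies on two identifier helpers defined
# elsewhere in the script: plain_getter, which returns the read name
# unchanged, and PatternGetter, which is constructed from --id-pattern-1/-2
# and extracts the identifier via the first group of the supplied regular
# expression. The original definitions are not shown here; the following is
# only a sketch of what such helpers could look like.
import re


def plain_getter_sketch(name):
    # default behaviour: use the read name as-is
    return name


class PatternGetterSketch:
    """Return the first regex group of a read name (sketch of PatternGetter)."""

    def __init__(self, pattern):
        self.regex = re.compile(pattern)

    def __call__(self, name):
        match = self.regex.search(name)
        if match is None:
            # fall back to the full name if the pattern does not match
            return name
        return match.group(1)

# usage (hypothetical pattern):
#   getter = PatternGetterSketch(r"(\S+)/[12]")
#   getter("READ_1/1") -> "READ_1"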
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-b", "--reference-bed-file", dest="reference_bed_file", type="string", help="reference bed file " "[%default]") parser.add_option("-m", "--method", dest="method", type="choice", choices=("lvc-comparison", ), help="methods to apply [%default]") parser.set_defaults(method="lvc-comparison", reference_fasta_file=None, input_bed_file=None, size_bins=(1000, 10000, 100000), output_sets=True, region_string=None) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) reference_set = collections.defaultdict(quicksect.IntervalTree) E.info("reading reference bed file from {}".format( options.reference_bed_file)) with IOTools.open_file(options.reference_bed_file) as inf: for record in pysam.tabix_iterator(inf, pysam.asBed()): mm = reference_set[record.contig] mm.add(record.start, record.end) E.info("read reference intervals on {} contigs: {}".format( len(list(reference_set.keys())), ",".join(list(reference_set.keys())))) if options.output_sets: output_tp = E.open_output_file("tp") output_fp = E.open_output_file("fp") output_fn = E.open_output_file("fn") else: output_tp = None output_fp = None output_fn = None if options.method == "lvc-comparison": c = E.Counter() found = set() counts = {} names = set() nsize_bins = len(options.size_bins) for bin in range(len(options.size_bins) + 1): counts[bin] = dict([(x, collections.defaultdict(int)) for x in ("tp", "fn", "fp", "test", "truth")]) for record in pysam.tabix_iterator(options.stdin, pysam.asBed()): if record.contig not in reference_set: c.ignored_no_contig += 1 continue c.test += 1 matches = reference_set[record.contig].search( record.start, record.end) size = record.end - record.start bin = get_size_bin(size, options.size_bins) if len(matches) == 0: c.fp += 1 status = "fp" if output_fp: output_fp.write(str(record) + "\n") elif len(matches) >= 1: c.tp += 1 status = "tp" if output_tp: output_tp.write(str(record) + "\n") # todo: overlap criteria # record found for match in matches: found.add((record.contig, match.start, match.end)) name = record.name.split(",")[0] names.add(name) counts[bin]["test"][name] += 1 counts[bin][status][name] += 1 outf = options.stdout with IOTools.open_file(options.reference_bed_file) as inf: for record in pysam.tabix_iterator(inf, pysam.asBed()): c.truth += 1 bin = get_size_bin(record.end - record.start, options.size_bins) counts[bin]["truth"]["all"] += 1 key = (record.contig, record.start, record.end) if key not in found: c.fn += 1 counts[bin]["fn"]["all"] += 1 outf.write("\t".join(("category", "size", "test", "tp", "fp", "truth", "fn")) + "\n") for name in sorted(names): for bin in range(len(options.size_bins) + 1): if bin == len(options.size_bins): size_bin = ">={}".format(options.size_bins[-1]) else: size_bin = "<{}".format(options.size_bins[bin]) outf.write("\t".join( map(str, ( name, size_bin, counts[bin]["test"][name], counts[bin]["tp"][name], counts[bin]["fp"][name], counts[bin]["truth"]["all"], counts[bin]["fn"]["all"], ))) + "\n") E.info(str(c)) E.stop()
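# get_size_bin() is used above but defined elsewhere. From its usage - bins
# are indexed 0..len(size_bins), and the reporting loop labels the final bin
# ">=" the largest threshold and all others "<" their threshold - it maps an
# interval size to the index of the first threshold the size falls below.
# A possible sketch (the original helper may differ in detail):
def get_size_bin_sketch(size, size_bins):
    """Return the index of the first threshold that `size` is smaller than;
    sizes >= the largest threshold fall into the extra last bin."""
    for idx, threshold in enumerate(size_bins):
        if size < threshold:
            return idx
    return len(size_bins)

# e.g. with size_bins=(1000, 10000, 100000):
#   get_size_bin_sketch(500, (1000, 10000, 100000))     -> 0  (<1000)
#   get_size_bin_sketch(5000, (1000, 10000, 100000))    -> 1  (<10000)
#   get_size_bin_sketch(2000000, (1000, 10000, 100000)) -> 3  (>=100000)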
def pslMap(options): """thread psl alignments using intervals. """ if options.format == "gtf": use_copy = False else: use_copy = True c = E.Counter() min_length = options.min_aligned for match, qx, tx in iterator_psl_intervals(options): map_query2target = match.getMapQuery2Target() c.input += 1 # if no filter on qx or tx, use full segment if qx is None: qx = [(match.mQueryFrom, match.mQueryTo, 0)] elif tx is None: tx = [(match.mSbjctFrom, match.mSbjctTo, 0)] E.debug('matches in query: %s' % qx) E.debug('matches in target: %s' % tx) # if no overlap: return if not qx or not tx: c.skipped += 1 E.debug("no matches in query or target - skipped") continue for query in qx: qstart, qend, qval = query # skip elements that are too small if qend - qstart < min_length: E.debug("query too small - skipped at %s:%i-%i" % (match.mQueryId, qstart, qend)) c.skipped_small_queries += 1 continue E.debug("working on query %s:%i-%i" % (match.mQueryId, qstart, qend)) mqstart, mqend = ( map_query2target.mapRowToCol( qstart, alignlib_lite.py_RIGHT), map_query2target.mapRowToCol( qend, alignlib_lite.py_LEFT)) if match.strand == "-": qstart, qend = match.mQueryLength - \ qend, match.mQueryLength - qstart for target in tx: tstart, tend, tval = target if (tstart >= mqend or tend <= mqstart): E.debug("no overlap: %i-%i (%i-%i) - %i-%i" % ( qstart, qend, mqstart, mqend, tstart, tend)) continue if tend - tstart < min_length: E.debug("target length too short: %i-%i - %i-%i" % ( qstart, qend, tstart, tend)) continue new = alignlib_lite.py_makeAlignmentBlocks() if use_copy: # do copy with range filter if options.loglevel >= 3: mtstart, mtend = map_query2target.mapColToRow( tstart), map_query2target.mapColToRow(tend) E.debug( ("query: %i-%i (len=%i)-> %i-%i(len=%i); " "target: %i-%i (len=%i)-> %i-%i (len=%i)") % (qstart, qend, qend - qstart, mqstart, mqend, mqend - mqstart, tstart, tend, tend - tstart, mtstart, mtend, mtend - mtstart)) alignlib_lite.py_copyAlignment( new, map_query2target, qstart, qend, tstart, tend) else: # do copy with alignment filter map_query = qval if map_query: tmp = alignlib_lite.py_makeAlignmentBlocks() alignlib_lite.py_copyAlignment( tmp, map_query2target, map_query, alignlib_lite.py_RR) if options.loglevel >= 5: options.stdlog.write( "######## mapping query ###########\n") options.stdlog.write( "# %s\n" % str(alignlib_lite.py_AlignmentFormatEmissions( map_query2target))) options.stdlog.write( "# %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions( map_query))) options.stdlog.write( "# %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions( tmp))) else: tmp = map_query2target map_target = tval if map_target: new = alignlib_lite.py_makeAlignmentBlocks() alignlib_lite.py_copyAlignment( new, tmp, map_target, alignlib_lite.py_CR) if options.loglevel >= 5: options.stdlog.write( "######## mapping target ###########\n") options.stdlog.write( "# before: %s\n" % str(alignlib_lite.py_AlignmentFormatEmissions( tmp))) options.stdlog.write( "# map : %s\n" % str(alignlib_lite.py_AlignmentFormatEmissions( map_target))) options.stdlog.write( "# after : %s\n" % str(alignlib_lite.py_AlignmentFormatEmissions( new))) else: new = tmp if options.loglevel >= 4: E.debug("putative match with intervals: %s and %s: %i-%i" % (str(query), str(target), qstart, qend)) if options.loglevel >= 5: E.debug( "input : %s" % str( alignlib_lite.py_AlignmentFormatEmissions( map_query2target))) E.debug("final : %s" % str(alignlib_lite.py_AlignmentFormatEmissions( new))) if new.getLength() > 0: n = match.copy() n.fromMap(new, 
use_strand=True) E.info("match : %s" % (str(n))) if new.getNumAligned() > options.min_aligned: n = match.copy() n.fromMap(new, use_strand=True) options.stdout.write(str(n) + "\n") c.output += 1 else: c.discarded += 1 break else: c.nooverlap += 1 E.info("map: %s" % str(c))
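# pslMap() above projects query intervals onto the target through the
# query-to-target alignment (mapRowToCol with py_RIGHT/py_LEFT resolving
# positions that fall on block boundaries). The helper below is not the
# alignlib_lite API - it is only a plain-Python illustration of the core
# idea: walking a list of ungapped alignment blocks and translating a
# coordinate from one side of the alignment to the other.
def map_query_to_target_sketch(pos, blocks):
    """Project a query coordinate onto the target through ungapped blocks.

    `blocks` is a hypothetical list of (query_start, target_start, length)
    tuples; returns None if `pos` falls into an alignment gap (the real code
    additionally resolves gap boundaries via py_RIGHT/py_LEFT).
    """
    for query_start, target_start, length in blocks:
        if query_start <= pos < query_start + length:
            return target_start + (pos - query_start)
    return None

# e.g. two blocks separated by a 20 bp insertion in the target:
#   blocks = [(0, 100, 50), (50, 170, 30)]
#   map_query_to_target_sketch(10, blocks) -> 110
#   map_query_to_target_sketch(60, blocks) -> 180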
def main(argv=None): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "--input-filename-fasta", dest="input_filename_fasta", type="string", help="filename with reference sequence in fasta format [%default]") parser.add_option( "--input-filename-bam", dest="input_filename_bam", type="string", help="filename with aligned reads [%default]") parser.add_option( "--method", dest="methods", type="choice", action="append", choices=["add-strelka-genotype", "lift-over"], help="methods to apply [%default]") parser.add_option( "--input-filename-chain", dest="input_filename_chain", type="string", help="filename with alignment chain for lift-over [%default]") parser.add_option( "--normal-sample-regex", dest="normal_sample_regex", type="string", help="regular expression to apply to header to identify normal " "sample id [%default]") parser.add_option( "--output-filename-unmapped", dest="output_filename_unmapped", type="string", help="filename with variants that could not be lifted over [%default]") parser.set_defaults( input_filename_fasta=None, input_filename_bam=None, input_filename_vcf="-", sample_size=0.001, region_size=20, methods=[], normal_sample_regex=None, input_filename_chain=None, output_filename_unmapped=None, ) (options, args) = E.start(parser, argv=argv, add_output_options=True) if len(args) > 0: options.input_filename_vcf = args[0] vcf_in = pysam.VariantFile(options.input_filename_vcf) if "lift-over" in options.methods: if options.input_filename_chain is None: raise ValueError("--method=lift-over requires --input-filename-chain") if not os.path.exists(options.input_filename_chain): raise OSError("file {} with chain data does not exist".format( options.input_filename_chain)) E.info("reading chain from {}".format(options.input_filename_chain)) with IOTools.open_file(options.input_filename_chain) as inf: map_chain, map_contig2length = read_liftover_chain(inf) if options.input_filename_fasta: fasta = pysam.FastaFile(options.input_filename_fasta) else: fasta = None if options.input_filename_bam: bam = pysam.AlignmentFile(options.input_filename_bam) else: bam = None outf = options.stdout c = E.Counter() if "add-strelka-genotype" in options.methods: map_nt2gt = {"ref": "0/0", "het": "0/1", "hom": "1/1", "conflict": "."} map_tumour2gt = {"ref": "0/0", "het": "0/1", "hom": "1/1"} header = str(vcf_in.header).splitlines() header.insert( len(header) - 1, '##FORMAT=<ID=GT,Number=1,Type=String,Description=' '"Genotypes of reference and alternative alleles, ' 'added by CGATCore vcf2vcf.">') header = "\n".join(header) if options.normal_sample_regex: normal_sample = re.search(" -bam-file \S+/([^/]+)_S\d+.bam", header).groups()[0] else: normal_sample = "NORMAL" is_first = True for record in vcf_in: c.input += 1 if "GT" in record.format: if is_first: outf.write(header + "\n") is_first = False outf.write(str(record)) c.has_gt += 1 continue gt_normal = map_nt2gt[record.info["NT"]] gt_tumour = record.info["SGT"] norm, tumour = gt_tumour.split("->") if gt_tumour[0] in "ACGT": alts = record.alts if alts is None: c.no_alt += 1 continue if len(record.alts) > 1: c.multi_allelic += 1 continue _map_tumour2gt = { record.alts[0]: "1", record.ref: "0"} try: gt_tumour = "/".join( sorted([_map_tumour2gt[x] for x in tumour])) except KeyError: gt_tumour = "." c.ambigous_genotype += 1 else: gt_tumour = map_tumour2gt[tumour] fields = str(record)[:-1].split("\t") # FORMAT fields[8] = ":".join(("GT", fields[8])) # SAMPLES # makes a few assumptions, fix! 
header_insert_normal = False if len(fields) == 11: fields[9] = ":".join((gt_normal, fields[9])) fields[10] = ":".join((gt_tumour, fields[10])) elif len(fields) == 10: header_insert_normal = True values = fields[9].split(":") fields.append(":".join((gt_tumour, fields[9]))) fields[9] = ":".join([gt_normal] + ["."] * len(values)) else: raise NotImplementedError() if is_first: if not header_insert_normal: outf.write(header + "\n") else: header = re.sub(r"\tFORMAT\t", "\tFORMAT\t%s\t" % normal_sample, header) outf.write(header + "\n") is_first = False outf.write("\t".join(fields) + "\n") c.output += 1 elif "lift-over" in options.methods: header = str(vcf_in.header).splitlines() if fasta: # validate contig size expected_lengths = dict(list(zip(fasta.references, fasta.lengths))) else: expected_lengths = map_contig2length # update contig names and sizes in VCF header header = [x for x in header if not x.startswith("##contig")] header[-1:-1] = ["##contig=<ID={},length={}>".format( contig, length) for contig, length in sorted(expected_lengths.items())] header.insert( len(header) - 1, '##liftover=<CHAIN={},REFERENCE={}>'.format( options.input_filename_chain, options.input_filename_fasta)) outf.write("\n".join(header) + "\n") unmapped_contigs = set() unknown_contigs = set() trans_genotypes = str.maketrans("01", "10") if fasta: # validate contig size expected_lengths = dict(list(zip(fasta.references, fasta.lengths))) for contig, length in list(map_contig2length.items()): if contig in expected_lengths: if length != expected_lengths[contig]: raise ValueError( "contig lengths mismatch. For contig {} chain files " "says {}, but fasta files says {}".format( contig, length, expected_lengths[contig])) E.info("contig sizes in chain file and fasta files correspond.") if options.output_filename_unmapped: outfile_unmapped = IOTools.open_file(options.output_filename_unmapped, "w") outfile_unmapped.write("\n".join(header) + "\n") else: outfile_unmapped = None for record in vcf_in: c.input += 1 try: mm = map_chain[record.contig] except KeyError: c.skipped_unmapped_contig += 1 unmapped_contigs.add(record.contig) if outfile_unmapped: outfile_unmapped.write("skipped_unmapped_contig\t{}".format(str(record))) continue try: m = mm.search(record.start, record.stop) except AttributeError: c.skipped_mapping_error += 1 if outfile_unmapped: outfile_unmapped.write("skipped_mapping_error\t{}".format(str(record))) continue if len(m) == 0: c.skipped_unmapped_position += 1 if outfile_unmapped: outfile_unmapped.write("skipped_unmapped_position\t{}".format(str(record))) continue elif len(m) > 1: c.skipped_multimapping_position += 1 if outfile_unmapped: outfile_unmapped.write("skipped_multimapping_position\t{}".format(str(record))) continue m = m[0] y_contig, y_start, y_end, y_invert = m.data if y_invert: y_pos = y_end - (record.start - m.start) else: y_pos = (record.start - m.start) + y_start if fasta: try: ref_base = fasta.fetch(y_contig, y_pos, y_pos + len(record.ref)).upper() except KeyError: c.skipped_unknown_contig += 1 unknown_contigs.add(y_contig) ref_base = None continue swap_alleles = False if ref_base: error = False if ref_base == record.ref: c.matches += 1 else: if len(record.alts) == 1: alt_base = record.alts[0] if ref_base == alt_base: swap_alleles = True c.allele_swap_variant += 1 else: c.error_mismatch_variant += 1 error = "mismatch" else: error = "multi-mismatch" c.error_multi_mismatch_variant += 1 if error: if outfile_unmapped: outfile_unmapped.write("{}\t{}".format(error, str(record))) c.skipped_error_variant += 1 
continue fields = str(record)[:-1].split("\t") fields[0] = y_contig fields[1] = str(y_pos) if swap_alleles: fields[4] = alt_base fields[5] = ref_base # update genotype fields keep = False for idx in range(9, len(fields)): gt, rest = fields[idx].split(":", 1) keep = keep or "0" in gt fields[idx] = ":".join((gt.translate(trans_genotypes), rest)) # remove reference only calls if not keep: if outfile_unmapped: outfile_unmapped.write("reference_call\t{}".format(str(record))) c.skipped_allele_swap_reference += 1 continue c.output += 1 outf.write("\t".join(fields) + "\n") c.unmapped_contigs = len(unmapped_contigs) c.unknown_contigs = len(unknown_contigs) E.info(c.asTable()) if unknown_contigs: E.info("unknown contigs: {}".format(",".join(sorted(unknown_contigs)))) if unmapped_contigs: E.info("unmapped contigs: {}".format(",".join(sorted(unmapped_contigs)))) E.stop()
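# The lift-over branch above maps each variant through a single chain block:
# for a forward block the offset into the block is added to the target start,
# for an inverted block it is subtracted from the target end. The function
# below restates that arithmetic in isolation; the parameter names are mine
# and it is meant as a sketch, not as the module's implementation.
def liftover_position_sketch(x_pos, block_start, y_start, y_end, inverted):
    """Map a 0-based position through one chain block (sketch)."""
    offset = x_pos - block_start
    if inverted:
        # inverted block: count backwards from the end of the target block
        return y_end - offset
    return y_start + offset

# e.g. a block mapping source 1000-2000 to target 5000-6000:
#   liftover_position_sketch(1250, 1000, 5000, 6000, inverted=False) -> 5250
#   liftover_position_sketch(1250, 1000, 5000, 6000, inverted=True)  -> 5750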
def findTATABox(infiles, outfile): '''find TATA box in promotors. There are several matrices to choose from: M00216 V$TATA_C Retroviral TATA box M00252 V$TATA_01 cellular and viral TATA box elements M00311 V$ATATA_B Avian C-type TATA box M00320 V$MTATA_B Muscle TATA box ''' # 1. create fasta file - look for TATA box # bedfile, genomefile = infiles statement = ''' slopBed -i %(bedfile)s -l %(tata_search_upstream)i -r %(tata_search_downstream)i -s -g %(genomefile)s | cgat bed2fasta --use-strand --genome=%(genome_dir)s/%(genome)s --log=%(outfile)s.log > %(outfile)s.fasta ''' P.run() match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64' match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat' match_profile = 'minFP_good.prf' match_profile = outfile + ".prf" prf = '''tata.prf prf to minimize sum of both errors - derived from minSUM.prf MIN_LENGTH 300 0.0 1.000 0.716 0.780 M00216 V$TATA_C 1.000 0.738 0.856 M00252 V$TATA_01 1.000 0.717 0.934 M00311 V$ATATA_B 1.000 0.711 0.784 M00320 V$MTATA_B // ''' with IOTools.openFile(match_profile, "w") as outf: outf.write(prf) # -u : uniq - only one best match per sequence statement = ''' %(match_executable)s %(match_matrix)s %(outfile)s.fasta %(outfile)s.match %(match_profile)s -u >> %(outfile)s.log ''' P.run() transcript2pos = {} for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")): transcript_id, contig, start, end, strand = re.match( "(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups() transcript2pos[transcript_id] = (contig, int(start), int(end), strand) MATCH = collections.namedtuple( "MATCH", "pid transfac_id pos strand core_similarity matrix_similarity sequence" ) def _grouper(infile): r = [] keep = False for line in infile: if line.startswith("Inspecting sequence ID"): keep = True if r: yield pid, r r = [] pid = re.match("Inspecting sequence ID\s+(\S+)", line).groups()[0] continue elif line.startswith(" Total"): break if not keep: continue if line[:-1].strip() == "": continue transfac_id, v, core_similarity, matrix_similarity, sequence = [ x.strip() for x in line[:-1].split("|") ] pos, strand = re.match("(\d+) \((\S)\)", v).groups() r.append( MATCH._make((pid, transfac_id, int(pos), strand, float(core_similarity), float(matrix_similarity), sequence))) yield pid, r offset = PARAMS["tata_search_upstream"] outf = IOTools.openFile(outfile + ".table.gz", "w") outf.write("\t".join(("transcript_id", "strand", "start", "end", "relative_start", "relative_end", "transfac_id", "core_similarity", "matrix_similarity", "sequence")) + "\n") bedf = IOTools.openFile(outfile, "w") c = E.Counter() found = set() for transcript_id, matches in _grouper(IOTools.openFile(outfile + ".match")): contig, seq_start, seq_end, strand = transcript2pos[transcript_id] c.promotor_with_matches += 1 nmatches = 0 found.add(transcript_id) for match in matches: c.matches_total += 1 lmatch = len(match.sequence) if match.strand == "-": c.matches_wrong_strand += 1 continue # get genomic location of match if strand == "+": genome_start = seq_start + match.pos else: genome_start = seq_end - match.pos - lmatch genome_end = genome_start + lmatch # get relative location of match if strand == "+": tss_start = seq_start + offset relative_start = genome_start - tss_start else: tss_start = seq_end - offset relative_start = tss_start - genome_end relative_end = relative_start + lmatch outf.write("\t".join( map(str, (transcript_id, strand, genome_start, genome_end, relative_start, relative_end, match.transfac_id, match.core_similarity, 
match.matrix_similarity, match.sequence))) + "\n") c.matches_output += 1 nmatches += 1 bedf.write("\t".join( map(str, (contig, genome_start, genome_end, transcript_id, strand, match.matrix_similarity))) + "\n") if nmatches == 0: c.promotor_filtered += 1 else: c.promotor_output += 1 c.promotor_total = len(transcript2pos) c.promotor_without_matches = len( set(transcript2pos.keys()).difference(found)) outf.close() bedf.close() with IOTools.openFile(outfile + ".summary", "w") as outf: outf.write("category\tcounts\n") outf.write(c.asTable() + "\n") E.info(c)
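# findTATABox() above converts each match position on the extracted promotor
# sequence back to genomic coordinates and to coordinates relative to the
# TSS, taking strand and the upstream search window (tata_search_upstream)
# into account. The helper below restates that conversion on its own; it is
# a sketch with hypothetical parameter names, not part of the pipeline.
def match_to_genome_coords_sketch(match_pos, match_len,
                                  seq_start, seq_end, strand, offset):
    """Return (genome_start, genome_end, relative_start, relative_end)
    for a match at `match_pos` within a promotor extracted from
    [seq_start, seq_end); `offset` is the upstream extension (sketch)."""
    if strand == "+":
        genome_start = seq_start + match_pos
    else:
        genome_start = seq_end - match_pos - match_len
    genome_end = genome_start + match_len

    if strand == "+":
        tss = seq_start + offset
        relative_start = genome_start - tss
    else:
        tss = seq_end - offset
        relative_start = tss - genome_end
    return genome_start, genome_end, relative_start, relative_start + match_len

# e.g. a 10 bp match 25 bp into a promotor extracted with 100 bp upstream:
#   match_to_genome_coords_sketch(25, 10, 5000, 5400, "+", 100)
#   -> (5025, 5035, -75, -65)   # the match starts 75 bp upstream of the TSS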