def applyThreshold(infile, fasta, threshold, max_distance=0):
    '''apply threshold to a wig file, writing a bed-formatted file
    as output.'''

    c = E.Counter()

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        c.contigs += 1

        E.debug("processing %s" % contig)

        last_start, last_end = -1, 0

        for start, end, value in block_iterator(infile, contig, size):
            d = start - last_end
            if (d > 0 or value < threshold):
                # close the currently open interval
                if last_start >= 0:
                    yield contig, last_start, last_end
                    c.intervals += 1
                last_start = -1
            elif last_start < 0 and value >= threshold:
                # open a new interval
                last_start = start
            # always remember the end of the last block seen
            last_end = end

        if last_start >= 0:
            yield contig, last_start, end
            c.intervals += 1

    c.output += 1
    E.info(str(c))
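
# Usage sketch for applyThreshold (hedged: "coverage.wig" and "genome" are
# hypothetical filenames, and block_iterator is assumed to yield
# (start, end, value) tuples per contig):
#
#   fasta = IndexedFasta.IndexedFasta("genome")
#   with IOTools.openFile("coverage.wig") as inf:
#       for contig, start, end in applyThreshold(inf, fasta, threshold=10.0):
#           sys.stdout.write("%s\t%i\t%i\n" % (contig, start, end))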
def annotateCpGIslands(infiles, outfile):
    '''annotate transcript by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex(IOTools.openFile(cpgfile))

    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - extension_upstream, \
                tss.start + extension_downstream
        else:
            start, end = tss.end - extension_downstream, \
                tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find(start, end))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match
            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start
            else:
                relative_start = tss.end - genome_end

            relative_end = relative_start + l

            outf.write("\t".join(map(str, (
                tss.name, tss.strand,
                genome_start, genome_end,
                relative_start, relative_end))) + "\n")
            c.matches_output += 1

    outf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
def runAnalysis(sequences, arrangements, matrix='nr', qvalue_threshold=0.05):

    if matrix == 'nr':
        sense_matrix = NR
    elif matrix == "rxrvdr":
        sense_matrix = RXRVDR
    else:
        raise ValueError("unknown matrix")

    matcher = MatcherRandomisationSequence(sense_matrix)

    # find motifs in both foreground and control together
    results = []
    for x, sequence in enumerate(sequences):
        result = matcher.run(sequence,
                             arrangements,
                             qvalue_threshold=qvalue_threshold)
        for r in result:
            results.append(r._replace(sequence=x))

    nsequences = len(sequences)

    fg_filtered = combineMotifs(results)

    fg_counter = E.Counter()
    fg_seqs = set()
    co_counter = E.Counter()
    co_seqs = set()

    for x in results:
        fg_counter[x.arrangement] += 1
        fg_seqs.add(x.sequence)

    for x in fg_filtered:
        co_counter[x.arrangement] += 1
        co_seqs.add(x.sequence)

    for x in arrangements:
        print(x, fg_counter[x], co_counter[x])
    print(len(fg_seqs), len(co_seqs))

    return fg_filtered
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("script", "module"),
                      help="type of tests to create [%default].")

    parser.set_defaults(method="script")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 0:
        raise ValueError(
            "setup_test.py requires one or more command line arguments")

    targetdir = os.path.dirname(__file__)

    counter = E.Counter()

    for arg in args:
        counter.input += 1
        script_dirname, basename = os.path.split(arg)

        dirname = os.path.join(targetdir, basename)

        if os.path.exists(dirname):
            E.warn("%s already exists - skipping" % basename)
            counter.skipped += 1
            continue

        os.mkdir(dirname)

        with open(os.path.join(dirname, "tests.yaml"), "w") as outf:
            outf.write(YAML_TEMPLATE)

        counter.created += 1

    E.info("%s" % str(counter))

    # write footer and output benchmark information.
    E.Stop()
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles):
    '''output a bed file with genomic regions with functional annotations.

    The regions for each gene are given in the gtf file.

    Each bed entry is a gene territory. Bed entries are labeled
    by functional annotations associated with a gene.

    Ambiguities in territories are resolved by outputting
    annotations for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.
    '''
    to_cluster = True

    territories_file = gtffile

    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.openFile(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    # IMS: connect is not in this module. dbh needs to be passed from caller
    # dbh = connect()
    cc = dbh.cursor()

    outf = P.getTempFile(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description FROM %s_assignments" %
                db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write("\t".join(map(str, (
                contig, start, end,
                "%s:%s" % (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name

    statement = '''sort -k1,1 -k2,2n < %(tmpfname)s | uniq
    | gzip > %(outfile_bed)s'''

    P.run()

    outf = IOTools.openFile(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.items():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()
def checkRequirementsFromAllModules():

    all_modules = sys.modules
    counter = E.Counter()
    results = []
    for module in list(sys.modules.keys()):
        if all_modules[module] is not None:
            results.extend(
                checkRequirementsFromModule(all_modules[module], counter))
    return counter, results
def ReadGene2GOFromFile(infile, synonyms={}, obsolete={}):
    """reads GO mappings for all go_types from a file.

    If *synonyms* is given, goids in *synonyms* will be translated.
    Terms in *obsolete* will be discarded.

    returns two maps: gene2go maps genes to go categories
    and go2info maps go categories to information.
    """

    gene2gos = {}
    go2infos = {}
    c = E.Counter()

    for line in infile:
        if line[0] == "#":
            continue
        try:
            go_type, gene_id, goid, description, evidence = \
                line[:-1].split("\t")
        except ValueError as msg:
            raise ValueError("parsing error in line '%s': %s" %
                             (line[:-1], msg))

        if go_type == "go_type":
            continue

        c.input += 1

        if goid in synonyms:
            c.synonyms += 1
            goid = synonyms[goid]

        if goid in obsolete:
            c.obsolete += 1
            continue

        gm = GOMatch(goid, go_type, description, evidence)
        gi = GOInfo(goid, go_type, description)
        if go_type not in gene2gos:
            gene2gos[go_type] = {}
            go2infos[go_type] = {}

        gene2go = gene2gos[go_type]
        go2info = go2infos[go_type]

        if gene_id not in gene2go:
            gene2go[gene_id] = []

        gene2go[gene_id].append(gm)
        go2info[goid] = gi
        c.output += 1

    E.debug("read gene2go assignments: %s" % str(c))

    return gene2gos, go2infos
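
# Expected input format for ReadGene2GOFromFile (an assumption inferred
# from the parser above: five tab-separated columns, with an optional
# header row whose first field is "go_type"):
#
#   go_type       gene_id          goid        description  evidence
#   biol_process  ENSG00000139618  GO:0006281  DNA repair   IEA
#
# e.g. (hypothetical filename):
#
#   with IOTools.openFile("gene2go.tsv") as inf:
#       gene2gos, go2infos = ReadGene2GOFromFile(inf)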
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py"))
    files.sort()

    c = E.Counter()
    for f in files:
        # if f != "pipeline_ancestral_repeats.py": continue
        E.debug("importing %s" % f)
        c.input += 1
        prefix, suffix = os.path.splitext(f)
        dirname, basename = os.path.split(prefix)

        if os.path.exists(prefix + ".pyc"):
            os.remove(prefix + ".pyc")

        success = False
        try:
            __import__(basename, globals(), locals())
            c.success += 1
            success = True
            options.stdout.write("PASS %s\n" % basename)
            options.stdout.flush()
        except ImportError as msg:
            c.import_fail += 1
            options.stdout.write("FAIL %s\n%s\n" % (basename, msg))
            options.stdout.flush()
            traceback.print_exc()
        except Exception as msg:
            c.other_fail += 1
            options.stdout.write("FAIL %s\n%s\n" % (basename, msg))
            options.stdout.flush()
            traceback.print_exc()
def calculateSplicingIndex(bamfile, gtffile, outfile):

    bamfile = pysam.AlignmentFile(bamfile)

    counts = E.Counter()

    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(gtffile))):

        introns = GTF.toIntronIntervals(transcript)
        E.debug("Gene %s (%s), Transcript: %s, %i introns" %
                (transcript[0].gene_id,
                 transcript[0].contig,
                 transcript[0].transcript_id,
                 len(introns)))

        for intron in introns:
            reads = bamfile.fetch(reference=transcript[0].contig,
                                  start=intron[0], end=intron[1])

            for read in reads:
                if 'N' in read.cigarstring:
                    blocks = read.get_blocks()
                    starts, ends = zip(*blocks)
                    if intron[0] in ends and intron[1] in starts:
                        counts["Exon_Exon"] += 1
                    else:
                        counts["spliced_uncounted"] += 1
                elif (read.reference_start <= intron[0] - 3 and
                      read.reference_end >= intron[0] + 3):
                    if transcript[0].strand == "+":
                        counts["Exon_Intron"] += 1
                    else:
                        counts["Intron_Exon"] += 1
                elif (read.reference_start <= intron[1] - 3 and
                      read.reference_end >= intron[1] + 3):
                    if transcript[0].strand == "+":
                        counts["Intron_Exon"] += 1
                    else:
                        counts["Exon_Intron"] += 1
                else:
                    counts["unspliced_uncounted"] += 1

    E.debug("Done, counts are: " + str(counts))

    header = ["Exon_Exon",
              "Exon_Intron",
              "Intron_Exon",
              "spliced_uncounted",
              "unspliced_uncounted"]

    with IOTools.openFile(outfile, "w") as outf:
        outf.write("\t".join(header) + "\n")
        outf.write("\t".join(map(str, [counts[col] for col in header])) +
                   "\n")
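
# Worked example for the junction test above (hypothetical coordinates):
# a spliced read with CIGAR 20M100N20M aligned at position 1000 has
# read.get_blocks() == [(1000, 1020), (1120, 1140)], so for the intron
# (1020, 1120) the test "intron[0] in ends and intron[1] in starts"
# succeeds and the read is counted as Exon_Exon (junction-spanning);
# any other spliced read overlapping the intron falls into
# spliced_uncounted.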
def imputeGO(infile_go, infile_paths, outfile):
    '''impute GO accessions.

    Infile is a file with GO associations and a file with paths of
    term to ancestor (see go2fmt.pl).
    '''

    c = E.Counter()

    term2ancestors = collections.defaultdict(set)
    with IOTools.openFile(infile_paths) as inf:
        for line in inf:
            parts = line[:-1].split()
            term = parts[0]
            ancestors = [parts[x] for x in range(2, len(parts), 2)]
            # there can be multiple paths
            term2ancestors[term].update(ancestors)

    goid2description = {}
    gene2goids = collections.defaultdict(list)
    goid2type = {}
    with IOTools.openFile(infile_go) as inf:
        for line in inf:
            if line.startswith("go_type"):
                continue
            go_type, gene_id, goid, description, evidence = \
                line[:-1].split("\t")
            gene2goids[gene_id].append(goid)
            goid2description[goid] = description
            goid2type[goid] = go_type

    outf = IOTools.openFile(outfile, "w")
    for gene_id, in_goids in gene2goids.items():
        c.genes += 1
        out_goids = set(in_goids)
        for goid in in_goids:
            out_goids.update(term2ancestors[goid])
        if len(in_goids) != len(out_goids):
            c.increased += 1
        else:
            c.complete += 1

        for goid in out_goids:
            outf.write("\t".join(
                (goid2type.get(goid, ""), gene_id, goid,
                 goid2description.get(goid, ""), "NA")) + "\n")
            c.associations += 1

    outf.close()

    E.info("%s" % str(c))
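
# Sketch of the expected go2fmt.pl paths format (an assumption inferred
# from the parser above: a term, then alternating relation/ancestor
# columns, so ancestors sit at every second field from index 2):
#
#   GO:0006281 is_a GO:0006259 is_a GO:0090304 is_a GO:0044238
#
# would record {GO:0006259, GO:0090304, GO:0044238} as ancestors of
# GO:0006281, and imputeGO would add them to every gene annotated with
# GO:0006281.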
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.
    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks',
              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
              'st_mode', 'st_mtime', 'st_nlink',
              'st_rdev', 'st_size', 'st_uid')

    dry_run = PARAMS.get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = IOTools.openFile(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = IOTools.openFile(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = IOTools.zapFile(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write("%s\t%s\t%s\t%s\n" % (
                    fn,
                    time.asctime(time.localtime(time.time())),
                    linkdest,
                    "\t".join([str(getattr(stat, x)) for x in fields])))

    E.info("zapped: %s" % (c))
    if not dry_run:
        outfile.close()

    return c
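
# Usage sketch for clean() (hedged: the glob pattern and logfile name are
# hypothetical; PARAMS["dryrun"] controls whether files are actually
# zapped):
#
#   import glob
#   counts = clean(glob.glob("export/*.bam"), "zap.log")
#   E.info("zapped %i of %i files" % (counts.zapped, counts.files))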
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n", "--dry-run", dest="dry_run", action="store_true",
                      help="dry run, do not delete any files [%default]")

    parser.set_defaults(dry_run=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filenames = args

    c = E.Counter()
    for filename in filenames:
        c.checked += 1
        if os.path.exists(filename + ".log"):
            if IOTools.isComplete(filename + ".log"):
                c.complete += 1
                continue

        if IOTools.isComplete(filename):
            c.complete += 1
            continue

        c.incomplete += 1
        E.info('deleting %s' % filename)
        if options.dry_run:
            continue
        os.unlink(filename)
        c.deleted += 1

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k", "--keep-header", dest="keep_header", type="int",
                      help="randomize, but keep header in place [%default]")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    inf = options.stdin
    outf = options.stdout
    c = E.Counter()
    for x in range(options.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    for line in lines:
        outf.write(line)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
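
# Example invocation (sketch; the script and file names are hypothetical):
#
#   python shuffle_lines.py --keep-header=1 < data.tsv > shuffled.tsv
#
# writes the first line unchanged and the remaining lines in random order.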
def buildMRBed(infile, outfile):
    '''output bed6 file with methylated regions.

    All regions are output, even the insignificant ones.

    The score is the log fold change.
    '''

    outf = IOTools.openFile(outfile, "w")
    c = E.Counter()
    for row in csv.DictReader(IOTools.openFile(infile),
                              dialect="excel-tab"):
        c.input += 1

        contig, start, end = re.match(
            r"(.*):(\d+)-(\d+)", row["interval_id"]).groups()
        c.output += 1
        outf.write("\t".join((contig, start, end,
                              str(c.input), row["lfold"])) + "\n")

    outf.close()
    E.info("%s" % str(c))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--exons-file", "--gtf-file",
                      dest="filename_exons",
                      type="string", metavar="gtf",
                      help="gtf formatted file with non-overlapping exon "
                      "locations (required). [%default]")

    parser.set_defaults(
        filename_exons=None,
        read_length=200,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    exons = GTF.readAndIndex(
        GTF.iterator(IOTools.openFile(options.filename_exons)))

    pysam_in = pysam.AlignmentFile("-", "rb")

    nspliced = 0
    nspliced_ignored = 0
    nspliced_nooverlap = 0
    nspliced_halfoverlap = 0
    nspliced_bothoverlap = 0
    nspliced_overrun = [0] * 2 * (options.read_length + 10)
    nspliced_exact = 0
    nspliced_inexact = 0

    nunspliced = 0
    nunspliced_overlap = 0
    nunspliced_ignored = 0
    nunspliced_nooverlap = 0
    nunspliced_overrun = [0] * (options.read_length + 10)

    overrun_offset = options.read_length + 10

    ninput = 0
    nunmapped = 0

    c = E.Counter()

    def _splice_overrun(start, end, overlap):
        '''return splicesite over/underrun.

        positive values: overrun
        negative values: underrun
        0: no over/underrun
        '''

        exon_start = min([x[0] for x in overlap])
        exon_end = max([x[1] for x in overlap])

        if start <= exon_start and end > exon_start:
            # overrun at start or match
            r = exon_start - start
        elif start < exon_end and end >= exon_end:
            # overrun at end or match
            r = end - exon_end
        else:
            # underrun - distance to closest exon boundary
            r = -min(start - exon_start, exon_end - end)

        return r

    for read in pysam_in:
        ninput += 1
        if read.is_unmapped:
            nunmapped += 1
            continue

        # check for BAM_CREF_SKIP code in cigar string
        cigar = read.cigar
        is_spliced = 3 in [x[0] for x in cigar]

        contig = pysam_in.getrname(read.tid)
        start = read.pos
        end = read.aend
        if is_spliced:
            # count both ends
            nspliced += 1

            if len(cigar) != 3:
                nspliced_ignored += 1
                continue

            start5, end5 = start, start + cigar[0][1]
            start3, end3 = end - cigar[2][1], end
            try:
                overlap3 = list(exons.get(contig, start3, end3))
                overlap5 = list(exons.get(contig, start5, end5))
            except KeyError:
                overlap3 = overlap5 = []

            ovl3 = len(overlap3)
            ovl5 = len(overlap5)
            o3 = o5 = None
            if not ovl3 and not ovl5:
                nspliced_nooverlap += 1
            elif ovl3 and not ovl5:
                nspliced_halfoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
            elif ovl5 and not ovl3:
                nspliced_halfoverlap += 1
                o5 = _splice_overrun(start5, end5, overlap5)
            else:
                # both overlap
                nspliced_bothoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
                o5 = _splice_overrun(start5, end5, overlap5)

            if o3 is not None:
                if o3 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o3)] += 1
            if o5 is not None:
                if o5 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o5)] += 1
        else:
            nunspliced += 1
            try:
                overlap = list(exons.get(contig, start, end))
            except KeyError:
                overlap = []

            if len(overlap) == 0:
                nunspliced_nooverlap += 1
            elif len(overlap) >= 1:
                nunspliced_overlap += 1
                # multiple overlap - merge exons (usually: small introns)
                exon_start = min([x[0] for x in overlap])
                exon_end = max([x[1] for x in overlap])
                ostart = max(0, exon_start - start)
                oend = max(0, end - exon_end)
                o = min(end, exon_end) - max(start, exon_start)
                overrun = ostart + oend
                nunspliced_overrun[overrun] += 1

    # output histograms
    outfile = E.openOutputFile("overrun")
    outfile.write(
        "bases\tunspliced_overrun_counts\t"
        "spliced_overrun_counts\tspliced_underrun_counts\n")
    _nspliced_overrun = nspliced_overrun[overrun_offset:]
    _nspliced_underrun = nspliced_overrun[:overrun_offset + 1]
    _nspliced_underrun.reverse()
    for x, v in enumerate(zip(nunspliced_overrun,
                              _nspliced_overrun,
                              _nspliced_underrun)):
        outfile.write("%i\t%s\n" % (x, "\t".join(map(str, v))))
    outfile.close()

    # output summary
    # convert to counter
    c.input = ninput
    c.unmapped = nunmapped
    c.mapped = ninput - nunmapped

    c.unspliced = nunspliced
    c.unspliced_nooverlap = nunspliced_nooverlap
    c.unspliced_nooverrun = nunspliced_overrun[0]
    c.unspliced_overlap = nunspliced_overlap
    c.unspliced_overrun = sum(nunspliced_overrun[1:])

    c.spliced = nspliced
    c.spliced_nooverlap = nspliced_nooverlap
    c.spliced_halfoverlap = nspliced_halfoverlap
    c.spliced_bothoverlap = nspliced_bothoverlap
    c.spliced_exact = nspliced_exact
    c.spliced_inexact = nspliced_inexact
    c.spliced_ignored = nspliced_ignored
    c.spliced_underrun = sum(_nspliced_underrun[1:])
    c.spliced_overrun = sum(_nspliced_overrun[1:])

    outfile = options.stdout
    outfile.write("category\tcounts\n")
    for k, v in sorted(c.items()):
        outfile.write("%s\t%i\n" % (k, v))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="bioconductor database to use "
                      "[default=%default].")

    parser.add_option("-m", "--mapping", dest="mapping", type="string",
                      help="bioconductor mapping to use "
                      "[default=%default].")

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with the gene set in gtf format [default=%default].")

    parser.set_defaults(
        database="mouse4302.db",
        mapping="ENSEMBL",
        filename_gtf=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    prefix = options.database[:-len(".db")]

    mapping_probeset2gene = prefix + options.mapping
    mapping_probeset2loc = prefix + "CHRLOC"

    probeset2gene = getProbeset2Gene(database=options.database)

    probeset2location = getProbeset2Location(database=options.database)

    # gtf = GTF.readAndIndex(
    #     GTF.iterator(IOTools.openFile(options.filename_gtf)))

    counts = E.Counter()

    outfile_notfound = open("notfound.table", "w")

    options.stdout.write("probeset_id\tgene_id\tngenes\n")

    for probeset, locations in probeset2location.items():
        counts.probesets += 1
        gene_ids = probeset2gene[probeset]
        if len(gene_ids) == 0:
            counts.notfound += 1
            continue
        for gene_id in gene_ids:
            options.stdout.write("%s\t%s\t%i\n" %
                                 (probeset, gene_id, len(gene_ids)))
        counts.output += 1

    outfile_notfound.close()

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--summarise", dest="summarise", type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map", dest="output_map", action="store_true",
                      help="output map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.output_map:
        found = []
        options.stdout.write("\t".join([
            "Domain", "kingdom", "phylum", "class",
            "order", "family", "genus", "species"]) + "\n")
        # only output the mapping file - do not continue to
        # summarise regardless of the specified options
        for lca in LCA.iterate(options.stdin):
            # if bacteria or archaea the kingdom will
            # be the domain
            if lca.domain == "Bacteria" or lca.domain == "Archaea":
                kingdom = lca.domain
            else:
                kingdom = lca.kingdom

            hierarchy = [lca.domain, kingdom, lca.phylum, lca._class,
                         lca.order, lca.family, lca.genus, lca.species]
            if hierarchy in found:
                continue
            else:
                found.append(hierarchy)
                options.stdout.write("\t".join(hierarchy) + "\n")
        return

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        total = 0
        nreads_domain = 0
        nreads_kingdom = 0
        nreads_kingdom_plus = 0
        nreads_phylum = 0
        nreads_phylum_plus = 0
        nreads_class = 0
        nreads_class_plus = 0
        nreads_order = 0
        nreads_order_plus = 0
        nreads_family = 0
        nreads_family_plus = 0
        nreads_genus = 0
        nreads_genus_plus = 0
        nreads_species = 0
        nreads_species_plus = 0
        nreads_subspecies = 0
        nreads_subspecies_plus = 0

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                nreads_domain += 1
                level_counts["domain"].add(lca.domain)
            else:
                c.domain_unmapped += 1

            if lca.kingdom != "NA":
                nreads_kingdom += 1
                level_counts["kingdom"].add(lca.kingdom)
            else:
                c.kingdom_unmapped += 1

            if lca.kingdom_plus != "NA":
                nreads_kingdom_plus += 1
                level_counts["kingdom+"].add(lca.kingdom_plus)
            else:
                c.kingdom_plus_unmapped += 1

            if lca.phylum != "NA":
                nreads_phylum += 1
                level_counts["phylum"].add(lca.phylum)
            else:
                c.phylum_unmapped += 1

            if lca.phylum_plus != "NA":
                nreads_phylum_plus += 1
                level_counts["phylum+"].add(lca.phylum_plus)
            else:
                c.phylum_plus_unmapped += 1

            if lca._class != "NA":
                nreads_class += 1
                level_counts["class"].add(lca._class)
            else:
                c.class_unmapped += 1

            if lca._class_plus != "NA":
                nreads_class_plus += 1
                level_counts["class+"].add(lca._class_plus)
            else:
                c.class_plus_unmapped += 1

            if lca.order != "NA":
                nreads_order += 1
                level_counts["order"].add(lca.order)
            else:
                c.order_unmapped += 1

            if lca.order_plus != "NA":
                nreads_order_plus += 1
                level_counts["order+"].add(lca.order_plus)
            else:
                c.order_plus_unmapped += 1

            if lca.family != "NA":
                nreads_family += 1
                level_counts["family"].add(lca.family)
            else:
                c.family_unmapped += 1

            if lca.family_plus != "NA":
                nreads_family_plus += 1
                level_counts["family+"].add(lca.family_plus)
            else:
                c.family_plus_unmapped += 1

            if lca.genus != "NA":
                nreads_genus += 1
                level_counts["genus"].add(lca.genus)
            else:
                c.genus_unmapped += 1

            if lca.genus_plus != "NA":
                nreads_genus_plus += 1
                level_counts["genus+"].add(lca.genus_plus)
            else:
                c.genus_plus_unmapped += 1

            if lca.species != "NA":
                nreads_species += 1
                level_counts["species"].add(lca.species)
            else:
                c.species_unmapped += 1

            if lca.species_plus != "NA":
                nreads_species_plus += 1
                level_counts["species+"].add(lca.species_plus)
            else:
                c.species_plus_unmapped += 1

            # removed subspecies mapping for the time
            # being

            # if lca.subspecies != "NA":
            #     nreads_subspecies += 1
            #     level_counts["subspecies"].add(lca.subspecies)
            # else:
            #     c.subspecies_unmapped += 1

            # if lca.subspecies_plus != "NA":
            #     nreads_subspecies_plus += 1
            #     level_counts["subspecies+"].add(lca.subspecies_plus)
            # else:
            #     c.subspecies_plus_unmapped += 1

        options.stdout.write("\t".join([
            "ndomain",
            "nkingdom", "nkingdom+",
            "nphylum", "nphylum+",
            "nclass", "nclass+",
            "norder", "norder+",
            "nfamily", "nfamily+",
            "ngenus", "ngenus+",
            "nspecies", "nspecies+",
            "nseqdomain",
            "nseqkingdom", "nseqkingdom+",
            "nseqphylum", "nseqphylum+",
            "nseqclass", "nseqclass+",
            "nseqorder", "nseqorder+",
            "nseqfamily", "nseqfamily+",
            "nseqgenus", "nseqgenus+",
            "nseqspecies", "nseqspecies+"]) + "\n")

        # the counts are written in the same order as the header above
        options.stdout.write("\t".join(map(str, [
            len(level_counts["domain"]),
            len(level_counts["kingdom"]),
            len(level_counts["kingdom+"]),
            len(level_counts["phylum"]),
            len(level_counts["phylum+"]),
            len(level_counts["class"]),
            len(level_counts["class+"]),
            len(level_counts["order"]),
            len(level_counts["order+"]),
            len(level_counts["family"]),
            len(level_counts["family+"]),
            len(level_counts["genus"]),
            len(level_counts["genus+"]),
            len(level_counts["species"]),
            len(level_counts["species+"]),
            nreads_domain,
            nreads_kingdom, nreads_kingdom_plus,
            nreads_phylum, nreads_phylum_plus,
            nreads_class, nreads_class_plus,
            nreads_order, nreads_order_plus,
            nreads_family, nreads_family_plus,
            nreads_genus, nreads_genus_plus,
            nreads_species, nreads_species_plus])) + "\n")

    elif options.summarise == "taxa-counts":
        unmapped = collections.defaultdict(int)
        total = 0
        taxa_counts = {"domain": collections.defaultdict(int),
                       "kingdom": collections.defaultdict(int),
                       "kingdom+": collections.defaultdict(int),
                       "phylum": collections.defaultdict(int),
                       "phylum+": collections.defaultdict(int),
                       "class": collections.defaultdict(int),
                       "class+": collections.defaultdict(int),
                       "order": collections.defaultdict(int),
                       "order+": collections.defaultdict(int),
                       "family": collections.defaultdict(int),
                       "family+": collections.defaultdict(int),
                       "genus": collections.defaultdict(int),
                       "genus+": collections.defaultdict(int),
                       "species": collections.defaultdict(int),
                       "species+": collections.defaultdict(int)}

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                taxa_counts["domain"][lca.domain] += 1
            else:
                c.domain_unmapped += 1
                unmapped["domain"] += 1
            if lca.kingdom != "NA":
                taxa_counts["kingdom"][lca.kingdom] += 1
            else:
                c.kingdom_unmapped += 1
                unmapped["kingdom"] += 1
            if lca.kingdom_plus != "NA":
                taxa_counts["kingdom+"][lca.kingdom_plus] += 1
            else:
                c.kingdom_plus_unmapped += 1
                unmapped["kingdom+"] += 1
            if lca.phylum != "NA":
                taxa_counts["phylum"][lca.phylum] += 1
            else:
                c.phylum_unmapped += 1
                unmapped["phylum"] += 1
            if lca.phylum_plus != "NA":
                taxa_counts["phylum+"][lca.phylum_plus] += 1
            else:
                c.phylum_plus_unmapped += 1
                unmapped["phylum+"] += 1
            if lca._class != "NA":
                taxa_counts["class"][lca._class] += 1
            else:
                c.class_unmapped += 1
                unmapped["class"] += 1
            if lca._class_plus != "NA":
                taxa_counts["class+"][lca._class_plus] += 1
            else:
                c.class_plus_unmapped += 1
                unmapped["class+"] += 1
            if lca.order != "NA":
                taxa_counts["order"][lca.order] += 1
            else:
                c.order_unmapped += 1
                unmapped["order"] += 1
            if lca.order_plus != "NA":
                taxa_counts["order+"][lca.order_plus] += 1
            else:
                c.order_plus_unmapped += 1
                unmapped["order+"] += 1
            if lca.family != "NA":
                taxa_counts["family"][lca.family] += 1
            else:
                c.family_unmapped += 1
                unmapped["family"] += 1
            if lca.family_plus != "NA":
                taxa_counts["family+"][lca.family_plus] += 1
            else:
                c.family_plus_unmapped += 1
                unmapped["family+"] += 1
            if lca.genus != "NA":
                taxa_counts["genus"][lca.genus] += 1
            else:
                c.genus_unmapped += 1
                unmapped["genus"] += 1
            if lca.genus_plus != "NA":
                taxa_counts["genus+"][lca.genus_plus] += 1
            else:
                c.genus_plus_unmapped += 1
                unmapped["genus+"] += 1
            if lca.species != "NA":
                taxa_counts["species"][lca.species] += 1
            else:
                c.species_unmapped += 1
                unmapped["species"] += 1
            if lca.species_plus != "NA":
                taxa_counts["species+"][lca.species_plus] += 1
            else:
                c.species_plus_unmapped += 1
                unmapped["species+"] += 1

        options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n")
        for level, taxa_count in sorted(taxa_counts.items()):
            total_level = total - unmapped[level]
            for taxa, count in sorted(taxa_count.items()):
                options.stdout.write("\t".join([
                    level, taxa,
                    str(count),
                    "{:.8}".format(float(count) / total_level),
                    "{:.8}".format(
                        float(count) / (float(total_level) / 1000000))]) +
                    "\n")

        E.info(c)

    elif options.summarise == "individual":
        # each read is output with its respective
        # taxon assignments
        options.stdout.write("\t".join([
            "id", "domain",
            "kingdom", "kingdom+",
            "phylum", "phylum+",
            "class", "class+",
            "order", "order+",
            "family", "family+",
            "genus", "genus+",
            "species", "species+"]) + "\n")
        for lca in LCA.iterate(options.stdin):
            options.stdout.write("\t".join([
                lca.identifier, lca.domain,
                lca.kingdom, lca.kingdom_plus,
                lca.phylum, lca.phylum_plus,
                lca._class, lca._class_plus,
                lca.order, lca.order_plus,
                lca.family, lca.family_plus,
                lca.genus, lca.genus_plus,
                lca.species, lca.species_plus]) + "\n")

    # write footer and output benchmark information.
    E.Stop()
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and
    an iterator over gtf annotations.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    # note: the "u" typecode stands in for the python 2 "c" typecode;
    # each element holds a one-character annotation code.
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("u").itemsize))
    # AString.AString( "a").itemsize ))

    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        annotations[contig] = array.array("u", default_code * size)

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.openOutputFile("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")
    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" %
                   gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig],
                        intervals,
                        is_positive,
                        code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs
                     if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs
                   if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig],
                        UTR5,
                        is_positive,
                        "u")
            addIntrons(annotations[contig],
                       UTR5,
                       is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig],
                        UTR3,
                        is_positive,
                        "v")
            addIntrons(annotations[contig],
                       UTR3,
                       is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"],
                   is_positive)

            # add introns between CDS
            addIntrons(annotations[contig],
                       cds,
                       is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write(
                    "%s\t%s\t%i\t%i\t%s\t%s\t%s\n" %
                    (contig, out_positive,
                     end, start,
                     c.frame,
                     c.gene_id, c.transcript_id))
                end = ender(c)
def createGOFromGeneOntology(infile, outfile):
    """get GO assignments from Geneontology.org

    GO terms are mapped to ensembl gene names via uniprot identifiers.

    Configuration
    -------------
    geneontology_file
       Filename on geneontology database, e.g.,
       gene_association.goa_human.gz
    database_name
       Pipeline database name

    Arguments
    ---------
    infile : string
        Unused
    outfile : string
        Output filename
    """

    filename = os.path.join(os.path.dirname(outfile), "geneontology.goa.gz")
    if not os.path.exists(filename):
        statement = '''
        wget -O %(filename)s
        http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/%(go_geneontology_file)s?rev=HEAD
        '''
        P.run()

    # see http://www.geneontology.org/gene-associations/readme/goa.README
    Data = collections.namedtuple(
        "Data",
        "db db_object_id db_object_symbol qualifier goid dbreference "
        "evidence with_id aspect db_object_name synonym db_object_type "
        "taxon_id date assigned_by annotation_extension "
        "gene_product_form_id")

    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()
    map_uniprot2ensembl = dict(
        cc.execute("SELECT DISTINCT gene_name, gene_id "
                   "FROM transcript_info").fetchall())
    map_goid2description = dict(
        cc.execute("SELECT DISTINCT go_id, description "
                   "FROM go_assignments").fetchall())

    aspect2name = {"P": "biol_process",
                   "F": "mol_function",
                   "C": "cell_location"}

    c = E.Counter()
    found_uniprot, found_genes, notfound_uniprot = set(), set(), set()
    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")
    for line in IOTools.openFile(filename):
        if line.startswith("!"):
            continue
        c.input += 1
        data = Data._make(line[:-1].split("\t"))

        if data.db_object_symbol in map_uniprot2ensembl:
            gene_id = map_uniprot2ensembl[data.db_object_symbol]
            found_uniprot.add(data.db_object_symbol)
            found_genes.add(gene_id)
            outf.write("%s\t%s\t%s\t%s\t%s\n" %
                       (aspect2name[data.aspect],
                        gene_id,
                        data.goid,
                        map_goid2description.get(data.goid, ""),
                        data.evidence))
            c.output += 1
        else:
            c.notfound += 1
            notfound_uniprot.add(data.db_object_symbol)

    c.found_genes = len(found_genes)
    c.found_uniprot = len(found_uniprot)
    c.notfound_uniprot = len(notfound_uniprot)

    E.info("%s" % str(c))
    E.info("not found=%s" % str(notfound_uniprot))

    outf.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome [default=%default].")

    parser.add_option(
        "--remove-regex", dest="remove_regex", type="string",
        help="regular expression of contigs to remove [default=None].")

    parser.add_option(
        "-e", "--gff-file", dest="gff_file", type="string",
        help="gff file to use for getting contig sizes.")

    parser.add_option(
        "-f", "--fixed-width-windows", dest="fixed_width_windows",
        type="string",
        help="fixed width windows. Supply the window size as a "
        "parameter. Optionally supply an offset.")

    parser.set_defaults(
        genome_file=None,
        remove_regex=None,
        fixed_width_windows=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.remove_regex:
        remove_regex = re.compile(options.remove_regex)
    else:
        remove_regex = None

    if options.fixed_width_windows:
        v = list(map(int, options.fixed_width_windows.split(",")))
        if len(v) == 2:
            window_size, window_increment = v
        elif len(v) == 1:
            window_size, window_increment = v[0], v[0]
        else:
            raise ValueError(
                "could not parse window size '%s': should be size[,increment]"
                % options.fixed_width_windows)

    map_contig2size = None

    if options.gff_file:
        infile = open(options.gff_file, "r")
        gff = GTF.readFromFile(infile)
        infile.close()
        map_contig2size = {}
        for g in gff:
            try:
                map_contig2size[g.mName] = max(map_contig2size[g.mName],
                                               g.end)
            except KeyError:
                map_contig2size[g.mName] = g.end
    else:
        gff = None

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        map_contig2size = fasta.getContigSizes(with_synonyms=False)
    else:
        fasta = None

    if map_contig2size is None:
        raise ValueError("no source of contig sizes supplied")

    # do sth
    counter = E.Counter()

    for contig, size in map_contig2size.items():
        size = int(size)
        counter.input += 1

        if remove_regex and remove_regex.search(contig):
            counter.skipped += 1
            continue

        if options.fixed_width_windows:
            for x in range(0, size, window_increment):
                if x + window_size > size:
                    continue
                options.stdout.write(
                    "%s\t%i\t%i\n" % (contig, x, min(size, x + window_size)))
                counter.windows += 1
        else:
            options.stdout.write("%s\t%i\t%i\n" % (contig, 0, size))
            counter.windows += 1

        counter.output += 1

    E.info(str(counter))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=sys.argv):

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("igv",),
                      help="method to create plots with [%default]")

    parser.add_option("-d", "--snapshot-dir", dest="snapshotdir",
                      type="string",
                      help="directory to save snapshots in [%default]")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("png", "eps", "svg"),
                      help="output file format [%default]")

    parser.add_option("-o", "--host", dest="host", type="string",
                      help="host that IGV is running on [%default]")

    parser.add_option("-p", "--port", dest="port", type="int",
                      help="port that IGV listens at [%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend each interval by a number of bases "
                      "[%default]")

    parser.add_option("-x", "--expand", dest="expand", type="float",
                      help="expand each region by a certain factor "
                      "[%default]")

    parser.set_defaults(
        method="igv",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    E.info("connecting to session on %s:%s" % (options.host, options.port))

    E.info("saving images in %s" % options.snapshotdir)
    igv = IGV.IGV(host=options.host,
                  port=options.port,
                  snapshot_dir=os.path.abspath(options.snapshotdir))

    c = E.Counter()
    for bed in Bed.iterator(options.stdin):

        c.input += 1

        # IGV can not deal with white-space in filenames
        name = re.sub(r"\s", "_", bed.name)

        E.info("going to %s:%i-%i for %s" %
               (bed.contig, bed.start, bed.end, name))

        start, end = bed.start, bed.end
        extend = options.extend
        if options.expand:
            d = end - start
            extend = max(extend, (options.expand * d - d) // 2)

        start -= extend
        end += extend

        igv.go("%s:%i-%i" % (bed.contig, start, end))

        fn = "%s.%s" % (name, options.format)
        E.info("writing snapshot to '%s'" % fn)
        igv.save(fn)

        c.snapshots += 1

    E.info(c)
    E.Stop()
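
# Example invocation (sketch; the script and file names are hypothetical,
# and an IGV instance must already be listening on the given host/port):
#
#   python bed2igv_snapshots.py --host=127.0.0.1 --port=61111 \
#       --extend=1000 --format=png < regions.bed
#
# takes one snapshot per BED interval, named after the interval's name
# column.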
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--min-overlap", dest="min_overlap",
                      type="int",
                      help="minimum overlap")

    parser.add_option(
        "-w", "--pattern-window", dest="pattern_window", type="string",
        help="regular expression to extract window coordinates from "
        "test id [%default]")

    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="invert direction of fold change [%default]")

    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfiles = IOTools.FilePool(options.output_filename_pattern)

    if options.invert:
        test_f = lambda l2fold: l2fold < 0
    else:
        test_f = lambda l2fold: l2fold > 0

    def read():
        rx_window = re.compile(options.pattern_window)
        # filter any of the DESeq/EdgeR messages that end up at the top of
        # the output file
        for data in IOTools.iterate(options.stdin):
            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = map(int, (start, end))

            yield DATA._make((
                contig, start, end,
                data.treatment_name,
                float(data.treatment_mean),
                float(data.treatment_std),
                data.control_name,
                float(data.control_mean),
                float(data.control_std),
                float(data.pvalue),
                float(data.qvalue),
                float(data.l2fold),
                float(data.fold),
                int(data.significant),
                data.status,
                0))

    def grouper(data, distance=10):

        last = next(data)
        entries = [last]

        while 1:
            d = next(data, None)
            if d is None:
                break

            if d.contig == last.contig and d.start < last.start:
                raise ValueError("error not sorted by start")

            if ((d.contig != last.contig) or
                    (d.start - last.end > distance) or
                    (d.status != last.status) or
                    (d.significant != last.significant) or
                    (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []

            entries.append(d)
            last = d

        yield entries

    counter = E.Counter()

    options.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    for group in grouper(iter(all_data), distance=options.min_overlap):
        start, end = group[0].start, group[-1].end
        assert start < end, 'start > end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]

        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make((
            g.contig, start, end,
            g.treatment_name,
            sum([x.treatment_mean for x in group]) / n,
            max([x.treatment_std for x in group]),
            g.control_name,
            sum([x.control_mean for x in group]) / n,
            max([x.control_std for x in group]),
            max([x.pvalue for x in group]),
            max([x.qvalue for x in group]),
            l2fold,
            fold,
            g.significant,
            g.status,
            int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)
        if g.significant:
            if test_f(g.l2fold):
                # treatment lower methylation than control
                outfiles.write(
                    g.treatment_name,
                    "%s\t%i\t%i\t%s\t%f\n" % (
                        g.contig, g.start, g.end,
                        g.treatment_name,
                        sum([x.treatment_mean for x in group]) / n))
            else:
                outfiles.write(
                    g.control_name,
                    "%s\t%i\t%i\t%s\t%f\n" % (
                        g.contig, g.start, g.end,
                        g.control_name,
                        sum([x.control_mean for x in group]) / n))

        options.stdout.write("\t".join(map(str, outdata)) + "\n")
        counter.output += 1

    # create empty files for samples without significant windows
    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('reconcile', 'merge'),
                      help="method to apply [default=%default].")

    parser.add_option("-c", "--chop", dest="chop", action="store_true",
                      help="whether or not to trim last character of "
                      "sequence name. For example sometimes ids in the first "
                      "file in the pair will end with \\1 and the second "
                      "with \\2. If --chop is not specified "
                      "then the results will be wrong [default=%default].")

    parser.add_option("-u", "--unpaired", dest="unpaired",
                      action="store_true",
                      help="whether or not to write out unpaired reads "
                      "to a separate file")

    parser.add_option("-o", "--output-pattern", dest="output_pattern",
                      type="string",
                      help="pattern for output files [default=%default].")

    parser.set_defaults(
        method="reconcile",
        chop=False,
        unpaired=False,
        output_pattern="%s.fastq.gz",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()

    if options.method == "reconcile":

        def getIds(infile):
            '''return ids in infile.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = l[0].split()[0]
                # decide if to chop read number off
                if options.chop:
                    yield r[:-1]
                else:
                    yield r

        def write(outfile, infile, take, unpaired_file=None):
            '''filter fastq files with ids in take.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = l[0].split()[0]
                if options.chop:
                    r = r[:-1]
                if r not in take:
                    if unpaired_file is None:
                        continue
                    else:
                        unpaired_file.write("\n".join(l) + "\n")
                else:
                    outfile.write("\n".join(l) + "\n")

        E.info("reading first in pair")
        inf1 = IOTools.openFile(fn1)
        ids1 = set(getIds(inf1))

        E.info("reading second in pair")
        inf2 = IOTools.openFile(fn2)
        ids2 = set(getIds(inf2))

        take = ids1.intersection(ids2)

        E.info("first pair: %i reads, second pair: %i reads, "
               "shared: %i reads" %
               (len(ids1), len(ids2), len(take)))

        if options.unpaired:
            unpaired_filename = IOTools.openFile(
                options.output_pattern % "unpaired", "w")
        else:
            unpaired_filename = None

        with IOTools.openFile(options.output_pattern % "1", "w") as outf:
            inf = IOTools.openFile(fn1)
            E.info("writing first in pair")
            write(outf, inf, take, unpaired_filename)

        with IOTools.openFile(options.output_pattern % "2", "w") as outf:
            inf = IOTools.openFile(fn2)
            E.info("writing second in pair")
            write(outf, inf, take, unpaired_filename)

        if options.unpaired:
            unpaired_filename.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
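
# Example invocation (sketch; the script and file names are hypothetical):
#
#   python fastqs2fastqs.py --method=reconcile --unpaired \
#       --output-pattern="reconciled.%s.fastq.gz" \
#       in.1.fastq.gz in.2.fastq.gz
#
# writes reconciled.1.fastq.gz and reconciled.2.fastq.gz containing only
# read pairs present in both inputs, plus reconciled.unpaired.fastq.gz
# for the rest.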
def findTATABox(infiles, outfile):
    '''find TATA box in promotors. There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | python %(scriptsdir)s/bed2fasta.py
       --use-strand
       --genome=%(genome_dir)s/%(genome)s
       --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''
    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    match_profile = 'minFP_good.prf'
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
    %(match_executable)s
    %(match_matrix)s
    %(outfile)s.fasta
    %(outfile)s.match
    %(match_profile)s -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            r"(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity "
        "sequence")

    def _grouper(infile):
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                    r = []
                pid = re.match(
                    r"Inspecting sequence ID\s+(\S+)", line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue

            transfac_id, v, core_similarity, matrix_similarity, sequence = \
                [x.strip() for x in line[:-1].split("|")]
            pos, strand = re.match(r"(\d+) \((\S)\)", v).groups()
            r.append(MATCH._make((pid, transfac_id, int(pos), strand,
                                  float(core_similarity),
                                  float(matrix_similarity),
                                  sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand",
                          "start", "end",
                          "relative_start", "relative_end",
                          "transfac_id",
                          "core_similarity",
                          "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(IOTools.openFile(
            outfile + ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(map(str, (
                transcript_id, strand,
                genome_start, genome_end,
                relative_start, relative_end,
                match.transfac_id,
                match.core_similarity,
                match.matrix_similarity,
                match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(map(str, (
                contig, genome_start, genome_end,
                transcript_id, strand,
                match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
def annotateGREATDomains(iterator, fasta, options):
    """build great domains

    extend from TSS a basal region.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    counter = E.Counter()

    upstream, downstream = options.upstream, options.downstream
    radius = options.radius
    outfile = options.stdout

    regions = []
    ####################################################################
    # define basal regions for each gene
    # take all basal regions per transcript and merge them
    # Thus, the basal region of a gene might be larger than the sum
    # of options.upstream + options.downstream
    for gene in gene_iterator:
        counter.genes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []

        # collect every basal region per transcript
        for transcript in gene:
            counter.transcripts += 1
            mi, ma = min([x.start for x in transcript]), \
                max([x.end for x in transcript])
            # add range to both sides of tss
            if is_negative_strand:
                interval = ma - options.downstream, ma + options.upstream
            else:
                interval = mi - options.upstream, mi + options.downstream

            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        # take first/last entry
        start, end = min(x[0] for x in regulons), max(x[1] for x in regulons)

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "greatdomain"
        gtf.start, gtf.end = start, end
        regions.append(gtf)

    regions.sort(key=lambda x: (x.contig, x.start))

    outf = IOTools.openFile("test.gff", "w")
    for x in regions:
        outf.write(str(x) + "\n")
    outf.close()

    ####################################################################
    # extend basal regions
    regions.sort(key=lambda x: (x.contig, x.start))

    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    last_end = 0
    reset = False

    for region_id, group in enumerate(groups):

        # collect basal intervals in group
        intervals = [(x.start, x.end) for x in group]

        def overlapsBasalRegion(pos):
            for start, end in intervals:
                if start == pos or end == pos:
                    continue
                if start <= pos < end:
                    return True
                if start > pos:
                    return False
            return False

        # deal with boundary cases - end of contig
        if region_id < len(groups) - 1:
            nxt = groups[region_id + 1]
            if nxt[0].contig == group[0].contig:
                next_start = min([x.start for x in nxt])
            else:
                next_start = fasta.getLength(group[0].contig)
                reset = True
        else:
            next_start = fasta.getLength(group[0].contig)
            reset = True

        # last_end = basal extension of previous group
        # next_start = basal extension of next group

        # extend region to previous/next group: always extend
        # downstream, but upstream only extend if the basal region of
        # an interval is not already overlapping another basal region
        # within the group
        save_end = 0
        for gtf in group:
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not overlapsBasalRegion(gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not overlapsBasalRegion(gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        if reset:
            last_end = 0
            reset = False
        else:
            last_end = save_end

    E.info("%s" % str(counter))
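
# Worked example for the extension rule above (hypothetical numbers:
# upstream=5000, downstream=1000, radius=1000000): a + strand gene with
# its TSS at 1,050,000 first gets the basal region
# (1,045,000, 1,051,000); that region is then extended by up to the
# radius on each side, but the extension stops early at the basal
# region of the previous group (last_end) and of the next group
# (next_start), so neighbouring domains never cross each other's basal
# regions.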
def buildPolyphenInput(infiles, outfile):
    '''build polyphen input file.

    SNPs across all species are aggregated into a single file to
    avoid multiple submissions for the same variant.

    Mapping to Uniprot ids was not successful - 40% of the SNPs would
    have been lost. Hence I map to ensembl protein identifiers. Note
    that the sequence file is then to be submitted to POLYPHEN as
    well.

    Note that this method outputs 1-based coordinates for polyphen,
    while the coordinates in the .map file are still 0-based.

    SNPs are assigned a snp_id and a locus_id. The snp_id refers
    to the SNP within a peptide sequence while the locus_id refers
    to the genomic location. If there are alternative transcripts
    overlapping a SNP, the same SNP will get two snp_ids, but the
    same locus_id. As the peptide background might be different
    for the same SNP depending on the transcript, its effect
    needs to be predicted twice.
    '''

    statement = '''SELECT
        transcript_id,
        cds_start,
        cds_end,
        orig_codons,
        variant_codons,
        orig_na,
        variant_na,
        contig,
        snp_position
    FROM %(table)s_cds
    WHERE variant_code = '=' AND code = 'N'
    '''

    dbhandle = connect()
    cc = dbhandle.cursor()

    infiles.sort()

    # ensembl mapping
    map_transcript2id = dict(cc.execute(
        "SELECT transcript_id, protein_id FROM annotations.transcript_info "
        "WHERE protein_id IS NOT NULL").fetchall())

    total_counts = E.Counter()
    notfound, found = set(), set()

    outf_map = open(outfile + ".map", "w")
    outf_map.write("snp_id\ttrack\ttranscript_id\tprotein_id\tprotein_pos\t"
                   "locus_id\tcontig\tpos\tphase\n")

    outf = open(outfile, "w")

    snps = {}
    locus_ids = {}

    for infile in infiles:

        table = P.toTable(infile)
        track = table[:-len("_effects")]
        print(statement % locals())
        cc.execute(statement % locals())

        counts = E.Counter()

        snp_id = 0
        for (transcript_id, cds_start, cds_end, orig_codons,
             variant_codons, orig_na, variant_na, contig, pos) in cc:

            counts.input += 1

            if transcript_id not in map_transcript2id:
                notfound.add(transcript_id)
                counts.not_found += 1
                continue

            if "," in variant_codons:
                counts.heterozygous += 1
                continue

            for phase in range(0, 3):
                if orig_na[phase].lower() != variant_na[phase].lower():
                    break

            pid = map_transcript2id[transcript_id]

            # one-based coordinates
            peptide_pos = int(math.floor(cds_start / 3.0)) + 1
            key = "%s-%i-%s" % (pid, peptide_pos, variant_codons)

            if key in snps:
                snp_id = snps[key]
            else:
                snp_id = len(snps)
                snps[key] = snp_id
                outf.write("snp%010i\t%s\t%i\t%s\t%s\n" %
                           (snp_id,
                            pid,
                            peptide_pos,
                            orig_codons,
                            variant_codons))
                counts.output += 1

            locus_key = "%s-%i-%s" % (contig, pos, variant_codons)
            if locus_key not in locus_ids:
                locus_ids[locus_key] = len(locus_ids)

            # use 0-based coordinates throughout, including peptide pos
            outf_map.write(
                "snp%010i\t%s\t%s\t%s\t%i\tloc%010i\t%s\t%i\t%i\n" %
                (snp_id,
                 track,
                 transcript_id,
                 pid,
                 peptide_pos - 1,
                 locus_ids[locus_key],
                 contig,
                 pos,
                 phase))

            found.add(transcript_id)

        total_counts += counts

        E.info("%s: %s" % (table, str(counts)))

    outf.close()
    outf_map.close()

    E.info("%s: transcripts: %s found, %i not found" %
           (table, len(found), len(notfound)))

    E.info("total=%s, snp_ids=%i, locus_ids=%i" %
           (str(total_counts), len(snps), len(locus_ids)))
    if notfound:
        E.warn("%i transcripts had SNPs that were ignored because "
               "there was no uniprot accession" % len(notfound))
        E.warn("notfound: %s" % ",".join(notfound))

    statement = '''sort -k2,2 -k3,3n %(outfile)s > %(outfile)s.tmp;
    mv %(outfile)s.tmp %(outfile)s'''
    P.run()
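
# Example of the snp_id / locus_id distinction described above
# (hypothetical values): a SNP at chr1:1000 covered by two overlapping
# transcripts of the same gene produces two .map lines that share one
# "loc..." identifier (same genomic locus) but carry two different
# "snp..." identifiers, because the peptide context differs per
# transcript and each needs its own polyphen prediction.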
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--exclusive-overlap", dest="exclusive",
        action="store_true",
        help="intervals reported will be merged across the "
        "positive set and do not overlap any interval in any of the "
        "other sets [default=%default].")

    parser.add_option(
        "-p", "--pattern-identifier", dest="pattern_id", type="string",
        help="pattern to convert a filename "
        "to an id [default=%default].")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("merged-combinations",
                 "unmerged-combinations"),
        help="method to perform [default=%default]")

    parser.set_defaults(
        pattern_id="(.*).bed.gz",
        exclusive=False,
        method="merged-combinations",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    tags, bedfiles = [], []
    for infile in args:
        bedfiles.append(pysam.Tabixfile(infile, "r"))
        tags.append(re.search(options.pattern_id, infile).groups()[0])

    indices = list(range(len(bedfiles)))
    is_exclusive = options.exclusive

    if options.method == "merged-combinations":

        if is_exclusive:
            start = 1
        else:
            start = 2

        options.stdout.write("combination\twithout\tcounts\n")

        for ncombinants in range(start, len(bedfiles) + 1):
            for combination in itertools.combinations(indices, ncombinants):
                other = [x for x in indices if x not in combination]
                tag = ":".join([tags[x] for x in combination])
                E.debug("combination %s started" % tag)
                E.debug("other: %s" % ":".join([tags[x] for x in other]))
                other_bed = [bedfiles[x] for x in other]
                outf = IOTools.openFile(
                    E.getOutputFile(tag), "w", create_dir=True)
                c = E.Counter()
                for contig, start, end in combineMergedIntervals(
                        [bedfiles[x] for x in combination]):
                    c.found += 1
                    if is_exclusive and isContainedInOne(contig,
                                                         start,
                                                         end,
                                                         other_bed):
                        c.removed += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%i\t%i\n" % (contig, start, end))

                outf.close()
                E.info("combination %s finished: %s" % (tag, c))

                options.stdout.write("%s\t%s\t%i\n" % (
                    ":".join([tags[x] for x in combination]),
                    ":".join([tags[x] for x in other]),
                    c.output))

    elif options.method == "unmerged-combinations":
        options.stdout.write("track\tcombination\twithout\tcounts\n")

        for foreground in indices:

            background = [x for x in indices if x != foreground]
            for ncombinants in range(0, len(background) + 1):
                for combination in itertools.combinations(background,
                                                          ncombinants):
                    other = [x for x in background if x not in combination]
                    combination_bed = [bedfiles[x] for x in combination]
                    other_bed = [bedfiles[x] for x in other]
                    tag = ":".join([tags[foreground]] +
                                   [tags[x] for x in combination])

                    E.debug("fg=%i, combination=%s, other=%s" %
                            (foreground, combination, other))
                    E.debug("combination %s started" % tag)
                    E.debug("other: %s" %
                            ":".join([tags[x] for x in other]))

                    outf = IOTools.openFile(
                        E.getOutputFile(tag), "w", create_dir=True)
                    c = E.Counter()
                    for bed in combineUnmergedIntervals(
                            bedfiles[foreground],
                            combination_bed):
                        c.found += 1
                        if is_exclusive and isContainedInOne(bed.contig,
                                                             bed.start,
                                                             bed.end,
                                                             other_bed):
                            c.removed += 1
                            continue
                        c.output += 1
                        outf.write("%s\n" % str(bed))

                    outf.close()
                    E.info("combination %s finished: %s" % (tag, c))

                    options.stdout.write("%s\t%s\t%s\t%i\n" % (
                        tags[foreground],
                        ":".join([tags[x] for x in combination]),
                        ":".join([tags[x] for x in other]),
                        c.output))

    E.Stop()
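# A small sketch (with hypothetical track names) of what the
# "merged-combinations" method enumerates without --exclusive-overlap: all
# subsets of size >= 2, each paired with the remaining tracks as the
# "without" column of the summary table.
import itertools

tags = ["liver", "kidney", "heart"]      # hypothetical track names
indices = list(range(len(tags)))
for ncombinants in range(2, len(tags) + 1):
    for combination in itertools.combinations(indices, ncombinants):
        other = [x for x in indices if x not in combination]
        print("%s\t%s" % (":".join(tags[x] for x in combination),
                          ":".join(tags[x] for x in other)))
# liver:kidney        heart
# liver:heart         kidney
# kidney:heart        liver
# liver:kidney:heart  (empty "without" set)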
def getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=False):
    '''get refseq gene set from UCSC database and save as :term:`gtf`
    formatted file.

    Matches to ``chr_random`` are ignored (as ENSEMBL does).

    Note that this approach does not work as a gene set, as refseq
    maps are not real gene builds and unalignable parts cause
    differences that are not reconcilable.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`gtf` format. The filename
       aims to be close to the ENSEMBL gtf format.
    remove_duplicates : bool
       If True, duplicate mappings are removed.

    '''

    duplicates = set()

    if remove_duplicates:
        cc = dbhandle.cursor()
        cc.execute("""SELECT name, COUNT(*) AS c FROM refGene
        WHERE chrom NOT LIKE '%_random'
        GROUP BY name HAVING c > 1""")
        duplicates = set([x[0] for x in cc.fetchall()])
        E.info("removing %i duplicates" % len(duplicates))

    # these are forward strand coordinates
    statement = '''
    SELECT gene.name, link.geneName, link.name, gene.name2, product,
        protAcc, chrom, strand, cdsStart, cdsEnd,
        exonCount, exonStarts, exonEnds, exonFrames
    FROM refGene as gene, refLink as link
    WHERE gene.name = link.mrnaAcc
          AND chrom NOT LIKE '%_random'
    ORDER by chrom, cdsStart
    '''

    outf = IOTools.openFile(outfile, "w")

    cc = dbhandle.cursor()
    cc.execute(statement)

    SQLResult = collections.namedtuple(
        'Result',
        '''transcript_id, gene_id, gene_name, gene_id2, description,
        protein_id, contig, strand, start, end,
        nexons, starts, ends, frames''')

    counts = E.Counter()
    counts.duplicates = len(duplicates)

    for r in map(SQLResult._make, cc.fetchall()):

        if r.transcript_id in duplicates:
            continue

        # exon coordinates are comma-separated with a trailing comma;
        # build explicit lists so that they can be reversed below
        starts = list(map(int, r.starts.split(",")[:-1]))
        ends = list(map(int, r.ends.split(",")[:-1]))
        frames = list(map(int, r.frames.split(",")[:-1]))

        gtf = GTF.Entry()
        gtf.contig = r.contig
        gtf.source = "protein_coding"
        gtf.strand = r.strand
        gtf.gene_id = r.gene_id
        gtf.transcript_id = r.transcript_id
        gtf.addAttribute("protein_id", r.protein_id)
        gtf.addAttribute("transcript_name", r.transcript_id)
        gtf.addAttribute("gene_name", r.gene_name)

        assert len(starts) == len(ends) == len(frames)

        if gtf.strand == "-":
            starts.reverse()
            ends.reverse()
            frames.reverse()

        counts.transcripts += 1
        i = 0
        for start, end, frame in zip(starts, ends, frames):
            gtf.feature = "exon"
            counts.exons += 1
            i += 1
            gtf.addAttribute("exon_number", i)
            # frame of utr exons is set to -1 in UCSC
            gtf.start, gtf.end, gtf.frame = start, end, "."
            outf.write("%s\n" % str(gtf))

            cds_start, cds_end = max(r.start, start), min(r.end, end)

            if cds_start >= cds_end:
                # UTR exons have no CDS
                # do not expect any in UCSC
                continue

            gtf.feature = "CDS"
            # invert the frame
            frame = (3 - frame % 3) % 3
            gtf.start, gtf.end, gtf.frame = cds_start, cds_end, frame
            outf.write("%s\n" % str(gtf))

    outf.close()

    E.info("%s" % str(counts))
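# Worked example (illustrative) of the frame inversion above: UCSC exonFrames
# record the codon position of an exon's first base, while the GTF frame
# column is the number of bases to skip to reach the first complete codon.
# The two are related by frame_gtf = (3 - frame_ucsc % 3) % 3.
for ucsc_frame, expected_gtf_frame in ((0, 0), (1, 2), (2, 1)):
    assert (3 - ucsc_frame % 3) % 3 == expected_gtf_frame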
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--source", dest="source_directory",
                      type="string",
                      help="the directory in which data "
                      "files are held [%default]")

    parser.add_option("-d", "--dest", dest="dest_directory",
                      type="string",
                      help="the directory in which links "
                      "are created [%default]")

    parser.set_defaults(source_directory=None,
                        dest_directory=".")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read a map of input files to links with sanity checks
    map_filename2link = {}
    links = set()
    for line in options.stdin:
        if line.startswith("#"):
            continue
        # ignore header
        if line.startswith("source"):
            continue
        filename, link = line[:-1].split()[:2]
        if filename in map_filename2link:
            raise ValueError("duplicate filename '%s' " % filename)
        if link in links:
            raise ValueError("duplicate link '%s' " % link)
        map_filename2link[filename] = link
        links.add(link)

    counter = E.Counter()
    counter.input = len(map_filename2link)

    def _createLink(src, dest, counter):
        src = os.path.abspath(src)
        dest = os.path.abspath(os.path.join(options.dest_directory, dest))
        if os.path.exists(dest):
            E.warn("existing symlink %s" % dest)
            counter.link_exists += 1
        elif not os.path.exists(src):
            counter.file_not_found += 1
            E.warn("did not find %s" % src)
        else:
            try:
                os.symlink(src, dest)
                counter.success += 1
            except OSError:
                # silently skip links that cannot be created
                pass

    if not options.source_directory:
        # no source directory given, filenames must have complete path
        for filename, link in list(map_filename2link.items()):
            _createLink(filename, link, counter)
    else:
        # walk through directory hierarchy and create links
        # for files matching filenames in map_filename2link
        found = set()
        for dirName, subdirList, fileList in os.walk(
                options.source_directory):
            for f in fileList:
                if f in map_filename2link:
                    if f in found:
                        E.warn("found multiple files with "
                               "the same name %s" % f)
                    else:
                        _createLink(os.path.join(dirName, f),
                                    map_filename2link[f], counter)
                        found.add(f)
                else:
                    E.info("filename %s not in map" % f)

        notfound = set(map_filename2link.keys()).difference(found)
        counter.notfound = len(notfound)
        if notfound:
            E.warn("did not find %i files: %s" %
                   (len(notfound), str(notfound)))

    E.info(counter)

    # write footer and output benchmark information
    E.Stop()
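# A minimal sketch of the parsing performed on stdin above. The sample lines
# (filenames and link names) are hypothetical; lines starting with "#" or
# "source" are skipped as comments/header and only the first two
# whitespace-separated columns are used.
sample = ["# comment\n",
          "source\tlink\n",
          "/data/a.bam\ta.bam\n",
          "/data/b.bam\tb.bam\n"]
map_filename2link = {}
for line in sample:
    if line.startswith("#") or line.startswith("source"):
        continue
    filename, link = line[:-1].split()[:2]
    map_filename2link[filename] = link
assert map_filename2link == {"/data/a.bam": "a.bam",
                             "/data/b.bam": "b.bam"}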
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-d", "--directory", dest="directory", type="string",
                      help="directory with python scripts/modules to "
                      "update [%default]")

    parser.set_defaults(
        directory=".",
        # using Andreas' repository in order to delay
        # changes to main repository in /ifs/devel/cgat
        basename="/ifs/devel/andreas/cgat/",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # collect a list of python modules
    module_names = [os.path.basename(x)[:-3]
                    for x in glob.glob(options.basename + "CGAT/*.py")]

    pipeline_names = [os.path.basename(x)[:-3]
                      for x in glob.glob(options.basename +
                                         "CGATPipelines/Pipeline*.py")]

    if options.directory == "CGAT":
        files_to_update = \
            glob.glob(os.path.join(options.basename, "scripts/*.py")) +\
            glob.glob(os.path.join(options.basename, "scripts/*.pyx")) +\
            glob.glob(os.path.join(options.basename,
                                   "CGATPipelines/pipeline_*.py")) +\
            glob.glob(os.path.join(options.basename,
                                   "CGATPipelines/Pipeline*.py"))
    else:
        files_to_update = glob.glob(os.path.join(options.directory, "*.py"))

    E.info("updating %i python scripts/modules" % len(files_to_update))

    counter = E.Counter()

    for script in files_to_update:
        counter.input += 1
        E.info("working on %s" % script)

        with open(script) as inf:
            lines = inf.readlines()

        # create a backup copy
        shutil.move(script, script + ".bak")

        outf = open(script, "w")

        updated = False
        for line in lines:
            if re.match("import ", line):
                if " as " in line:
                    try:
                        module, name = re.match(
                            r"import (\S+) as (\S+)\s*$", line).groups()
                    except AttributeError as msg:
                        raise AttributeError(
                            "parsing error in line '%s': '%s'" %
                            (line[:-1], msg))
                    if module in module_names:
                        line = "import CGAT.%s as %s\n" % (module, name)
                        updated = True
                else:
                    try:
                        modules = re.match("import (.+)", line).groups()[0]
                    except AttributeError as msg:
                        raise AttributeError(
                            "parsing error in line '%s': '%s'" %
                            (line[:-1], msg))
                    modules = [x.strip() for x in modules.split(",")]
                    for module in modules:
                        if module in module_names:
                            outf.write("import CGAT.%s as %s\n" %
                                       (module, module))
                            updated = True
                        elif module in pipeline_names:
                            outf.write("import CGATPipelines.%s as %s\n" %
                                       (module, module))
                            updated = True
                        else:
                            outf.write("import %s\n" % module)
                    continue

            outf.write(line)

        outf.close()

        if updated:
            counter.updated += 1

    E.info("summary: %s" % str(counter))

    # write footer and output benchmark information
    E.Stop()
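# A quick check (illustrative only) of the rewriting rule applied above:
# plain imports of known modules gain the CGAT package prefix and are bound
# back to the original name via "as", so the rest of the script is unchanged.
# The module list below is a hypothetical stand-in for the globbed names.
import re

module_names = ["Bed", "GTF"]            # hypothetical known modules
line = "import Bed\n"
module = re.match(r"import (.+)", line).groups()[0].strip()
if module in module_names:
    line = "import CGAT.%s as %s\n" % (module, module)
assert line == "import CGAT.Bed as Bed\n"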