def runRegexMotifSearch(infiles, outfile):
    '''Run a regular expression motif search on sequences and tabulate counts.

    Thirty case-insensitive patterns are searched: ``DR0``..``DR14`` (the
    half-site, 0-14 arbitrary characters, the half-site again) and
    ``ER0``..``ER14`` (the half-site, the same spacer, the reverse half-site).

    :param infiles: pair ``(controlfile, dbfile)``; both files are handed to
        ``Motifs.countMotifs``.
    :param outfile: name of the tab-separated table to write.  One row per
        pattern: motif counts for db and control, per-sequence counts with
        their percentage of all db/control entries, and the fold ratio.
    :raises P.PipelineError: if ``controlfile`` does not exist.
    '''
    half_site = "[AG]G[GT]T[CG]A"
    reverse_half_site = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    motifs = [
        ("DR%i" % spacer,
         re.compile(half_site + "." * spacer + half_site, re.IGNORECASE))
        for spacer in range(0, 15)]
    motifs.extend(
        ("ER%i" % spacer,
         re.compile(half_site + "." * spacer + reverse_half_site,
                    re.IGNORECASE))
        for spacer in range(0, 15))

    def _count_positions(filename):
        # close the input handle as soon as the scan is done
        handle = IOTools.openFile(filename, "r")
        try:
            return Motifs.countMotifs(handle, motifs)
        finally:
            handle.close()

    db_positions = _count_positions(dbfile)
    control_positions = _count_positions(controlfile)

    db_counts = Motifs.getCounts(db_positions)
    control_counts = Motifs.getCounts(control_positions)
    # Both per-sequence tallies must come from the same helper; the control
    # side previously used getCounts, so seq_control, seq_control_percent
    # and fold compared motif counts against sequence occurrences.
    db_seqcounts = Motifs.getOccurances(db_positions)
    control_seqcounts = Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write(
            "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\tseq_control\tseq_control_percent\tfold\n")

        for name, _pattern in motifs:
            try:
                fold = float(db_seqcounts[name]) * ncontrol / (
                    ndb * control_seqcounts[name])
            except ZeroDivisionError:
                # no db entries or no control hits: report 0 instead of failing
                fold = 0

            outf.write(
                "%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" %
                (name,
                 db_counts[name],
                 control_counts[name],
                 db_seqcounts[name],
                 IOTools.prettyPercent(db_seqcounts[name], ndb),
                 control_seqcounts[name],
                 IOTools.prettyPercent(control_seqcounts[name], ncontrol),
                 fold))
    finally:
        # make sure the table is flushed to disk even if a row fails
        outf.close()
def runRegexMotifSearch(infiles, outfile):
    '''Search sequences with regular expressions and write a count table.

    The search uses 30 case-insensitive patterns.  ``DR<n>`` is the half-site
    motif, ``n`` arbitrary characters and the half-site motif again;
    ``ER<n>`` is the half-site motif, ``n`` arbitrary characters and the
    reverse motif.  ``n`` runs from 0 to 14.

    :param infiles: tuple ``(controlfile, dbfile)``.  Both are opened with
        ``IOTools.openFile`` and scanned by ``Motifs.countMotifs``.
    :param outfile: output filename.  Receives a header line plus one
        tab-separated row per pattern (motif counts, per-sequence counts,
        percentages relative to the number of db/control entries, fold).
    :raises P.PipelineError: when ``controlfile`` is missing.
    '''
    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    motifs = []
    for prefix, second_half in (("DR", motif), ("ER", reverse_motif)):
        for gap in range(0, 15):
            motifs.append(
                ("%s%i" % (prefix, gap),
                 re.compile(motif + "." * gap + second_half, re.IGNORECASE)))

    # scan db first, then control; each handle is closed right after use
    positions = []
    for filename in (dbfile, controlfile):
        inf = IOTools.openFile(filename, "r")
        try:
            positions.append(Motifs.countMotifs(inf, motifs))
        finally:
            inf.close()
    db_positions, control_positions = positions

    db_counts = Motifs.getCounts(db_positions)
    control_counts = Motifs.getCounts(control_positions)
    # Use getOccurances for BOTH sides.  The control side used getCounts,
    # which made the seq_control columns and the fold ratio inconsistent
    # with the db side.
    db_seqcounts = Motifs.getOccurances(db_positions)
    control_seqcounts = Motifs.getOccurances(control_positions)

    ndb = len(db_positions)
    ncontrol = len(control_positions)

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write(
            "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\tseq_control\tseq_control_percent\tfold\n")

        for motif_name, _regex in motifs:
            n_db_seqs = db_seqcounts[motif_name]
            n_control_seqs = control_seqcounts[motif_name]
            try:
                fold = float(n_db_seqs) * ncontrol / (ndb * n_control_seqs)
            except ZeroDivisionError:
                # denominator is zero when ndb or the control count is 0
                fold = 0

            outf.write("%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" %
                       (motif_name,
                        db_counts[motif_name],
                        control_counts[motif_name],
                        n_db_seqs,
                        IOTools.prettyPercent(n_db_seqs, ndb),
                        n_control_seqs,
                        IOTools.prettyPercent(n_control_seqs, ncontrol),
                        fold))
    finally:
        # the handle was previously leaked, leaving flushing to the GC
        outf.close()
def summary(infile, outfile):
    """Compute mapping stats for one track and write them to *outfile*.

    The track name and all input file names are derived from *outfile* by
    stripping the ``.mapped.summary`` suffix; *infile* itself is not read.

    :param infile: unused here.
    :param outfile: name of the summary table to write.  It receives a header
        line and one row with the number of input, merged and mapped
        alignments plus three percentages.
    """

    def _getfiles(filename):
        """Return (track, input psl, merged psl, mapped psl) for *filename*."""
        track = filename[: -len(".mapped.summary")]
        if track.endswith(".merged"):
            xtrack = track[: -len(".merged")]
            finput = "%s.psl.gz" % xtrack
            fmerged = "%s.transcripts.transcripts.psl" % xtrack
        else:
            finput = "%s.psl.gz" % track
            fmerged = finput
        fmapped = "%s.mapped.psl" % track
        return track, finput, fmerged, fmapped

    def countPSL(filename):
        """Count non-comment lines, discounting a 5-line psLayout header."""
        opener = gzip.open if filename.endswith(".gz") else open
        with opener(filename) as inf:
            # only the line prefix is needed to recognise the header
            prefixes = [
                line[:10] for line in inf if not line.startswith("#")]
        # an empty file used to raise IndexError on prefixes[0]
        if prefixes and prefixes[0].startswith("psLayout"):
            return len(prefixes) - 5
        return len(prefixes)

    track, finput, fmerged, fmapped = _getfiles(outfile)

    with open(outfile, "w") as outf:
        outf.write(
            "track\tinput\tmerged\tpmerged\tmapped\tpmapped\tpoutput\n")

        ninput = countPSL(finput)
        # subtract header
        # NOTE(review): countPSL already removes a psLayout header, and for
        # non-merged tracks fmerged is the same file as finput, so this extra
        # "- 5" looks like a double subtraction.  Kept as-is pending
        # confirmation of the intended counts.
        nmerged = countPSL(fmerged) - 5
        nmapped = countPSL(fmapped)

        outf.write(
            "%s\t%i\t%i\t%s\t%i\t%s\t%s\n"
            % (
                track,
                ninput,
                nmerged,
                IOTools.prettyPercent(nmerged, ninput),
                nmapped,
                IOTools.prettyPercent(nmapped, nmerged),
                IOTools.prettyPercent(nmapped, ninput),
            )
        )
def __str__(self):
    """Format the record as a single tab-separated line.

    Fields, in order: sample category count, sample total and their
    percentage; background category count, background total and their
    percentage; the ratio; the p-value; and the probabilities of over- and
    under-representation (the last three in scientific notation).
    """
    template = "%i\t%i\t%s\t%i\t%i\t%s\t%s\t%6.4e\t%6.4e\t%6.4e"

    sample_percent = IOTools.prettyPercent(
        self.mSampleCountsCategory, self.mSampleCountsTotal)
    background_percent = IOTools.prettyPercent(
        self.mBackgroundCountsCategory, self.mBackgroundCountsTotal)

    fields = (
        self.mSampleCountsCategory,
        self.mSampleCountsTotal,
        sample_percent,
        self.mBackgroundCountsCategory,
        self.mBackgroundCountsTotal,
        background_percent,
        IOTools.val2str(self.mRatio),
        self.mPValue,
        self.mProbabilityOverRepresentation,
        self.mProbabilityUnderRepresentation,
    )
    return template % fields
def __str__(self):
    """Return the counts and their percentages as one tab-separated line.

    The first 18 fields are the raw gene, exon and base counts (total,
    overlapping and unique, each for set 1 and set 2).  The remaining 12
    fields are the overlapping and unique counts expressed as a percentage
    of the matching total.
    """
    counts = (
        self.mGenes1, self.mGenes2,
        self.mGenesOverlapping1, self.mGenesOverlapping2,
        self.mGenesUnique1, self.mGenesUnique2,
        self.mExons1, self.mExons2,
        self.mExonsOverlapping1, self.mExonsOverlapping2,
        self.mExonsUnique1, self.mExonsUnique2,
        self.mBases1, self.mBases2,
        self.mBasesOverlapping1, self.mBasesOverlapping2,
        self.mBasesUnique1, self.mBasesUnique2,
    )
    # (part, whole) pairs, in output order
    ratios = (
        (self.mGenesOverlapping1, self.mGenes1),
        (self.mGenesOverlapping2, self.mGenes2),
        (self.mGenesUnique1, self.mGenes1),
        (self.mGenesUnique2, self.mGenes2),
        (self.mExonsOverlapping1, self.mExons1),
        (self.mExonsOverlapping2, self.mExons2),
        (self.mExonsUnique1, self.mExons1),
        (self.mExonsUnique2, self.mExons2),
        (self.mBasesOverlapping1, self.mBases1),
        (self.mBasesOverlapping2, self.mBases2),
        (self.mBasesUnique1, self.mBases1),
        (self.mBasesUnique2, self.mBases2),
    )

    fields = [str(value) for value in counts]
    fields.extend(
        [IOTools.prettyPercent(part, whole) for part, whole in ratios])
    return "\t".join(fields)
def __str__(self):
    """Render every count, then every percentage, separated by tabs.

    Counts come first: for genes, exons and bases in turn, the total, the
    overlapping and the unique count, each for set 1 and then set 2.  They
    are followed by the overlapping and unique counts as a percentage of
    the corresponding total, in the same gene/exon/base order.
    """
    levels = ("Genes", "Exons", "Bases")
    sides = (1, 2)

    def _value(level, kind, side):
        # attributes are named m<Level><Kind><side>, e.g. mGenesUnique2
        return getattr(self, "m%s%s%i" % (level, kind, side))

    count_columns = []
    percent_columns = []
    for level in levels:
        for kind in ("", "Overlapping", "Unique"):
            for side in sides:
                count_columns.append(str(_value(level, kind, side)))

    for level in levels:
        for kind in ("Overlapping", "Unique"):
            for side in sides:
                percent_columns.append(
                    IOTools.prettyPercent(_value(level, kind, side),
                                          _value(level, "", side)))

    return "\t".join(count_columns) + "\t" + "\t".join(percent_columns)
def summary(infile, outfile):
    '''Compute mapping stats and write a one-row summary table.

    All file names are derived from *outfile*: the ``.mapped.summary``
    suffix is stripped to obtain the track, and the input, merged and
    mapped psl file names are built from it.  *infile* is not used.

    :param infile: ignored by this function.
    :param outfile: filename of the table to write (header line plus one
        row: track, input count, merged count and percentage, mapped count
        and two percentages).
    '''

    def _getfiles(filename):
        '''Derive track name and psl filenames from the summary filename.'''
        track = filename[:-len(".mapped.summary")]
        is_merged = track.endswith(".merged")
        if is_merged:
            xtrack = track[:-len(".merged")]
            finput = "%s.psl.gz" % xtrack
            fmerged = "%s.transcripts.transcripts.psl" % xtrack
        else:
            finput = "%s.psl.gz" % track
            fmerged = finput
        return track, finput, fmerged, "%s.mapped.psl" % track

    def countPSL(filename):
        '''Return the number of entries in a (possibly gzipped) psl file.'''
        if filename.endswith(".gz"):
            inf = gzip.open(filename)
        else:
            inf = open(filename)
        try:
            ll = [x[:10] for x in inf if not x.startswith("#")]
        finally:
            # handles used to be left open
            inf.close()

        # guard against files without any non-comment line (was IndexError)
        if not ll:
            return 0
        if ll[0].startswith("psLayout"):
            # the psLayout header occupies five lines
            return len(ll) - 5
        return len(ll)

    track, finput, fmerged, fmapped = _getfiles(outfile)

    outf = open(outfile, "w")
    try:
        outf.write(
            "track\tinput\tmerged\tpmerged\tmapped\tpmapped\tpoutput\n")

        ninput = countPSL(finput)
        # subtract header
        # NOTE(review): countPSL already discounts a psLayout header and, for
        # tracks that are not ".merged", fmerged is finput itself - this
        # additional "- 5" is probably a leftover.  Behaviour is unchanged
        # until the intended value is confirmed.
        nmerged = countPSL(fmerged) - 5
        nmapped = countPSL(fmapped)

        outf.write("%s\t%i\t%i\t%s\t%i\t%s\t%s\n" %
                   (track,
                    ninput,
                    nmerged,
                    IOTools.prettyPercent(nmerged, ninput),
                    nmapped,
                    IOTools.prettyPercent(nmapped, nmerged),
                    IOTools.prettyPercent(nmapped, ninput)))
    finally:
        outf.close()
def _write(outs, text, numerator, denominator, base):
    """Write one tab-separated row to *outs*.

    The row holds *text*, *numerator* (as an integer), *numerator* as a
    percentage of *denominator*, and *base*, followed by a newline.
    """
    row = (
        text,
        numerator,
        IOTools.prettyPercent(numerator, denominator),
        base,
    )
    outs.write('%s\t%i\t%s\t%s\n' % row)
def main():
    """Script entry point: run the GO analysis for one or more gene lists.

    Parses the command line, then depending on the options either

    * converts GO assignments to GO slim assignments (``--go2goslim``),
    * dumps GO assignments from the database (``--filename-dump``), or
    * for every ontology and every foreground gene list, reconciles
      foreground and background, runs ``AnalyseGO``, optionally computes
      FDRs, and writes the foreground, background, overall, results,
      parameters and withgenes sections.

    A summary table with one row per (gene list, ontology) is written to
    ``options.stdout`` at the end.  The first two modes, and
    ``--get-genes``, terminate through ``sys.exit(0)``.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: GO.py 2883 2010-04-07 08:46:22Z andreas $",
        usage=globals()["__doc__"])

    dbhandle = Database.Database()

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use [default=%default].")

    parser.add_option("-i", "--slims", dest="filename_slims", type="string",
                      help="filename with GO SLIM categories "
                      "[default=%default].")

    parser.add_option("-g", "--genes", dest="filename_genes", type="string",
                      help="filename with genes to analyse "
                      "[default=%default].")

    parser.add_option("-b", "--background", dest="filename_background",
                      type="string",
                      help="filename with background genes to analyse "
                      "[default=%default].")

    parser.add_option("-m", "--minimum-counts", dest="minimum_counts",
                      type="int",
                      help="minimum count - ignore all categories that have "
                      "fewer than # number of genes"
                      " [default=%default].")

    parser.add_option("-o", "--sort-order", dest="sort_order", type="choice",
                      choices=("fdr", "pvalue", "ratio"),
                      help="output sort order [default=%default].")

    parser.add_option("--ontology", dest="ontology", type="string",
                      action="append",
                      help="go ontologies to analyze. Ontologies are tested "
                      "separately."
                      " [default=%default].")

    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="significance threshold [>1.0 = all ]. "
                      "If --fdr is set, this refers to the fdr, otherwise "
                      "it is a cutoff for p-values.")

    parser.add_option("--filename-dump", dest="filename_dump", type="string",
                      help="dump GO category assignments into a flatfile "
                      "[default=%default].")

    parser.add_option("--filename-gene2name", dest="filename_gene2name",
                      type="string",
                      help="optional filename mapping gene identifiers to "
                      "gene names [default=%default].")

    parser.add_option("--filename-ontology", dest="filename_ontology",
                      type="string",
                      help="filename with ontology in OBO format "
                      "[default=%default].")

    parser.add_option("--filename-input", dest="filename_input",
                      type="string",
                      help="read GO category assignments from a flatfile "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample", type="int",
                      help="do sampling (with # samples) [default=%default].")

    parser.add_option("--filename-output-pattern",
                      "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="pattern with output filename pattern "
                      "(should contain: %(go)s and %(section)s ) "
                      "[default=%default]")

    # help text used to read "[ReadGene2GOFromFiledefault=%default]",
    # the residue of a bad search-and-replace
    parser.add_option("--fdr", dest="fdr", action="store_true",
                      help="calculate and filter by FDR [default=%default].")

    parser.add_option("--go2goslim", dest="go2goslim", action="store_true",
                      help="convert go assignments in STDIN to goslim "
                      "assignments and write to STDOUT [default=%default].")

    parser.add_option("--gene-pattern", dest="gene_pattern", type="string",
                      help="pattern to transform identifiers to GO gene "
                      "names [default=%default].")

    parser.add_option("--filename-map-slims", dest="filename_map_slims",
                      type="string",
                      help="write mapping between GO categories and GOSlims "
                      "[default=%default].")

    parser.add_option("--get-genes", dest="get_genes", type="string",
                      help="list all genes in the with a certain GOID "
                      "[default=%default].")

    parser.add_option("--strict", dest="strict", action="store_true",
                      help="require all genes in foreground to be part of "
                      "background. "
                      "If not set, genes in foreground will be added to the "
                      "background [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method",
                      type="choice",
                      choices=("empirical", "storey", "BH"),
                      help="method to perform multiple testing correction by "
                      "controlling the fdr [default=%default].")

    parser.add_option("--pairwise", dest="compute_pairwise",
                      action="store_true",
                      help="compute pairwise enrichment for multiple gene "
                      "lists. "
                      "[default=%default].")

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    if options.go2goslim:
        convertGo2Goslim(options)
        E.Stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default ontologies to GO
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle.Connect(options)

        outfile = IOTools.openFile(
            options.filename_dump, "w", create_dir=True)
        DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.Stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.openFile(options.filename_input)
        gene2gos, go2infos = ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.openFile(options.filename_gene2name)
        gene2name = IOTools.readMap(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        gene2name = None

    #############################################################
    # read GO ontology from file
    # NOTE(review): ``ontology`` is only bound when --filename-ontology is
    # given, yet it is passed to MapGO2Slims whenever --slims is used.
    # Confirm whether MapGO2Slims accepts None before adding a default.
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.openFile(options.filename_ontology)
        ontology = readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GOInfo)
        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in ontology.values():
            go2infos[go.mNameSpace][go.mId] = GOInfo(
                go.mId,
                go_type=go.mNameSpace,
                description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = ReadGeneLists(
        options.filename_genes,
        gene_pattern=options.gene_pattern)

    E.info("read %i genes for forground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:
        # the background is the first tuple element from ReadGeneLists
        input_background = ReadGeneLists(
            options.filename_background,
            gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    # header of the summary table; "messages" was missing although every
    # row appended below carries the joined msgs as a 16th column
    summary = []
    summary.append("\t".join((
        "genelist",
        "ontology",
        "significant",
        "threshold",
        "ngenes",
        "ncategories",
        "nmaps",
        "nforegound",
        "nforeground_mapped",
        "nbackground",
        "nbackground_mapped",
        "nsample_counts",
        "nbackground_counts",
        "psample_assignments",
        "pbackground_assignments",
        "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in options.ontology:

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)

        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go = gene2gos[test_ontology]
            go2info = go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle.Connect(options)
            gene2go, go2info = ReadGene2GOFromDatabase(
                dbhandle,
                test_ontology,
                options.database,
                options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set(
                [x for x, y in counts_per_category.items()
                 if y < options.minimum_counts])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped to "
                   "%i categories (%i maps)" %
                   (ngenes, ncategories, nmaps))

        for genelist_name, foreground in genelists.items():

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))

            ##########################################################
            # build background - reconcile with foreground
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            ##########################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GetGOSlims(
                    IOTools.openFile(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    slim_terms = set()
                    for terms in go_slims.values():
                        slim_terms.update(terms)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims,
                         len(go_slims),
                         len(slim_terms)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.openFile(
                            options.filename_map_slims, "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in go_slims.items():
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = MapGO2Slims(gene2go, go_slims, ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" %
                        (ngenes, ncategories, nmaps))

            ##########################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in gene2go.items():
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            # was "gene in genes": ``genes`` is not defined
                            # anywhere in this function (NameError); the
                            # foreground set is what is meant
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # nothing outside the foreground: try the next gene list
                if not (bg or ng):
                    continue

                options.stdout.write(
                    "# genes in GO category %s\n" % options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in fg:
                    options.stdout.write("%s\t%s\n" % ("fg", x))
                for x in bg:
                    options.stdout.write("%s\t%s\n" % ("bg", x))
                for x in ng:
                    options.stdout.write("%s\t%s\n" % ("ng", x))

                E.info("nfg=%i, nbg=%i, nng=%i" %
                       (len(fg), len(bg), len(ng)))

                E.Stop()
                sys.exit(0)

            ##########################################################
            # output foreground and background gene lists
            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='foreground',
                                  set=genelist_name)
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='background',
                                  set=genelist_name)
            # was sorted(background[0]): background is a flat list of gene
            # identifiers, so [0] wrote the sorted characters of the first
            # gene instead of the background gene list
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            ##########################################################
            # do the analysis
            go_results = AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            ##########################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = computeFDRs(go_results,
                                                    foreground,
                                                    background,
                                                    options,
                                                    test_ontology,
                                                    gene2go,
                                                    go2info)
                for go_id, result in pairs:
                    result.mQValue = fdrs[go_id][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            # key-based sorts give the same order as the former cmp-based
            # ones (list.sort is stable in both cases)
            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            ##########################################################
            # output the full result
            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='overall',
                                  set=genelist_name)
            outputResults(outfile, pairs, go2info, options,
                          fdrs=fdrs, samples=samples)
            if options.output_filename_pattern:
                outfile.close()

            ##########################################################
            # filter significant results and output
            filtered_pairs = selectSignificantResults(pairs, fdrs, options)

            nselected = len(filtered_pairs)
            nselected_up = len(
                [x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='results',
                                  set=genelist_name)
            outputResults(outfile, filtered_pairs, go2info, options,
                          fdrs=fdrs, samples=samples)
            if options.output_filename_pattern:
                outfile.close()

            ##########################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            ##########################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                CountGO(gene2go)

            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='parameters',
                                  set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write(
                "mapped_categories\t%i\tmapped categories\n" % ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write(
                "genes_in_fg\t%i\tgenes in foreground\n" % len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n" %
                (len(go_results.mSampleGenes)))
            outfile.write(
                "genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" %
                (len(go_results.mBackgroundGenes)))
            outfile.write(
                "associations_in_fg\t%i\tassociations in sample\n" %
                go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n" %
                (IOTools.prettyPercent(len(go_results.mSampleGenes),
                                       len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n" %
                (IOTools.prettyPercent(len(go_results.mBackgroundGenes),
                                       nbackground, "%5.2f")))
            outfile.write(
                "significant\t%i\tsignificant results reported\n" % nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results reported\n" %
                nselected_up)
            # description used to say "up-regulated" for the down count
            outfile.write(
                "significant_down\t%i\tsignificant down-regulated results reported\n" %
                nselected_down)
            outfile.write(
                "threshold\t%6.4f\tsignificance threshold\n" %
                options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(map(str, (
                genelist_name,
                test_ontology,
                nselected,
                options.threshold,
                ngenes,
                ncategories,
                nmaps,
                len(foreground),
                len(go_results.mSampleGenes),
                nbackground,
                len(go_results.mBackgroundGenes),
                go_results.mSampleCountsTotal,
                go_results.mBackgroundCountsTotal,
                IOTools.prettyPercent(len(go_results.mSampleGenes),
                                      len(foreground), "%5.2f"),
                IOTools.prettyPercent(len(go_results.mBackgroundGenes),
                                      nbackground, "%5.2f"),
                ",".join(msgs)))) + "\n")

            ##########################################################
            # output the fg patterns
            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='withgenes',
                                  set=genelist_name)
            outputResults(outfile, pairs, go2info, options,
                          fdrs=fdrs,
                          samples=samples,
                          gene2go=gene2go,
                          foreground=foreground,
                          gene2name=gene2name)
            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:
            ##########################################################
            # output various summary files
            # significant results
            outputMultipleGeneListResults(all_significant_results,
                                          all_genelists_with_results,
                                          test_ontology,
                                          go2info,
                                          options,
                                          section='significant')

            # all results
            outputMultipleGeneListResults(all_results,
                                          all_genelists_with_results,
                                          test_ontology,
                                          go2info,
                                          options,
                                          section='all')

            if options.compute_pairwise:
                pairwiseGOEnrichment(all_results,
                                     all_genelists_with_results,
                                     test_ontology,
                                     go2info,
                                     options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two chain files as positional arguments.  Both are
    validated, converted to pairs with ``buildPairs`` and compared in both
    directions.  One row per (contig1, contig2, strand) key is written to
    ``options.stdout`` with the mapped/identical/different/unique counts
    and percentages for each direction, followed by a ``total`` row.

    :returns: 1 if either chain file fails validation, otherwise ``None``.
    :raises ValueError: if the number of positional arguments is not two.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--output-mismatches", dest="output_mismatches",
                      action="store_true",
                      help="output mismatches [%default]")

    parser.add_option("-a", "--output-matches", dest="output_matches",
                      action="store_true",
                      help="output matches [%default]")

    parser.add_option("-u", "--output-unique", dest="output_unique",
                      action="store_true",
                      help="output unique positions [%default]")

    parser.add_option("-r", "--restrict", dest="restrict", type="string",
                      help="restrict analysis to a chromosome pair "
                      "(chr1:chr1:+) [%default]")

    # output_matches had no default and therefore showed up as None
    parser.set_defaults(
        output_mismatches=False,
        output_matches=False,
        output_unique=False,
        restrict=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("expected two chain files")

    filename_chain1, filename_chain2 = args

    def _apply(func, filename):
        # open, process and reliably close a chain file
        infile = IOTools.openFile(filename)
        try:
            return func(infile)
        finally:
            infile.close()

    E.info("validating chain 1")
    if not _apply(validateChain, filename_chain1):
        E.warn("validation failed - exiting")
        return 1

    E.info("validating chain 2")
    if not _apply(validateChain, filename_chain2):
        E.warn("validation failed - exiting")
        return 1

    E.info("building pairs for %s" % filename_chain1)
    pairs1 = _apply(buildPairs, filename_chain1)
    E.info("read %i pairs" % len(pairs1))

    E.info("building pairs for %s" % filename_chain2)
    pairs2 = _apply(buildPairs, filename_chain2)
    E.info("read %i pairs" % len(pairs2))

    if options.restrict:
        restrict = tuple(options.restrict.split(":"))
        pairs1 = {restrict: pairs1[restrict]}
        pairs2 = {restrict: pairs2[restrict]}

    E.info("comparing 1 -> 2")
    comparison1 = compareChains(pairs1, pairs2)
    E.info("comparing 2 -> 1")
    comparison2 = compareChains(pairs2, pairs1)

    all_keys = sorted(
        set(comparison1.keys()).union(comparison2.keys()))

    outfile = options.stdout
    headers = ("mapped", "identical", "different", "unique")
    outfile.write("contig1\tcontig2\tstrand\t%s\t%s\t%s\t%s\n" % (
        "\t".join(["%s1" % x for x in headers]),
        "\t".join(["p%s1" % x for x in headers]),
        "\t".join(["%s2" % x for x in headers]),
        "\t".join(["p%s2" % x for x in headers])))

    # Eight zero columns for a key that is absent in one direction.  The
    # previous code wrote "...\t" followed by "\t...", i.e. a doubled tab
    # that shifted all following columns by one.
    empty_columns = "\t%i\t%i\t%i\t%i" % (0, 0, 0, 0) * 2

    def _columns(c):
        # four counts followed by their percentage of the total
        return "\t%i\t%i\t%i\t%i\t" % (
            c.total, c.same, c.different, c.unique) + \
            "\t".join([IOTools.prettyPercent(x, c.total) for x in c])

    totals = E.Counter()

    for key in all_keys:
        outfile.write("%s\t%s\t%s" % key)

        if key in comparison1:
            c = comparison1[key]
            outfile.write(_columns(c))
            totals.total1 += c.total
            totals.same1 += c.same
            totals.different1 += c.different
            totals.unique1 += c.unique
        else:
            outfile.write(empty_columns)

        if key in comparison2:
            c = comparison2[key]
            outfile.write(_columns(c))
            totals.same2 += c.same
            totals.total2 += c.total
            totals.different2 += c.different
            totals.unique2 += c.unique
        else:
            outfile.write(empty_columns)

        outfile.write("\n")

    outfile.write("total\ttotal\t.\t")
    outfile.write("\t".join(map(str, (
        totals.total1,
        totals.same1,
        totals.different1,
        totals.unique1,
        IOTools.prettyPercent(totals.total1, totals.total1),
        IOTools.prettyPercent(totals.same1, totals.total1),
        IOTools.prettyPercent(totals.different1, totals.total1),
        IOTools.prettyPercent(totals.unique1, totals.total1),
        totals.total2,
        totals.same2,
        totals.different2,
        totals.unique2,
        IOTools.prettyPercent(totals.total2, totals.total2),
        IOTools.prettyPercent(totals.same2, totals.total2),
        IOTools.prettyPercent(totals.different2, totals.total2),
        IOTools.prettyPercent(totals.unique2, totals.total2),
    ))) + "\n")

    # output mismapped residues
    # NOTE(review): --output-matches on its own does not trigger this call;
    # confirm whether it should be part of the condition.
    if options.output_mismatches or options.output_unique:
        outputMismatches(pairs1, pairs2,
                         output_mismatches=options.output_mismatches,
                         output_unique=options.output_unique,
                         output_matches=options.output_matches,
                         )

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main: run a GO enrichment analysis for one or more gene lists.

    The command line is parsed through ``E.Start``; *argv* is accepted for
    interface compatibility but is not forwarded to the parser.

    Depending on the options the function either

    * converts GO assignments to GOSlim assignments (``--go2goslim``),
    * dumps GO assignments from the database (``--filename-dump``), or
    * tests each gene list against each ontology and writes the
      ``foreground``, ``background``, ``overall``, ``results``,
      ``parameters`` and ``withgenes`` sections through ``GO.getFileName``,
      followed by one summary row per (gene list, ontology) on stdout.

    The first two modes, and ``--get-genes``, terminate the process via
    ``sys.exit(0)``.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--species", dest="species", type="string",
        help="species to use [default=%default].")

    parser.add_option(
        "-i", "--slims", dest="filename_slims", type="string",
        help="filename with GO SLIM categories "
        "[default=%default].")

    parser.add_option(
        "-g", "--genes-tsv-file", dest="filename_genes", type="string",
        help="filename with genes to analyse "
        "[default=%default].")

    parser.add_option(
        "-b", "--background-tsv-file", dest="filename_background",
        type="string",
        help="filename with background genes to analyse "
        "[default=%default].")

    parser.add_option(
        "-m", "--min-counts", dest="minimum_counts", type="int",
        help="minimum count - ignore all categories that have "
        "fewer than # number of genes"
        " [default=%default].")

    parser.add_option(
        "-o", "--sort-order", dest="sort_order", type="choice",
        choices=("fdr", "pvalue", "ratio"),
        help="output sort order [default=%default].")

    parser.add_option(
        "--ontology", dest="ontology", type="string", action="append",
        help="go ontologies to analyze. Ontologies are tested "
        "separately [default=%default].")

    parser.add_option(
        "-t", "--threshold", dest="threshold", type="float",
        help="significance threshold [>1.0 = all ]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option(
        "--filename-dump", dest="filename_dump", type="string",
        help="dump GO category assignments into a flatfile "
        "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file", dest="filename_gene2name",
        type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology", dest="filename_ontology", type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option(
        "--filename-input", dest="filename_input", type="string",
        help="read GO category assignments from a flatfile "
        "[default=%default].")

    parser.add_option(
        "--sample-size", dest="sample", type="int",
        help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern", "--output-filename-pattern",
        dest="output_filename_pattern", type="string",
        help="pattern with output filename pattern "
        "(should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option(
        "--fdr", dest="fdr", action="store_true",
        help="calculate and filter by FDR [default=%default].")

    parser.add_option(
        "--go2goslim", dest="go2goslim", action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option(
        "--gene-pattern", dest="gene_pattern", type="string",
        help="pattern to transform identifiers to GO gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-map-slims", dest="filename_map_slims", type="string",
        help="write mapping between GO categories and GOSlims "
        "[default=%default].")

    parser.add_option(
        "--get-genes", dest="get_genes", type="string",
        help="list all genes in the with a certain GOID [default=%default].")

    parser.add_option(
        "--strict", dest="strict", action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q", "--fdr-method", dest="qvalue_method", type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise", dest="compute_pairwise", action="store_true",
        help="compute pairwise enrichment for multiple gene lists. "
        "[default=%default].")

    # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]." )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    # help="fdr computation: method for estimating pi0 [default=%default]." )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.Start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.Stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    # Names that are only bound on some option paths below. Giving them
    # neutral values up front prevents a NameError on the remaining paths.
    dbhandle = None
    ontology = None
    gene2gos, go2infos = {}, {}

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default orthologies to GO
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle = connectToEnsembl(options)

        outfile = IOTools.openFile(options.filename_dump, "w",
                                   create_dir=True)
        GO.DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.Stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.openFile(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.openFile(options.filename_gene2name)
        gene2name = IOTools.readMap(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        # use identity mapping
        # NOTE(review): without --filename-input this mapping is empty;
        # confirm GO.outputResults tolerates an empty gene2name.
        gene2name = dict((x, x) for x in gene2gos)

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.openFile(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)
        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(
                go.mId,
                go_type=go.mNameSpace,
                description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes,
        gene_pattern=options.gene_pattern)

    E.info("read %i genes for forground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:
        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background,
            gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join((
        "genelist",
        "ontology",
        "significant",
        "threshold",
        "ngenes",
        "ncategories",
        "nmaps",
        "nforegound",
        "nforeground_mapped",
        "nbackground",
        "nbackground_mapped",
        "nsample_counts",
        "nbackground_counts",
        "psample_assignments",
        "pbackground_assignments",
        "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)

        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = (gene2gos[test_ontology],
                                go2infos[test_ontology])
        else:
            E.info("reading data from database ...")

            # The handle used to be referenced here without ever being
            # created on this path; connect once and reuse the handle for
            # all ontologies.
            # NOTE(review): assumes the handle returned by connectToEnsembl
            # is what GO.ReadGene2GOFromDatabase expects - confirm.
            if dbhandle is None:
                dbhandle = connectToEnsembl(options)

            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle,
                test_ontology,
                options.database, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set(
                category
                for category, count in counts_per_category.items()
                if count < options.minimum_counts)
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" %
                   (ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))

            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug-fix backgorund included the foreground in a tuple.
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    IOTools.openFile(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    # distinct slim categories across all GO terms
                    slim_ids = set(
                        slim
                        for slims in go_slims.values()
                        for slim in slims)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims,
                         len(go_slims),
                         len(slim_ids)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.openFile(
                            options.filename_map_slims, "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                # ontology is None unless --filename-ontology was given
                gene2go = GO.MapGO2Slims(gene2go, go_slims,
                                         ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" %
                        (ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                # sets give constant-time membership tests in the loop
                foreground_genes = set(foreground)
                background_genes = set(background)

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground_genes:
                                fg.append(gene)
                            elif gene in background_genes:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write(
                    "# genes in GO category %s\n" % options.get_genes)
                # header follows the column order of the rows below
                options.stdout.write("set\tgene\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % ("fg", x))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % ("bg", x))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % ("ng", x))

                E.info("nfg=%i, nbg=%i, nng=%i" %
                       (len(fg), len(bg), len(ng)))

                E.Stop()
                sys.exit(0)

            #############################################################
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results,
                                                       foreground,
                                                       background,
                                                       options,
                                                       test_ontology,
                                                       gene2go,
                                                       go2info)
                for go_id, result in pairs:
                    result.mQValue = fdrs[go_id][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs,
                                                         options)

            nselected = len(filtered_pairs)
            nselected_up = len(
                [x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            # A selected category with a ratio of exactly 1 is neither up
            # nor down. This is legitimate (e.g. with a threshold > 1.0 all
            # categories are selected), so report it instead of aborting.
            nselected_other = nselected - nselected_up - nselected_down
            if nselected_other != 0:
                E.warn("%s: %i significant results are neither "
                       "over- nor under-represented" %
                       (genelist_name, nselected_other))

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             filtered_pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write(
                "mapped_categories\t%i\tmapped categories\n" % ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n" %
                (len(go_results.mSampleGenes)))
            outfile.write(
                "genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" %
                (len(go_results.mBackgroundGenes)))
            outfile.write(
                "associations_in_fg\t%i\tassociations in sample\n" %
                go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n" %
                (IOTools.prettyPercent(len(go_results.mSampleGenes),
                                       len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n" %
                (IOTools.prettyPercent(len(go_results.mBackgroundGenes),
                                       nbackground, "%5.2f")))
            outfile.write(
                "significant\t%i\tsignificant results reported\n" %
                nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results reported\n" %
                nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant down-regulated results reported\n" %
                nselected_down)
            outfile.write(
                "threshold\t%6.4f\tsignificance threshold\n" %
                options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(map(str, (
                genelist_name,
                test_ontology,
                nselected,
                options.threshold,
                ngenes,
                ncategories,
                nmaps,
                len(foreground),
                len(go_results.mSampleGenes),
                nbackground,
                len(go_results.mBackgroundGenes),
                go_results.mSampleCountsTotal,
                go_results.mBackgroundCountsTotal,
                IOTools.prettyPercent(len(go_results.mSampleGenes),
                                      len(foreground), "%5.2f"),
                IOTools.prettyPercent(len(go_results.mBackgroundGenes),
                                      nbackground, "%5.2f"),
                ",".join(msgs)))) + "\n")

            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files
            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology,
                                        go2info,
                                        options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects two chain files as positional arguments. Both files are
    validated, converted into pair collections and compared in both
    directions. One row per (contig1, contig2, strand) key is written to
    ``options.stdout``, followed by a ``total`` row. Each row holds counts
    and percentages of mapped/identical/different/unique positions for
    either direction.

    Returns 1 if one of the chain files fails validation.

    Raises ValueError unless exactly two chain files are given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--output-mismatches", dest="output_mismatches",
                      action="store_true",
                      help="output mismatches [%default]")

    parser.add_option("-a", "--output-matches", dest="output_matches",
                      action="store_true",
                      help="output matches [%default]")

    parser.add_option("-u", "--output-unique", dest="output_unique",
                      action="store_true",
                      help="output unique positions [%default]")

    parser.add_option(
        "-r", "--restrict", dest="restrict", type="string",
        help="restrict analysis to a chromosome pair (chr1:chr1:+) [%default]")

    # output_matches gets an explicit default like its sibling flags
    parser.set_defaults(output_mismatches=False,
                        output_matches=False,
                        output_unique=False,
                        restrict=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("expected two chain files")

    filename_chain1, filename_chain2 = args

    # validate both files before any pairs are built; handles are closed
    # as soon as each file has been consumed
    for label, filename in (("1", filename_chain1), ("2", filename_chain2)):
        E.info("validating chain %s" % label)
        infile = IOTools.openFile(filename)
        try:
            is_valid = validateChain(infile)
        finally:
            infile.close()
        if not is_valid:
            E.warn("validation failed - exiting")
            return 1

    all_pairs = []
    for filename in (filename_chain1, filename_chain2):
        E.info("building pairs for %s" % filename)
        infile = IOTools.openFile(filename)
        try:
            pairs = buildPairs(infile)
        finally:
            infile.close()
        E.info("read %i pairs" % len(pairs))
        all_pairs.append(pairs)
    pairs1, pairs2 = all_pairs

    if options.restrict:
        restrict = tuple(options.restrict.split(":"))
        pairs1 = {restrict: pairs1[restrict]}
        pairs2 = {restrict: pairs2[restrict]}

    E.info("comparing 1 -> 2")
    comparison1 = compareChains(pairs1, pairs2)
    E.info("comparing 2 -> 1")
    comparison2 = compareChains(pairs2, pairs1)

    # union of keys; key views cannot be concatenated with "+" on python 3
    all_keys = sorted(set(comparison1.keys()) | set(comparison2.keys()))

    outfile = options.stdout
    headers = ("mapped", "identical", "different", "unique")
    outfile.write("contig1\tcontig2\tstrand\t%s\t%s\t%s\t%s\n" % (
        "\t".join(["%s1" % x for x in headers]),
        "\t".join(["p%s1" % x for x in headers]),
        "\t".join(["%s2" % x for x in headers]),
        "\t".join(["p%s2" % x for x in headers])))

    fields = ("total", "same", "different", "unique")
    totals = E.Counter()

    for key in all_keys:
        outfile.write("%s\t%s\t%s" % key)

        for suffix, comparison in (("1", comparison1), ("2", comparison2)):
            if key in comparison:
                c = comparison[key]
                outfile.write("\t%i\t%i\t%i\t%i\t" %
                              (c.total, c.same, c.different, c.unique))
                outfile.write("\t".join(
                    [IOTools.prettyPercent(x, c.total) for x in c]))

                # accumulate into totals.total1, totals.same1, ...
                for field in fields:
                    name = field + suffix
                    setattr(totals, name,
                            getattr(totals, name) + getattr(c, field))
            else:
                # four counts plus four percentages. The previous version
                # emitted two adjacent tabs here, which shifted all
                # following columns by one.
                outfile.write("\t%i\t%i\t%i\t%i" % (0, 0, 0, 0))
                outfile.write("\t%i\t%i\t%i\t%i" % (0, 0, 0, 0))

        outfile.write("\n")

    total_columns = []
    for suffix in ("1", "2"):
        counts = [getattr(totals, field + suffix) for field in fields]
        total_columns.extend(counts)
        # percentages are relative to the number of mapped positions
        total_columns.extend(
            [IOTools.prettyPercent(x, counts[0]) for x in counts])

    outfile.write("total\ttotal\t.\t")
    outfile.write("\t".join(map(str, total_columns)) + "\n")

    # output mismapped residues
    if options.output_mismatches or options.output_unique:
        outputMismatches(pairs1, pairs2,
                         output_mismatches=options.output_mismatches,
                         output_unique=options.output_unique,
                         output_matches=options.output_matches,
                         )

    # write footer and output benchmark information.
    E.Stop()