def _buildProfile(nid, start, end):
    """Build a (sequence, profile) pair for *nid* restricted to start..end.

    The profile is derived from the multiple alignment of *nid* with its
    graph neighbours and prepared on the requested segment; the plain
    sequence is fetched from the indexed fasta file.  Relies on the
    module-level ``index``, ``align``, ``alignlib``, ``fasta`` and ``E``
    objects being set up by the caller.
    """
    nbrs = index.getNeighbours(nid)
    E.info("nid: %i, neighours=%i" % (nid, len(nbrs)))
    multiple_alignment = align.buildMali(nid, nbrs)
    profile = alignlib.makeProfile(multiple_alignment)
    profile.useSegment(start, end)
    profile.prepare()
    sequence = fasta.getSequence(nid)
    return alignlib.makeSequence(sequence), profile
def readAlignmentGraph(infile):
    '''read an alignment graph from *infile*.

    *infile* is a tab-separated stream of tested links; column 1 and 2
    hold the query/subject domain identifiers.  Comment lines ("#") and
    the "passed" header line are skipped.

    returns a tuple (map_vertex2id, vertices, G) where *G* is an
    :mod:`igraph` graph whose edges carry a tab-separated "info"
    attribute built from columns 0, 10, 11, 16 and 17.
    '''
    # BUG FIX: the original filter used
    #   not x.startswith("#") or not x.startswith("passed")
    # which is always true (no line can start with both prefixes), so
    # comments and the header were never skipped.  De Morgan gives the
    # intended condition below.
    result = [x[:-1].split("\t") for x in infile
              if not (x.startswith("#") or x.startswith("passed"))]

    E.info("collected %i edges" % len(result))

    # collect vertices from both edge endpoints
    vertices = set([x[1] for x in result])
    vertices.update([x[2] for x in result])
    vertices = list(vertices)
    map_vertex2id = dict([(x[1], x[0]) for x in enumerate(vertices)])

    E.info("collected %i vertices" % len(vertices))

    G = igraph.Graph(len(vertices))
    G.add_edges([(map_vertex2id[x[1]],
                  map_vertex2id[x[2]]) for x in result])
    # keep the interesting alignment columns as an edge attribute
    G.es["info"] = ["%s\t%s\t%s\t%s\t%s" %
                    (x[0], x[10], x[11], x[16], x[17]) for x in result]

    return map_vertex2id, vertices, G
def annotateAlignmentGraph(infile, outfiles):
    '''input the alignment graph and output a translated version of it
    and adding reference domain information.

    Writes a gzipped, annotated copy of the graph to *outfile* and a
    per-weight true/false-positive summary to *outfile_stats*.
    '''
    outfile, outfile_stats = outfiles

    # collect benchmark domains
    E.info("reading benchmark domains")
    benchmark_domains = AddaIO.readMapNid2Domains(
        gzip.open(PARAMS["eval_filename_benchmark_domains"]))

    totuple = AddaIO.toTuple
    toDomain = AddaIO.toDomain

    # build map of id to nid
    E.info("reading map between pid and nid")
    map_nid2pid = AddaIO.readMapPid2Nid(
        open(PARAMS["eval_filename_adda_nids"], "r"))

    def getOverlappingDomains(pid, start, end):
        '''get domains overlapping pid:start..end'''
        if pid not in benchmark_domains:
            return ()
        # greedy overlap testing
        r = []
        for family, domains in benchmark_domains[pid].iteritems():
            for other_start, other_end in domains:
                if start >= other_end or end <= other_start:
                    continue
                r.append((family, other_start, other_end))
        return r

    counts = E.Counter()

    if infile.endswith(".gz"):
        inf = gzip.open(infile)
    else:
        inf = open(infile)

    outf = gzip.open(outfile, "w")
    outf.write("%s\n" % "\t".join(
        ("passed", "qdomain", "sdomain", "weight",
         "qstart", "qend", "qali",
         "sstart", "send", "sali",
         "score", "naligned", "ngaps", "zscore",
         "rfamilies", "sfamilies", "rdomains", "sdomains")))

    # counts for true positives, false positives and unknown
    n, tp, fp, fn, tn, uk = 0, 0, 0, 0, 0, 0

    outf_stats = open(outfile_stats, "w")
    outf_stats.write("weight\tn\tproportion\ttp\tfp\tfn\ttn\tuk\ttpr\tfnr\n")

    last_weight = None

    for link in AddaIO.iterate_tested_links(inf):
        qnid, qstart, qend = totuple(link.qdomain)
        snid, sstart, send = totuple(link.sdomain)
        qpid = map_nid2pid[qnid]
        spid = map_nid2pid[snid]
        qfamily = sorted(getOverlappingDomains(qpid, qstart, qend))
        sfamily = sorted(getOverlappingDomains(spid, sstart, send))
        passed = link.passed == "+"
        n += 1

        # classify the link: unknown if neither side has a benchmark
        # domain, otherwise tp/fn/fp/tn depending on family agreement
        # and whether ADDA accepted the link.
        if not qfamily and not sfamily:
            uk += 1
        else:
            qf = set([x[0] for x in qfamily])
            sf = set([x[0] for x in sfamily])
            if qf.intersection(sf):
                if passed:
                    tp += 1
                else:
                    fn += 1
            else:
                if passed:
                    fp += 1
                else:
                    tn += 1

        # emit one summary row per distinct (rounded) weight
        weight = round(float(link.weight))
        if weight != last_weight:
            if last_weight is not None:
                # BUG FIX: the tpr denominator now carries the same
                # epsilon guard as fnr, avoiding ZeroDivisionError when
                # no link has been classified tp or fp yet.
                outf_stats.write("\t".join(map(str, (
                    last_weight, n, tp, fp, fn, tn, uk,
                    float(tp) / (tp + fp + 0.00001),
                    float(fn) / (fn + tn + 0.00001),
                ))) + "\n")
            last_weight = weight

        if passed:
            counts.passed += 1
        else:
            counts.failed += 1

        # translate nids back to pids for the output
        link = link._replace(qdomain=toDomain((qpid, qstart, qend)),
                             sdomain=toDomain((spid, sstart, send)))

        outf.write("%s\t%s\t%s\t%s\t%s\n" %
                   ("\t".join(map(str, link)),
                    ",".join(sorted(set([x[0] for x in qfamily]))),
                    ",".join(sorted(set([x[0] for x in sfamily]))),
                    ",".join("%s_%i_%i" % x for x in qfamily),
                    ",".join("%s_%i_%i" % x for x in sfamily)))

    inf.close()
    # BUG FIX: outf was never closed; with gzip this risks a truncated
    # output file.
    outf.close()

    # final summary row (epsilon guards as above)
    outf_stats.write("\t".join(map(str, (
        last_weight, n, tp, fp, fn, tn, uk,
        float(tp) / (tp + fp + 0.00001),
        float(fn) / (fn + tn + 0.00001)))) + "\n")
    outf_stats.close()

    E.info("%s" % str(counts))
def main( argv = sys.argv ): parser = optparse.OptionParser( version = "%prog version: $Id$", usage = globals()["__doc__"] ) parser.add_option( "-o", "--format", dest="graph-format", type="choice", choices=("alignments",), help="graph format [default=%default].") parser.add_option( "-m", "--method", dest="method", type="choice", choices=("shortest-path", "translate", "components", "add-family" ), help="methods to apply [default=%default].") parser.add_option( "-a", "--filename-map", dest="filename_map", type="string", help="filename mapping ids to nids (used for translation) [default=%default].") parser.add_option( "-1", "--node1", dest="node1", type="string", help="first node for path calculation [default=%default].") parser.add_option( "-2", "--node2", dest="node2", type="string", help="second node for path calculation [default=%default].") parser.add_option( "-f", "--filename-families", dest="filename_families", type="string", help="filename with domain families [default=%default].") parser.set_defaults( method = None, graph_format = "alignments", filename_map = None, node1 = None, node2 = None, filename_families = None, ) (options, args) = E.Start( parser, argv = argv ) if options.filename_families != None: E.info( "reading families from %s" % options.filename_families ) map_domain2family = {} for line in open( options.filename_families, "r"): if line[0] == "#": continue if line.startswith( "nid"): continue nid, start, end, family = line[:-1].split("\t") pid = bytes("%s_%s_%s" % (nid,start,end)) map_domain2family[pid] = bytes(family) E.info( "read %i domains" % len(map_domain2family)) if options.method == "translate": if options.filename_map: E.info("reading map from %s" % options.filename_map) map_id2nid = AddaIO.readMapId2Nid( open( options.filename_map, "r") ) map_nid2id = dict([[v,k] for k,v in map_id2nid.iteritems()]) def translate_alignments( line ): if line.startswith("passed"): return line data = line.split( "\t" ) x = data[1].split("_") y = 
data[2].split("_") try: data[1] = "%s_%s_%s" % (map_nid2id[int(x[0])],x[1],x[2]) except KeyError: sys.stderr.write("could not map: %s\n" % str(x) ) raise try: data[2] = "%s_%s_%s" % (map_nid2id[int(y[0])],y[1],y[2]) except KeyError: sys.stderr.write("could not map: %s\n" % str(y) ) raise return "\t".join(data) if options.graph_format == "alignments": translator = translate_alignments for line in options.stdin: if not line.startswith("#"): line = translator( line ) options.stdout.write(line) E.Stop() return elif options.method == "add-family": options.stdout.write( "%s\tqfamily\tsfamily\n" % ("\t".join( AddaIO.TestedLink._fields))) for link in AddaIO.iterate_tested_links( options.stdin ): qfamily = map_domain2family.get(link.qdomain,"na") sfamily = map_domain2family.get(link.sdomain,"na") options.stdout.write( "%s\t%s\t%s\n" % ("\t".join(map(str,link)), qfamily, sfamily)) E.Stop() return t = time.time() if options.graph_format == "alignments": map_vertex2id, map_id2vertex, G = readAlignmentGraph( options.stdin ) E.info( "graph read in %i seconds" % (time.time() - t )) t = time.time() if options.method == "shortest-path": E.debug( "shortest path between %s:%i and %s:%i" % \ (options.node1, map_vertex2id[options.node1], options.node2, map_vertex2id[options.node2] ) ) paths = G.get_shortest_paths( map_vertex2id[options.node1], to = (map_vertex2id[options.node2],) ) p = paths[map_vertex2id[options.node2]] if len(p) == 0: E.info( "no path between %s:%i and %s:%i" % \ (options.node1, map_vertex2id[options.node1], options.node2, map_vertex2id[options.node2] ) ) l, last_node = p[0], map_id2vertex[p[0]] for x in p[1:]: node = map_id2vertex[x] ei = G.get_eid(x, l) options.stdout.write( "%s\t%s\t%s\n" %\ (last_node, node, G.es[ei]["info"]) ) l, last_node = x, node elif options.method == "components": print "component\tnode" for id, component in enumerate(nx.connected_components( G )): for c in component: print "%i\t%s" % (id,c) E.info( "%s: %i seconds" % (options.method, 
time.time() - t )) E.Stop()
def main():
    """ADDA pipeline driver: parse the command line, set up
    multiprocess-safe logging, then dispatch the selected pipeline
    step (index/sequences/fit/segment/align/... ) to its Adda* module,
    running it either sequentially or in parallel chunks and merging
    the chunked results."""

    global L

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=USAGE)

    parser.add_option("--config", dest="filename_config", type="string",
                      help="configuration file [default=%default].")
    parser.add_option("--force", dest="force", action="store_true",
                      help="overwrite existing files [default=%default].")
    parser.add_option("--continue", dest="append", action="store_true",
                      help="continue from an aborted run and append to existing files [default=%default].")
    parser.add_option("--test", dest="test", type="int",
                      help="run a test with first # sequences [default=%default]")
    parser.add_option("--num-jobs", dest="num_jobs", type="int",
                      help="use # processes. If not set, the number of CPUs/cores is taken [default=%default]")
    parser.add_option("--chunks", dest="chunks", type="string",
                      help="work on one or more chunks only. Provide a comma-separated list. [default=%default]")
    parser.add_option("--command", dest="command", type="choice",
                      choices=("sequences", "blast", "fit", "graph",
                               "index", "check-index", "profiles",
                               "segment", "optimise", "convert", "mst",
                               "mst-components", "align", "cluster",
                               "realign", "families", "stats", "summary"),
                      help="perform a command [default=%default]")
    parser.add_option("--start-at", dest="start_at", type="string",
                      help="start at sequence [default=%default]")
    parser.add_option("--stop-at", dest="stop_at", type="string",
                      help="stop at sequenec [default=%default]")

    parser.set_defaults(
        filename_config="adda.ini",
        command=None,
        start_at=None,
        stop_at=None,
        force=False,
        append=False,
        test=None,
        num_jobs=None,
        chunks="all",
    )

    (options, args) = E.Start(parser)

    # setup logging
    if options.loglevel == 0:
        lvl = logging.ERROR
    elif options.loglevel == 1:
        lvl = logging.INFO
    else:
        lvl = logging.DEBUG

    # log records from worker processes are funnelled through a queue
    # into a single "adda.log" file
    logQueue = multiprocessing.Queue(100)
    handler = Logger.MultiProcessingLogHandler(
        logging.FileHandler("adda.log", "a"), logQueue)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s pid=%(process)-8d %(name)-12s %(levelname)-8s %(message)s',
            datefmt='%m-%d %H:%M'))
    logging.getLogger('adda').addHandler(handler)
    logging.getLogger('adda').setLevel(lvl)

    E.setLogger(logging.getLogger("adda"))
    L = logging.getLogger("adda")

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    # the command may be given either via --command or as the single
    # positional argument
    if len(args) == 0:
        if not options.command:
            raise ValueError("specify at least one command")
    elif len(args) == 1:
        options.command = args[0]
    else:
        raise ValueError("one command line argument is sufficient.")

    ## collect modules and initialise them
    map_module = {'fit': AddaFit.AddaFit,
                  'segment': AddaSegment.AddaSegment,
                  'blast': AddaBlast.AddaBlast,
                  'graph': AddaGraph.AddaGraph,
                  'stats': AddaStats.AddaStats,
                  'profiles': AddaProfiles.AddaProfiles,
                  'realign': AddaAlign.AddaRealign,
                  'index': AddaIndex.AddaIndexBuild,
                  'check-index': AddaIndex.AddaIndexCheck,
                  'optimise': AddaOptimise.AddaOptimise,
                  'sequences': AddaSequences.AddaSequences,
                  'convert': AddaConvert.AddaConvert,
                  'mst': AddaMst.AddaMst,
                  'mst-components': AddaComponentsMst.AddaComponentsMst,
                  'align': AddaAlign.AddaAlign,
                  'cluster': AddaCluster.AddaCluster,
                  'families': AddaFamilies.AddaFamilies,
                  'summary': AddaSummary.AddaSummary,
                  }

    # the fasta file may not exist yet (e.g. for the "sequences" step)
    try:
        fasta = IndexedFasta.IndexedFasta(
            config.get("files", "output_fasta", "adda"))
    except KeyError:
        fasta = None

    if options.num_jobs == 1:
        run_parallel = runSequentially
    else:
        run_parallel = runParallel

    kwargs = {"loglevel": options.loglevel,
              "append": options.append,
              "force": options.force}

    if options.command == "index":
        module = map_module[options.command](config, fasta=fasta, **kwargs)
        if module.isComplete():
            E.info("output of command `%s` present and complete" %
                   options.command)
        else:
            filename_graph = config.get("files", "input_graph",
                                        "pairsdb_40x40.links.gz")
            if "," in filename_graph:
                filename_graph = filename_graph.split(",")
                # permit parallel processing of multiple files
                run_parallel(run_on_files,
                             filename=filename_graph,
                             options=options,
                             module=map_module[options.command],
                             config=config,
                             kwargs=kwargs,
                             )
                nchunks = len(filename_graph)
                module = map_module[options.command](config,
                                                     chunk=0,
                                                     num_chunks=nchunks,
                                                     **kwargs)
                # merge the per-file chunks into one index
                if not module.isComplete():
                    L.info("merging")
                    if not module.merge():
                        raise ValueError("error while merging for `%s`" %
                                         options.command)
            else:
                # process single file - no hazzle.
                module.startUp()
                module.run()
                module.finish()

    # NOTE(review): this is a plain `if`, not `elif`; harmless today
    # because "index" is not in the tuple below, but fragile — confirm
    # before reordering branches.
    if options.command in ("sequences", "stats", "optimise",
                           "convert", "mst", "mst-components",
                           "cluster", "families", "summary"):
        module = map_module[options.command](config, fasta=fasta, **kwargs)

        if module.isComplete():
            E.info("output of command `%s` present and complete" %
                   options.command)
        else:
            module.startUp()
            module.run()
            module.finish()

    elif options.command in ("fit", "segment"):
        run_on_graph = RunOnGraph(config, options.command)

        run_parallel(run_on_graph,
                     filename=config.get("files", "input_graph", "adda.graph"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        if not merge(options,
                     module=map_module[options.command],
                     config=config,
                     fasta=fasta):
            E.Stop()
            return

    # NOTE(review): `options.command in ("align")` is a substring test
    # against the string "align", not a one-element tuple membership
    # test — it also matches "a", "li", etc.  Probably intended as
    # ("align",).  Same for ("realign") below.  Left as-is.
    elif options.command in ("align"):
        run_parallel(run_on_file,
                     filename=config.get("files", "output_mst", "adda.mst"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        merge(options,
              module=map_module[options.command],
              config=config,
              fasta=fasta)

    elif options.command in ("realign"):
        run_parallel(run_on_file,
                     filename=config.get("files", "output_align", "adda.align"),
                     options=options,
                     module=map_module[options.command],
                     config=config,
                     kwargs=kwargs)

        merge(options,
              module=map_module[options.command],
              config=config,
              fasta=fasta)
continue except IndexError: continue if mi == None: mi = v else: mi = min(v, mi) if ma == None: ma = v else: ma = max(v, ma) options.min_weight = mi options.max_weight = ma E.info("using automatic weight range from %f to %f\n" % (options.min_weight, options.max_weight)) else: lines = sys.stdin options.min_weight, options.max_weight = map( float, options.weight_range.split(",")) options.stdout.write("graph: {\n") if options.filename_format_graph: options.stdout.write( string.join(open(options.filename_format_graph, "r").readlines())) else: options.stdout.write(FORMAT_GRAPH) if options.add_edge_labels: options.stdout.write("display_edge_labels: yes\n")
id, acc, len = rx_head.match(record[0]).groups() except AttributeError, msg: E.warn("parsing error in line `%s`" % record[0]) nskipped += 1 continue if options.no_swissprot_version: acc = acc.split(".")[0] for line in record[1:]: # no Pfam-B if line.startswith("Pfam-B"): continue name, family, description, coordinates = rx_domain.match( line).groups() for c in coordinates.split(" "): start, end = [int(x) for x in c.split("-")] start -= 1 options.stdout.write( options.prefix + "\t".join(map(str, (acc, start, end, family))) + "\n") ndomains += 1 noutput += 1 E.info("ninput=%i, noutput=%i, ndomains=%i, nerrors=%i" % (ninput, noutput, ndomains, nskipped)) E.Stop() if __name__ == "__main__": sys.exit(main())
def main(argv=sys.argv):
    """Benchmark a domain decomposition (trees, parts or an arbitrary
    domain table) against a reference domain table in a pairsdb MySQL
    database.

    For every reference domain the best-overlapping predicted domain is
    looked up via SQL and overlap/coverage statistics are written to
    stdout; summary statistics and histograms go to E.openOutputFile
    targets.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-D", "--database", dest="database", type="string",
                      help="tablename to use [default=%default].")
    parser.add_option("-t", "--trees", dest="table_name_trees", type="string",
                      help="tablename with trees [default=%default].")
    parser.add_option("-r", "--parts", dest="table_name_parts", type="string",
                      help="tablename with trees [default=%default].")
    parser.add_option("-b", "--bench", dest="table_name_bench", type="string",
                      help="domain table to be benchmarked (for example: nrdb40_domo_domains_nr) [default=%default].")
    parser.add_option("-f", "--reference", dest="table_name_reference", type="string",
                      help="table of reference table (for example: nrdb40_scop_domains_nr) [default=%default].")
    parser.add_option("--bin-size", dest="bin_size", type="int",
                      help="bin size [default=%default].")
    parser.add_option("-o", "--resolution", dest="resolution", type="float",
                      help="resolution for scaling of domains [default=%default].")
    parser.add_option("-s", "--switch", dest="switch", action="store_true",
                      help="switch between coverage of reference and size ratio if coverage is 1 [default=%default].")
    parser.add_option("-k", "--skip-repeats", dest="skip_repeats", action="store_true",
                      help="[default=%default].")
    parser.add_option("-m", "--skip-tms", dest="skip_tms", action="store_true",
                      help="discard domains which contain transmembrane regions [default=%default].")
    parser.add_option("-e", "--check-selection", dest="check_selection", action="store_true",
                      help="[default=%default].")
    parser.add_option("-q", "--quality", dest="quality", action="store_true",
                      help="take only sequences which are curated [default=%default].")
    parser.add_option("--no-full-length", dest="no_full_length", action="store_true",
                      help="[default=%default].")
    parser.add_option("--only-full-length", dest="only_full_length", action="store_true",
                      help="[default=%default].")
    parser.add_option("--check-if-comparable", dest="check_if_comparable", action="store_true",
                      help="perform comparable check according to Islam95 (default level 85%) [default=%default].")
    parser.add_option("--subset", dest="subset", type="string",
                      help="use only a subset of nids [default=%default]")

    parser.set_defaults(
        database="pairsdb",
        table_name_reference=None,
        table_name_trees=None,
        table_name_parts=None,
        table_name_bench=None,
        resolution=None,
        loglevel=1,
        min_overlap=1,
        switch=0,
        combine_repeats=1,
        skip_repeats=0,
        skip_tms=0,
        discard_full_length=0,
        check_selection=0,
        selection_threshold=0.9,
        quality=None,
        no_full_length=None,
        only_full_length=None,
        ## a full length domain should cover at least 90% of a sequence
        min_length_ratio=0.9,
        check_comparable=None,
        check_comparable_level=0.85,
        bin_size=1,
        subset=None
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    dbhandle = Pairsdb()
    dbhandle.Connect(dbname=options.database)

    tbl_reference = TableDomains(dbhandle, "generic")
    tbl_reference.SetName(options.table_name_reference)

    # tbl_masks = Table_nrdb90_masks(dbhandle)
    tbl_nrdb = Table_nrdb(dbhandle)
    # todo: encapsulate this with a parameter
    tbl_nrdb.name = "nrdb40"

    # Build (a) the statement selecting the nids to benchmark and
    # (b) the per-domain statement selecting the best-overlapping
    # prediction.  Both depend on which kind of table is benchmarked.
    if options.table_name_trees:

        nids_statement = '''SELECT DISTINCT t.nid FROM %s AS t, %s AS s %%s
        WHERE t.nid = s.nid %%s''' %\
            (options.table_name_trees, options.table_name_reference)

        # optionally restrict to curated sequences
        if options.quality:
            nids_statement = nids_statement % (", nrdb_quality AS q",
                                               "AND q.nid = s.nid AND q.is_curated = 'T'")
        else:
            nids_statement = nids_statement % ("", "")

        # %(...)s placeholders are filled per reference domain via
        # `statement % locals()` in the main loop below
        statement = """
        SELECT
        t.node, t.parent, t.level, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        tablename = options.table_name_trees

    elif options.table_name_parts or options.table_name_bench:

        if options.table_name_parts:
            table_name = options.table_name_parts
        else:
            table_name = options.table_name_bench

        if options.subset:
            nids_statement = '''SELECT DISTINCT s.nid FROM %s AS s, %s AS t
            WHERE t.nid = s.nid''' % (options.subset, table_name)
        else:
            nids_statement = '''SELECT DISTINCT s.nid FROM %s AS s, %s AS r %%s
            WHERE r.nid = s.nid %%s''' %\
                (table_name, options.table_name_reference)

            if options.quality:
                nids_statement = nids_statement % (", nrdb_quality AS q",
                                                   "AND q.nid = s.nid AND q.is_curated = 'T'")
            else:
                nids_statement = nids_statement % ("", "")

        # parts/bench tables have no tree structure: node/parent/level
        # are emitted as constants 1, 0, 0
        statement = """
        SELECT
        1, 0, 0, t.start, t.end,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (t.end - t.start)) AS cov_dom,
        ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref,
        ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(tablename)s AS t
        WHERE t.nid = %(nid)i
        AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        tablename = table_name

    else:
        print "what shall I compare?"
        sys.exit(1)

    if options.check_selection:
        # NOTE(review): the WHERE clause uses LEAST(t.domain_to, %(start)i)
        # where %(end)i looks intended, and cov_dom carries a stray `i`
        # after the closing parenthesis — likely broken SQL; confirm
        # against the database before relying on this path.
        selection_statement = """
        SELECT
        t.domain_from, t.domain_to,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / (GREATEST( t.domain_to, %(end)i) - LEAST( t.domain_from, %(start)i))) AS ovl,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / (t.domain_to - t.domain_from)i) AS cov_dom,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref,
        ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(selection_tablename)s AS t
        WHERE t.domain_nid = %(nid)i
        AND (LEAST(t.domain_to, %(start)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        selection_tablename = options.table_name_parts
        options.table_name_parts = None

    parts_same_as_trees, parts_larger_than_trees, parts_smaller_than_trees, parts_much_smaller_than_trees = 0, 0, 0, 0

    min_overlap = options.min_overlap

    nids = map(lambda x: x[0], dbhandle.Execute(nids_statement).fetchall())

    overlaps = []
    cov_doms = []
    cov_refs = []

    # (nid, id, start, end) combinations already written
    touched = {}

    # NOTE(review): both branches write the identical header line
    if options.check_selection:
        options.stdout.write("NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n")
    else:
        options.stdout.write("NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n")

    E.info("--> processing %i nids" % len(nids))

    nskipped_no_assignments = 0
    nskipped_no_overlap = 0
    nskipped_wrong_domaintype = 0
    nfound = 0

    it = 0
    for nid in nids:

        it += 1

        E.debug("--> processing %i" % nid)

        domains = tbl_reference.GetDomainBoundariesForNid(nid)
        length = tbl_nrdb.GetLength(nid)

        if not domains:
            nskipped_no_assignments += 1
            continue

        if options.no_full_length and len(domains) == 1:
            ## check if domain is actually full length, otherwise keep
            id, domain_from, domain_to = domains[0]
            if float(domain_to - domain_from) / float(length) >= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        if options.only_full_length:
            if len(domains) == 1:
                id, domain_from, domain_to = domains[0]
                if float(domain_to - domain_from) / float(length) <= options.min_length_ratio:
                    nskipped_wrong_domaintype += 1
                    continue
            else:
                nskipped_wrong_domaintype += 1
                continue

        nfound += 1

        last_id = None

        x = 0
        # iteration over domains in reference
        while x < len(domains):

            id, domain_from, domain_to = domains[x]

            ##########################################################
            # process repeats: collapse consecutive domains with the
            # same id into a single span
            is_repeat = -1
            while x < len(domains) and domains[x][0] == id:
                domain_to = domains[x][2]
                x += 1
                is_repeat += 1

            if options.skip_repeats and is_repeat:
                continue

            # if options.skip_tms and tbl_masks.HasMask( nid, 2, domain_from, domain_to):
            #     continue

            ##########################################################
            ## apply resolution (rescale residue coordinates)
            if options.resolution:
                start = int(float(domain_from - 1) / options.resolution)
                end = int(float(domain_to - 1) / options.resolution) + 1
            else:
                start = domain_from
                end = domain_to

            E.debug("processing domain %s_%i_%i (scaled: %i-%i)" %
                    (id, domain_from, domain_to, start, end))

            ##########################################################
            ## get best matching domain
            s = statement % locals()

            if options.loglevel >= 4:
                print s

            result = dbhandle.Execute(s).fetchone()

            if not result:
                continue

            # note: start/end are overwritten with the prediction's
            # coordinates from here on
            node, parent, level, start, end, overlap, cov_dom, cov_ref, rat_ref = result

            key = "%i-%s-%i-%i" % (nid, id, start, end)
            if touched.has_key(key):
                continue
            else:
                touched[key] = 1

            # discard full length domains
            if options.discard_full_length:
                if options.table_name_trees:
                    if node == 0:
                        continue
                else:
                    if length == end - start:
                        continue

            if options.switch and cov_ref == 1.0:
                xcov_ref = rat_ref
            else:
                xcov_ref = cov_ref

            # check, if selection did take a domain lower or further up
            if options.check_selection:

                start = (start * 10) + 1
                end = min(end * 10 + 1, length)

                s = selection_statement % locals()
                result = dbhandle.Execute(s).fetchone()

                if result:
                    parts_from, parts_to, ovl_parts, cov_parts, cov_tree, rat_parts = result

                    if rat_parts > 1.0:
                        parts_larger_than_trees += 1
                        token = ">"
                    elif rat_parts == 1.0:
                        parts_same_as_trees += 1
                        token = "="
                    else:
                        parts_smaller_than_trees += 1
                        token = "<"
                        if rat_parts < options.selection_threshold:
                            parts_much_smaller_than_trees += 1

                    # NOTE(review): yfrom and yto are never assigned in
                    # this function — this write raises NameError when
                    # --check-selection is active; confirm intended
                    # values (probably the unscaled start/end).
                    options.stdout.write(string.join(map(str, (nid, id, domain_from, domain_to, level, yfrom, yto,
                                                               parts_from, parts_to,
                                                               overlap, cov_dom, cov_ref, rat_ref, xcov_ref,
                                                               ovl_parts, cov_parts, cov_tree, rat_parts,
                                                               token)), "\t") + "\n")
            else:
                options.stdout.write(string.join(map(str, (nid, node, parent, level, start, end,
                                                           id, start, end,
                                                           overlap, cov_dom, cov_ref, rat_ref, xcov_ref)), "\t") + "\n")

            overlaps.append(int(overlap * 100))
            cov_doms.append(int(cov_dom * 100))
            cov_refs.append(int(xcov_ref * 100))

    E.info("skipped nids because of no overlap with reference: %i" % nskipped_no_overlap)
    E.info("skipped nids because of no assignments: %i" % nskipped_no_assignments)
    E.info("skipped nids because of wrong domain type: %i" % nskipped_wrong_domaintype)
    E.info("nids in comparison: %i" % nfound)

    if options.check_selection:
        # NOTE(review): E.info is called here with two arguments,
        # printf-style — verify E.info supports this; logging-style
        # "%s" formatting may be intended.
        E.info(" parts larger than trees=", parts_larger_than_trees)
        E.info(" parts like trees=", parts_same_as_trees)
        E.info(" parts smaller than trees=", parts_smaller_than_trees)
        E.info(" parts much smaller than trees (<%f)=" % options.selection_threshold,
               parts_much_smaller_than_trees)
    else:
        # summary statistics plus one histogram per measure
        outfile_stats = E.openOutputFile("stats")
        outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader())
        outfile_stats.write("overlaps\t%s\n" % str(Stats.Summary(overlaps)))
        outfile_stats.write("domain_coverage\t%s\n" % str(Stats.Summary(cov_doms)))
        outfile_stats.write("reference_coverage\t%s\n" % str(Stats.Summary(cov_refs)))
        outfile_stats.close()

        outfile = E.openOutputFile("overlaps.histogram")
        outfile.write("bin\tcounts\n")
        Histogram.Write(outfile,
                        Histogram.Calculate(overlaps,
                                            min_value=0,
                                            increment=1,
                                            no_empty_bins=True))
        outfile.close()

        outfile = E.openOutputFile("domain_coverage.histogram")
        outfile.write("bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n")
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                            Histogram.Calculate(cov_doms,
                                                min_value=0,
                                                increment=options.bin_size,
                                                no_empty_bins=True)))
        outfile.close()

        outfile = E.openOutputFile("reference_coverage.histogram")
        outfile.write("bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n")
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                            Histogram.Calculate(cov_refs,
                                                min_value=0,
                                                increment=options.bin_size,
                                                no_empty_bins=True)))
        outfile.close()

    E.Stop()
def annotateAlignmentGraph(infile, outfiles):
    '''input the alignment graph and output a translated version of it
    and adding reference domain information.

    Writes a gzipped, annotated copy of the graph to *outfile* and a
    per-weight true/false-positive summary to *outfile_stats*.
    '''
    outfile, outfile_stats = outfiles

    # collect benchmark domains
    E.info("reading benchmark domains")
    benchmark_domains = AddaIO.readMapNid2Domains(
        gzip.open(PARAMS["eval_filename_benchmark_domains"]))

    totuple = AddaIO.toTuple
    toDomain = AddaIO.toDomain

    # build map of id to nid
    E.info("reading map between pid and nid")
    map_nid2pid = AddaIO.readMapPid2Nid(
        open(PARAMS["eval_filename_adda_nids"], "r"))

    def getOverlappingDomains(pid, start, end):
        '''get domains overlapping pid:start..end'''
        if pid not in benchmark_domains:
            return ()
        # greedy overlap testing
        r = []
        for family, domains in benchmark_domains[pid].iteritems():
            for other_start, other_end in domains:
                if start >= other_end or end <= other_start:
                    continue
                r.append((family, other_start, other_end))
        return r

    counts = E.Counter()

    if infile.endswith(".gz"):
        inf = gzip.open(infile)
    else:
        inf = open(infile)

    outf = gzip.open(outfile, "w")
    outf.write("%s\n" % "\t".join(
        ("passed", "qdomain", "sdomain", "weight",
         "qstart", "qend", "qali",
         "sstart", "send", "sali",
         "score", "naligned", "ngaps", "zscore",
         "rfamilies", "sfamilies", "rdomains", "sdomains")))

    # counts for true positives, false positives and unknown
    n, tp, fp, fn, tn, uk = 0, 0, 0, 0, 0, 0

    outf_stats = open(outfile_stats, "w")
    outf_stats.write("weight\tn\tproportion\ttp\tfp\tfn\ttn\tuk\ttpr\tfnr\n")

    last_weight = None

    for link in AddaIO.iterate_tested_links(inf):
        qnid, qstart, qend = totuple(link.qdomain)
        snid, sstart, send = totuple(link.sdomain)
        qpid = map_nid2pid[qnid]
        spid = map_nid2pid[snid]
        qfamily = sorted(getOverlappingDomains(qpid, qstart, qend))
        sfamily = sorted(getOverlappingDomains(spid, sstart, send))
        passed = link.passed == "+"
        n += 1

        # classify: unknown if neither side overlaps a benchmark
        # domain; otherwise tp/fn/fp/tn by family agreement vs whether
        # ADDA accepted the link
        if not qfamily and not sfamily:
            uk += 1
        else:
            qf = set([x[0] for x in qfamily])
            sf = set([x[0] for x in sfamily])
            if qf.intersection(sf):
                if passed:
                    tp += 1
                else:
                    fn += 1
            else:
                if passed:
                    fp += 1
                else:
                    tn += 1

        # one stats row per distinct (rounded) weight value
        weight = round(float(link.weight))
        if weight != last_weight:
            if last_weight != None:
                # NOTE(review): the tpr term has no epsilon guard, so
                # tp + fp == 0 raises ZeroDivisionError (the fnr term
                # is guarded) — confirm and align the two.
                outf_stats.write("\t".join(map(str, (last_weight,
                                                     n, tp, fp, fn, tn, uk,
                                                     float(tp) / (tp + fp),
                                                     float(fn) / (fn + tn + 0.00001),
                                                     ))) + "\n")
            last_weight = weight

        if passed:
            counts.passed += 1
        else:
            counts.failed += 1

        # translate nids back to pids for the output
        link = link._replace(qdomain=toDomain((qpid, qstart, qend)),
                             sdomain=toDomain((spid, sstart, send)))

        outf.write("%s\t%s\t%s\t%s\t%s\n" %
                   ("\t".join(map(str, link)),
                    ",".join(sorted(set([x[0] for x in qfamily]))),
                    ",".join(sorted(set([x[0] for x in sfamily]))),
                    ",".join("%s_%i_%i" % x for x in qfamily),
                    ",".join("%s_%i_%i" % x for x in sfamily)))

    inf.close()

    # final summary row; NOTE(review): both divisions are unguarded here
    outf_stats.write("\t".join(map(str, (last_weight,
                                         n, tp, fp, fn, tn, uk,
                                         float(tp) / (tp + fp),
                                         float(fn) / (fn + tn)))) + "\n")
    outf_stats.close()

    E.info("%s" % str(counts))
try: id, acc, len = rx_head.match( record[0] ).groups() except AttributeError, msg: E.warn( "parsing error in line `%s`" % record[0]) nskipped += 1 continue if options.no_swissprot_version: acc = acc.split(".")[0] for line in record[1:]: # no Pfam-B if line.startswith( "Pfam-B"): continue name, family, description, coordinates = rx_domain.match( line ).groups() for c in coordinates.split( " "): start,end = [ int(x) for x in c.split("-") ] start -= 1 options.stdout.write( options.prefix + "\t".join( map(str, (acc, start, end, family) ) ) + "\n" ) ndomains += 1 noutput += 1 E.info( "ninput=%i, noutput=%i, ndomains=%i, nerrors=%i" % (ninput, noutput, ndomains, nskipped)) E.Stop() if __name__ == "__main__": sys.exit(main())
pass ## create new table statement = "CREATE TABLE %s ( %s );" % (options.tablename, ", ".join( columns)) E.debug( "table create:\n# %s" % (statement ) ) try: cc = dbhandle.cursor() cc.execute(statement) cc.close() except error, msg: options.stderr.write( "table creation failed: statement=\n %s\n" % (statement ) ) raise error, msg E.info("table %s created successfully." % options.tablename ) return take, map_column2type, ignored def main(): parser = optparse.OptionParser( version = "%prog version: $Id$", usage = USAGE) parser.add_option( "--dialect", dest="dialect", type="string", help="csv dialect to use [default=%default]." ) parser.add_option("-m", "--map", dest="map", type="string", action="append", help="explicit mapping function for columns The format is column:type (e.g.: length:int) [default=%default]." ) parser.add_option("-t", "--table", dest="tablename", type="string", help="table name for all backends [default=%default]." )
## create new table statement = "CREATE TABLE %s ( %s );" % (options.tablename, ", ".join(columns)) E.debug("table create:\n# %s" % (statement)) try: cc = dbhandle.cursor() cc.execute(statement) cc.close() except error, msg: options.stderr.write("table creation failed: statement=\n %s\n" % (statement)) raise error, msg E.info("table %s created successfully." % options.tablename) return take, map_column2type, ignored def main(): parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE) parser.add_option("--dialect", dest="dialect", type="string", help="csv dialect to use [default=%default].") parser.add_option( "-m",
def main(): parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE) parser.add_option("--method", dest="method", type="choice", choices=("view", "align", "pileup", "profile"), help="method to perform [default=%default].") parser.add_option("--mode", dest="mode", type="choice", choices=("global", "local"), help="alignment mode [default=%default].") parser.add_option("--gop", dest="gop", type="float", help="gap opening penalty [default=%default].") parser.add_option("--gep", dest="gep", type="float", help="gap extension penalty [default=%default].") parser.set_defaults( filename_graph="adda.graph", filename_index="adda.graph.idx", method="view", filename_fasta="adda", filename_config="adda.ini", append=False, force=False, mode="local", gop=-10.0, gep=-1.0, ) (options, args) = E.Start(parser) config = AddaIO.ConfigParser() config.read(os.path.expanduser(options.filename_config)) index = cadda.IndexedNeighbours(options.filename_graph, options.filename_index) alignlib.getDefaultToolkit().setEncoder( alignlib.getEncoder(alignlib.Protein20)) alignlib.getDefaultToolkit().setRegularizor( alignlib.makeRegularizorDirichletPrecomputed()) alignlib.getDefaultToolkit().setLogOddor( alignlib.makeLogOddorDirichlet(0.3)) alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor()) fasta = IndexedFasta.IndexedFasta(options.filename_fasta) align = AddaProfiles.AddaProfiles(config, fasta=fasta) if options.method == "view": for nid in args: nid = int(args[0]) neighbours = index.getNeighbours(nid) for n in neighbours: print str(n) elif options.method == "pileup": if "_" in args[0]: nid, start, end = AddaIO.toTuple(args[0]) else: nid = int(args[0]) start, end = None, None neighbours = index.getNeighbours(nid) mali = align.buildMali(nid, neighbours) options.stdout.write("%s\n" % str(mali)) elif options.method == "profile": if "_" in args[0]: nid, start, end = AddaIO.toTuple(args[0]) else: nid = int(args[0]) start, end = None, None neighbours = 
index.getNeighbours(nid) mali = align.buildMali(nid, neighbours) prof = alignlib.makeProfile(mali) E.info("nid: %i, neighours=%i" % (nid, len(neighbours))) if start != None: prof.useSegment(start, end) prof.prepare() options.stdout.write("%s\n" % str(prof)) elif options.method == "align": nid1, start1, end1 = AddaIO.toTuple(args[0]) nid2, start2, end2 = AddaIO.toTuple(args[1]) align = AddaProfiles.AddaProfiles(config, fasta=fasta) if options.mode == "local": mode = alignlib.ALIGNMENT_LOCAL else: mode = alignlib.ALIGNMENT_GLOBAL alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep) def _buildProfile(nid, start, end): neighbours = index.getNeighbours(nid) mali = align.buildMali(nid, neighbours) prof = alignlib.makeProfile(mali) E.info("nid: %i, neighours=%i" % (nid, len(neighbours))) prof.useSegment(start, end) prof.prepare() seq = fasta.getSequence(nid) return alignlib.makeSequence(seq), prof seq1, prof1 = _buildProfile(nid1, start1, end1) seq2, prof2 = _buildProfile(nid2, start2, end2) result = alignlib.makeAlignmentVector() alignator.align(result, prof1, prof2) E.debug("%s\n" % str(result)) options.stdout.write( "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %\ (nid1, nid2, result.getScore(), result.getLength(), result.getNumGaps(), result.getRowFrom(), result.getRowTo(), result.getColFrom(), result.getColTo())) f = alignlib.AlignmentFormatExplicit(result, seq1, seq2) options.stdout.write("%s\n" % str(f)) E.Stop()
def main():
    """Translate nids to pids in selected columns of tab-separated input.

    Reads from stdin, writes to stdout. Lines starting with '#' and the
    first (header) line are passed through unchanged; rows whose ids
    cannot be parsed or mapped are skipped with a warning.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-n", "--nids", dest="filename_nids", type="string",
                      help="filename with nids[default=%default].")

    parser.add_option("-c", "--column", dest="columns", type="int",
                      action="append",
                      help="columns with nids to translate (1-based) [default=%default].")

    parser.add_option("-d", "--is-domains", dest="is_domains",
                      action="store_true",
                      help="translate domain ids [default=%default].")

    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="invert mapping [default=%default].")

    parser.add_option("-e", "--no-header", dest="no_header",
                      action="store_true",
                      help="file has no header [default=%default].")

    parser.set_defaults(
        filename_nids="adda.nids",
        columns=[],
        is_domains=False,
        invert=False,
        # bug fix: the default was registered under the key 'noheader'
        # while the option's dest is 'no_header'
        no_header=False,
    )

    (options, args) = E.Start(parser)

    map_nid2pid = AddaIO.readMapPid2Nid(open(options.filename_nids, "r"))

    if options.invert:
        E.info("inverting mapping")
        map_nid2pid = dict(
            [(int(x[1]), str(x[0])) for x in map_nid2pid.iteritems()])

    if len(options.columns) == 0:
        options.columns = [1]

    # convert 1-based column numbers to 0-based indices
    columns = [x - 1 for x in options.columns]

    toTuple, toDomain = AddaIO.toTuple, AddaIO.toDomain
    first = not options.no_header
    is_domains = options.is_domains

    ninput, noutput, nskipped = 0, 0, 0

    for line in options.stdin:
        if line.startswith("#"):
            options.stdout.write(line)
            continue

        if first:
            # pass the header line through untranslated
            options.stdout.write(line)
            first = False
            continue

        ninput += 1
        data = line[:-1].split("\t")

        for x in columns:
            if is_domains:
                try:
                    d = toTuple(data[x])
                except ValueError:
                    E.warn("could not parse domain `%s`" % data[x])
                    nskipped += 1
                    break
                try:
                    data[x] = toDomain(
                        (str(map_nid2pid[d[0]]), d[1], d[2]))
                except (IndexError, KeyError):
                    E.warn("could not map domain `%s`" % data[x])
                    nskipped += 1
                    break
            else:
                try:
                    data[x] = str(map_nid2pid[int(data[x])])
                # bug fix: a missing nid raises KeyError (dict lookup);
                # the previous `except IndexError` never caught it
                except (IndexError, KeyError):
                    E.warn("could not map nid `%s`" % data[x])
                    nskipped += 1
                    break
        else:
            # only rows whose columns all mapped are written out
            options.stdout.write("%s\n" % "\t".join(data))
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
    E.Stop()
def main(): parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE) parser.add_option( "--method", dest="method", type="choice", choices=("view", "align", "pileup", "profile"), help="method to perform [default=%default].", ) parser.add_option( "--mode", dest="mode", type="choice", choices=("global", "local"), help="alignment mode [default=%default]." ) parser.add_option("--gop", dest="gop", type="float", help="gap opening penalty [default=%default].") parser.add_option("--gep", dest="gep", type="float", help="gap extension penalty [default=%default].") parser.set_defaults( filename_graph="adda.graph", filename_index="adda.graph.idx", method="view", filename_fasta="adda", filename_config="adda.ini", append=False, force=False, mode="local", gop=-10.0, gep=-1.0, ) (options, args) = E.Start(parser) config = AddaIO.ConfigParser() config.read(os.path.expanduser(options.filename_config)) index = cadda.IndexedNeighbours(options.filename_graph, options.filename_index) alignlib.getDefaultToolkit().setEncoder(alignlib.getEncoder(alignlib.Protein20)) alignlib.getDefaultToolkit().setRegularizor(alignlib.makeRegularizorDirichletPrecomputed()) alignlib.getDefaultToolkit().setLogOddor(alignlib.makeLogOddorDirichlet(0.3)) alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor()) fasta = IndexedFasta.IndexedFasta(options.filename_fasta) align = AddaProfiles.AddaProfiles(config, fasta=fasta) if options.method == "view": for nid in args: nid = int(args[0]) neighbours = index.getNeighbours(nid) for n in neighbours: print str(n) elif options.method == "pileup": if "_" in args[0]: nid, start, end = AddaIO.toTuple(args[0]) else: nid = int(args[0]) start, end = None, None neighbours = index.getNeighbours(nid) mali = align.buildMali(nid, neighbours) options.stdout.write("%s\n" % str(mali)) elif options.method == "profile": if "_" in args[0]: nid, start, end = AddaIO.toTuple(args[0]) else: nid = int(args[0]) start, end = None, None neighbours = 
index.getNeighbours(nid) mali = align.buildMali(nid, neighbours) prof = alignlib.makeProfile(mali) E.info("nid: %i, neighours=%i" % (nid, len(neighbours))) if start != None: prof.useSegment(start, end) prof.prepare() options.stdout.write("%s\n" % str(prof)) elif options.method == "align": nid1, start1, end1 = AddaIO.toTuple(args[0]) nid2, start2, end2 = AddaIO.toTuple(args[1]) align = AddaProfiles.AddaProfiles(config, fasta=fasta) if options.mode == "local": mode = alignlib.ALIGNMENT_LOCAL else: mode = alignlib.ALIGNMENT_GLOBAL alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep) def _buildProfile(nid, start, end): neighbours = index.getNeighbours(nid) mali = align.buildMali(nid, neighbours) prof = alignlib.makeProfile(mali) E.info("nid: %i, neighours=%i" % (nid, len(neighbours))) prof.useSegment(start, end) prof.prepare() seq = fasta.getSequence(nid) return alignlib.makeSequence(seq), prof seq1, prof1 = _buildProfile(nid1, start1, end1) seq2, prof2 = _buildProfile(nid2, start2, end2) result = alignlib.makeAlignmentVector() alignator.align(result, prof1, prof2) E.debug("%s\n" % str(result)) options.stdout.write( "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" % ( nid1, nid2, result.getScore(), result.getLength(), result.getNumGaps(), result.getRowFrom(), result.getRowTo(), result.getColFrom(), result.getColTo(), ) ) f = alignlib.AlignmentFormatExplicit(result, seq1, seq2) options.stdout.write("%s\n" % str(f)) E.Stop()
def main(): global L parser = optparse.OptionParser( version = "%prog version: $Id$", usage = USAGE ) parser.add_option( "--config", dest="filename_config", type="string", help="configuration file [default=%default].") parser.add_option( "--force", dest="force", action="store_true", help="overwrite existing files [default=%default].") parser.add_option( "--continue", dest="append", action="store_true", help="continue from an aborted run and append to existing files [default=%default].") parser.add_option( "--test", dest="test", type="int", help="run a test with first # sequences [default=%default]") parser.add_option( "--num-jobs", dest="num_jobs", type="int", help="use # processes. If not set, the number of CPUs/cores is taken [default=%default]") parser.add_option( "--chunks", dest="chunks", type="string", help = "work on one or more chunks only. Provide a comma-separated list. [default=%default]" ) parser.add_option( "--command", dest="command", type="choice", choices=( "sequences", "blast", "fit", "graph", "index", "check-index", "profiles", "segment", "optimise", "convert", "mst", "mst-components", "align", "cluster", "realign", "families", "stats", "summary"), help="perform a command [default=%default]" ) parser.add_option( "--start-at", dest="start_at", type="string", help="start at sequence [default=%default]") parser.add_option( "--stop-at", dest="stop_at", type="string", help="stop at sequenec [default=%default]") parser.set_defaults( filename_config = "adda.ini", command = None, start_at = None, stop_at = None, force = False, append = False, test = None, num_jobs = None, chunks = "all", ) (options, args) = E.Start( parser ) # setup logging if options.loglevel == 0: lvl = logging.ERROR elif options.loglevel == 1: lvl = logging.INFO else: lvl = logging.DEBUG logQueue = multiprocessing.Queue(100) handler = Logger.MultiProcessingLogHandler(logging.FileHandler( "adda.log", "a"), logQueue) handler.setFormatter( logging.Formatter( '%(asctime)s pid=%(process)-8d 
%(name)-12s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M' ) ) logging.getLogger('adda').addHandler(handler) logging.getLogger('adda').setLevel( lvl ) E.setLogger( logging.getLogger( "adda" ) ) L = logging.getLogger( "adda" ) config = AddaIO.ConfigParser() config.read( os.path.expanduser( options.filename_config ) ) if len(args) == 0: if not options.command: raise ValueError("specify at least one command") elif len(args) == 1: options.command = args[0] else: raise ValueError("one command line argument is sufficient.") ## collect modules and initialise them map_module = { 'fit' : AddaFit.AddaFit, 'segment' : AddaSegment.AddaSegment, 'blast' : AddaBlast.AddaBlast, 'graph' : AddaGraph.AddaGraph, 'stats' : AddaStats.AddaStats, 'profiles' : AddaProfiles.AddaProfiles, 'realign' : AddaAlign.AddaRealign, 'index' : AddaIndex.AddaIndexBuild, 'check-index' : AddaIndex.AddaIndexCheck, 'optimise' : AddaOptimise.AddaOptimise, 'sequences' : AddaSequences.AddaSequences, 'convert' : AddaConvert.AddaConvert, 'mst' : AddaMst.AddaMst, 'mst-components' : AddaComponentsMst.AddaComponentsMst, 'align' : AddaAlign.AddaAlign, 'cluster' : AddaCluster.AddaCluster, 'families' : AddaFamilies.AddaFamilies, 'summary' : AddaSummary.AddaSummary, } try: fasta = IndexedFasta.IndexedFasta( config.get( "files", "output_fasta", "adda" ) ) except KeyError: fasta = None if options.num_jobs == 1: run_parallel = runSequentially else: run_parallel = runParallel kwargs = { "loglevel" : options.loglevel, "append" : options.append, "force": options.force } if options.command == "index": module = map_module[options.command](config, fasta = fasta, **kwargs ) if module.isComplete(): E.info("output of command `%s` present and complete" % options.command ) else: filename_graph = config.get( "files", "input_graph", "pairsdb_40x40.links.gz") if "," in filename_graph: filename_graph = filename_graph.split(",") # permit parallel processing of multiple files run_parallel( run_on_files, filename = filename_graph, 
options = options, module = map_module[options.command], config = config, kwargs = kwargs, ) nchunks = len( filename_graph ) module = map_module[options.command]( config, chunk = 0, num_chunks = nchunks, **kwargs ) if not module.isComplete(): L.info( "merging" ) if not module.merge(): raise ValueError("error while merging for `%s`" % options.command ) else: # process single file - no hazzle. module.startUp() module.run() module.finish() if options.command in ("sequences", "stats", "optimise", "convert", "mst", "mst-components", "cluster", "families", "summary" ): module = map_module[options.command]( config, fasta = fasta, **kwargs ) if module.isComplete(): E.info("output of command `%s` present and complete" % options.command ) else: module.startUp() module.run() module.finish() elif options.command in ("fit", "segment"): run_on_graph = RunOnGraph( config, options.command ) run_parallel( run_on_graph, filename = config.get( "files", "input_graph", "adda.graph" ), options = options, module = map_module[options.command], config = config, kwargs = kwargs ) if not merge( options, module = map_module[options.command], config = config, fasta = fasta ): E.Stop() return elif options.command in ("align" ): run_parallel( run_on_file, filename = config.get( "files", "output_mst", "adda.mst" ), options = options, module = map_module[options.command], config = config, kwargs = kwargs ) merge( options, module = map_module[options.command], config = config, fasta = fasta ) elif options.command in ("realign" ): run_parallel( run_on_file, filename = config.get( "files", "output_align", "adda.align" ), options = options, module = map_module[options.command], config = config, kwargs = kwargs ) merge( options, module = map_module[options.command], config = config, fasta = fasta )
continue except IndexError: continue if mi == None: mi = v else: mi = min(v, mi) if ma == None: ma = v else: ma = max(v, ma) options.min_weight = mi options.max_weight = ma E.info( "using automatic weight range from %f to %f\n" % (options.min_weight, options.max_weight) ) else: lines = sys.stdin options.min_weight, options.max_weight = map(float, options.weight_range.split(",")) options.stdout.write( "graph: {\n" ) if options.filename_format_graph: options.stdout.write( string.join(open(options.filename_format_graph, "r").readlines() ) ) else: options.stdout.write( FORMAT_GRAPH ) if options.add_edge_labels: options.stdout.write( "display_edge_labels: yes\n" ) left_nodes = {}
def main(argv=sys.argv): parser = optparse.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-D", "--database", dest="database", type="string", help="tablename to use [default=%default].") parser.add_option("-t", "--trees", dest="table_name_trees", type="string", help="tablename with trees [default=%default].") parser.add_option("-r", "--parts", dest="table_name_parts", type="string", help="tablename with trees [default=%default].") parser.add_option( "-b", "--bench", dest="table_name_bench", type="string", help= "domain table to be benchmarked (for example: nrdb40_domo_domains_nr) [default=%default]." ) parser.add_option( "-f", "--reference", dest="table_name_reference", type="string", help= "table of reference table (for example: nrdb40_scop_domains_nr) [default=%default]." ) parser.add_option("--bin-size", dest="bin_size", type="int", help="bin size [default=%default].") parser.add_option( "-o", "--resolution", dest="resolution", type="float", help="resolution for scaling of domains [default=%default].") parser.add_option( "-s", "--switch", dest="switch", action="store_true", help= "switch between coverage of reference and size ratio if coverage is 1 [default=%default]." ) parser.add_option("-k", "--skip-repeats", dest="skip_repeats", action="store_true", help="[default=%default].") parser.add_option( "-m", "--skip-tms", dest="skip_tms", action="store_true", help= "discard domains which contain transmembrane regions [default=%default]." 
) parser.add_option("-e", "--check-selection", dest="check_selection", action="store_true", help="[default=%default].") parser.add_option( "-q", "--quality", dest="quality", action="store_true", help="take only sequences which are curated [default=%default].") parser.add_option("--no-full-length", dest="no_full_length", action="store_true", help="[default=%default].") parser.add_option("--only-full-length", dest="only_full_length", action="store_true", help="[default=%default].") parser.add_option( "--check-if-comparable", dest="check_if_comparable", action="store_true", help= "perform comparable check according to Islam95 (default level 85%) [default=%default]." ) parser.add_option("--subset", dest="subset", type="string", help="use only a subset of nids [default=%default]") parser.set_defaults( database="pairsdb", table_name_reference=None, table_name_trees=None, table_name_parts=None, table_name_bench=None, resolution=None, loglevel=1, min_overlap=1, switch=0, combine_repeats=1, skip_repeats=0, skip_tms=0, discard_full_length=0, check_selection=0, selection_threshold=0.9, quality=None, no_full_length=None, only_full_length=None, ## a full length domain should cover at least 90% of a sequence min_length_ratio=0.9, check_comparable=None, check_comparable_level=0.85, bin_size=1, subset=None) (options, args) = E.Start(parser, argv=argv, add_output_options=True) dbhandle = Pairsdb() dbhandle.Connect(dbname=options.database) tbl_reference = TableDomains(dbhandle, "generic") tbl_reference.SetName(options.table_name_reference) # tbl_masks = Table_nrdb90_masks(dbhandle) tbl_nrdb = Table_nrdb(dbhandle) # todo: encapsulate this with a parameter tbl_nrdb.name = "nrdb40" if options.table_name_trees: nids_statement = '''SELECT DISTINCT t.nid FROM %s AS t, %s AS s %%s WHERE t.nid = s.nid %%s''' %\ (options.table_name_trees, options.table_name_reference) if options.quality: nids_statement = nids_statement % ( ", nrdb_quality AS q", "AND q.nid = s.nid AND q.is_curated = 'T'") 
else: nids_statement = nids_statement % ("", "") statement = """ SELECT t.node, t.parent, t.level, t.start, t.end, ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl, ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (t.end - t.start)) AS cov_dom, ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref, ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref FROM %(tablename)s AS t WHERE t.nid = %(nid)i AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i) ORDER BY ovl DESC LIMIT 1 """ tablename = options.table_name_trees elif options.table_name_parts or options.table_name_bench: if options.table_name_parts: table_name = options.table_name_parts else: table_name = options.table_name_bench if options.subset: nids_statement = '''SELECT DISTINCT s.nid FROM %s AS s, %s AS t WHERE t.nid = s.nid''' % (options.subset, table_name) else: nids_statement = '''SELECT DISTINCT s.nid FROM %s AS s, %s AS r %%s WHERE r.nid = s.nid %%s''' %\ (table_name, options.table_name_reference) if options.quality: nids_statement = nids_statement % ( ", nrdb_quality AS q", "AND q.nid = s.nid AND q.is_curated = 'T'") else: nids_statement = nids_statement % ("", "") statement = """ SELECT 1, 0, 0, t.start, t.end, ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl, ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (t.end - t.start)) AS cov_dom, ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref, ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref FROM %(tablename)s AS t WHERE t.nid = %(nid)i AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i) ORDER BY ovl DESC LIMIT 1 """ tablename = table_name else: print "what shall I compare?" 
sys.exit(1) if options.check_selection: selection_statement = """ SELECT t.domain_from, t.domain_to, ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / (GREATEST( t.domain_to, %(end)i) - LEAST( t.domain_from, %(start)i))) AS ovl, ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / (t.domain_to - t.domain_from)i) AS cov_dom, ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) / (%(end)i - %(start)i)) AS cov_ref, ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref FROM %(selection_tablename)s AS t WHERE t.domain_nid = %(nid)i AND (LEAST(t.domain_to, %(start)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i) ORDER BY ovl DESC LIMIT 1 """ selection_tablename = options.table_name_parts options.table_name_parts = None parts_same_as_trees, parts_larger_than_trees, parts_smaller_than_trees, parts_much_smaller_than_trees = 0, 0, 0, 0 min_overlap = options.min_overlap nids = map(lambda x: x[0], dbhandle.Execute(nids_statement).fetchall()) overlaps = [] cov_doms = [] cov_refs = [] touched = {} if options.check_selection: options.stdout.write( "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n" ) else: options.stdout.write( "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n" ) E.info("--> processing %i nids" % len(nids)) nskipped_no_assignments = 0 nskipped_no_overlap = 0 nskipped_wrong_domaintype = 0 nfound = 0 it = 0 for nid in nids: it += 1 E.debug("--> processing %i" % nid) domains = tbl_reference.GetDomainBoundariesForNid(nid) length = tbl_nrdb.GetLength(nid) if not domains: nskipped_no_assignments += 1 continue if options.no_full_length and len(domains) == 1: ## check if domain is actually full length, otherwise keep id, domain_from, domain_to = domains[0] if float(domain_to - domain_from) / float(length) >= options.min_length_ratio: nskipped_wrong_domaintype += 1 continue if options.only_full_length: if len(domains) 
== 1: id, domain_from, domain_to = domains[0] if float(domain_to - domain_from) / float( length) <= options.min_length_ratio: nskipped_wrong_domaintype += 1 continue else: nskipped_wrong_domaintype += 1 continue nfound += 1 last_id = None x = 0 # iteration over domains in reference while x < len(domains): id, domain_from, domain_to = domains[x] ########################################################## # process repeats is_repeat = -1 while x < len(domains) and domains[x][0] == id: domain_to = domains[x][2] x += 1 is_repeat += 1 if options.skip_repeats and is_repeat: continue # if options.skip_tms and tbl_masks.HasMask( nid, 2, domain_from, domain_to): # continue ########################################################## ## apply resolution if options.resolution: start = int(float(domain_from - 1) / options.resolution) end = int(float(domain_to - 1) / options.resolution) + 1 else: start = domain_from end = domain_to E.debug( "processing domain %s_%i_%i (scaled: %i-%i)" % \ ( id, domain_from, domain_to, start, end)) ########################################################## ## get best matching domain s = statement % locals() if options.loglevel >= 4: print s result = dbhandle.Execute(s).fetchone() if not result: continue node, parent, level, start, end, overlap, cov_dom, cov_ref, rat_ref = result key = "%i-%s-%i-%i" % (nid, id, start, end) if touched.has_key(key): continue else: touched[key] = 1 # discard full length domains if options.discard_full_length: if options.table_name_trees: if node == 0: continue else: if length == end - start: continue if options.switch and cov_ref == 1.0: xcov_ref = rat_ref else: xcov_ref = cov_ref # check, if selection did take a domain lower or further up if options.check_selection: start = (start * 10) + 1 end = min(end * 10 + 1, length) s = selection_statement % locals() result = dbhandle.Execute(s).fetchone() if result: parts_from, parts_to, ovl_parts, cov_parts, cov_tree, rat_parts = result if rat_parts > 1.0: 
parts_larger_than_trees += 1 token = ">" elif rat_parts == 1.0: parts_same_as_trees += 1 token = "=" else: parts_smaller_than_trees += 1 token = "<" if rat_parts < options.selection_threshold: parts_much_smaller_than_trees += 1 options.stdout.write( string.join( map(str, (nid, id, domain_from, domain_to, level, yfrom, yto, parts_from, parts_to, overlap, cov_dom, cov_ref, rat_ref, xcov_ref, ovl_parts, cov_parts, cov_tree, rat_parts, token)), "\t") + "\n") else: options.stdout.write( string.join( map(str, (nid, node, parent, level, start, end, id, start, end, overlap, cov_dom, cov_ref, rat_ref, xcov_ref)), "\t") + "\n") overlaps.append(int(overlap * 100)) cov_doms.append(int(cov_dom * 100)) cov_refs.append(int(xcov_ref * 100)) E.info("skipped nids because of no overlap with reference: %i" % nskipped_no_overlap) E.info("skipped nids because of no assignments: %i" % nskipped_no_assignments) E.info("skipped nids because of wrong domain type: %i" % nskipped_wrong_domaintype) E.info("nids in comparison: %i" % nfound) if options.check_selection: E.info(" parts larger than trees=", parts_larger_than_trees) E.info(" parts like trees=", parts_same_as_trees) E.info(" parts smaller than trees=", parts_smaller_than_trees) E.info( " parts much smaller than trees (<%f)=" % options.selection_threshold, parts_much_smaller_than_trees) else: outfile_stats = E.openOutputFile("stats") outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader()) outfile_stats.write("overlaps\t%s\n" % str(Stats.Summary(overlaps))) outfile_stats.write("domain_coverage\t%s\n" % str(Stats.Summary(cov_doms))) outfile_stats.write("reference_coverage\t%s\n" % str(Stats.Summary(cov_refs))) outfile_stats.close() outfile = E.openOutputFile("overlaps.histogram") outfile.write("bin\tcounts\n") Histogram.Write( outfile, Histogram.Calculate(overlaps, min_value=0, increment=1, no_empty_bins=True)) outfile.close() outfile = E.openOutputFile("domain_coverage.histogram") outfile.write( 
"bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n" ) Histogram.Write( outfile, Histogram.AddRelativeAndCumulativeDistributions( Histogram.Calculate(cov_doms, min_value=0, increment=options.bin_size, no_empty_bins=True))) outfile.close() outfile = E.openOutputFile("reference_coverage.histogram") outfile.write( "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n" ) Histogram.Write( outfile, Histogram.AddRelativeAndCumulativeDistributions( Histogram.Calculate(cov_refs, min_value=0, increment=options.bin_size, no_empty_bins=True))) outfile.close() E.Stop()