def main( argv = sys.argv ):
    """Apply one graph method to the data read from ``options.stdin``.

    The method is selected with ``--method``:

    * ``translate``: replace the nid part of columns 2 and 3 of every
      non-comment line with the id found in ``--filename-map``.
    * ``add-family``: append the family of the query and sbjct domain of
      every tested link (``na`` when the domain is not in
      ``--filename-families``).
    * ``shortest-path``: write the edges on the shortest path between
      ``--node1`` and ``--node2``.
    * ``components``: write a ``component``/``node`` table.

    argv -- command line, handed on to ``E.Start``.

    Raises ValueError if ``translate`` is requested without
    ``--filename-map`` or ``add-family`` without ``--filename-families``.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    # dest has to be "graph_format": that is the attribute set in
    # set_defaults() and read below. The previous dest ("graph-format")
    # stored the value where nothing ever looked at it.
    parser.add_option("-o", "--format", dest="graph_format", type="choice",
                      choices=("alignments",),
                      help="graph format [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("shortest-path", "translate", "components", "add-family"),
                      help="methods to apply [default=%default].")

    parser.add_option("-a", "--filename-map", dest="filename_map", type="string",
                      help="filename mapping ids to nids (used for translation) [default=%default].")

    parser.add_option("-1", "--node1", dest="node1", type="string",
                      help="first node for path calculation [default=%default].")

    parser.add_option("-2", "--node2", dest="node2", type="string",
                      help="second node for path calculation [default=%default].")

    parser.add_option("-f", "--filename-families", dest="filename_families", type="string",
                      help="filename with domain families [default=%default].")

    parser.set_defaults(
        method=None,
        graph_format="alignments",
        filename_map=None,
        node1=None,
        node2=None,
        filename_families=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    map_domain2family = None
    if options.filename_families is not None:
        E.info("reading families from %s" % options.filename_families)
        map_domain2family = {}
        # the context manager closes the file even if a line is malformed
        with open(options.filename_families, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                if line.startswith("nid"):
                    continue
                nid, start, end, family = line[:-1].split("\t")
                pid = bytes("%s_%s_%s" % (nid, start, end))
                map_domain2family[pid] = bytes(family)
        E.info("read %i domains" % len(map_domain2family))

    if options.method == "translate":

        if not options.filename_map:
            # previously this surfaced as a NameError on the first data line
            raise ValueError("method 'translate' requires --filename-map")

        E.info("reading map from %s" % options.filename_map)
        with open(options.filename_map, "r") as infile:
            map_id2nid = AddaIO.readMapId2Nid(infile)
        map_nid2id = dict([[v, k] for k, v in map_id2nid.iteritems()])

        def translate_alignments(line):
            """Map the nid prefix of the 2nd and 3rd column to its id."""
            if line.startswith("passed"):
                return line
            data = line.split("\t")
            for column in (1, 2):
                parts = data[column].split("_")
                try:
                    data[column] = "%s_%s_%s" % (map_nid2id[int(parts[0])],
                                                 parts[1], parts[2])
                except KeyError:
                    sys.stderr.write("could not map: %s\n" % str(parts))
                    raise
            return "\t".join(data)

        if options.graph_format == "alignments":
            translator = translate_alignments

        # comment lines are passed through untouched
        for line in options.stdin:
            if not line.startswith("#"):
                line = translator(line)
            options.stdout.write(line)

        E.Stop()
        return

    elif options.method == "add-family":

        if map_domain2family is None:
            # previously this surfaced as a NameError on the first link
            raise ValueError("method 'add-family' requires --filename-families")

        options.stdout.write("%s\tqfamily\tsfamily\n" %
                             ("\t".join(AddaIO.TestedLink._fields)))
        for link in AddaIO.iterate_tested_links(options.stdin):
            qfamily = map_domain2family.get(link.qdomain, "na")
            sfamily = map_domain2family.get(link.sdomain, "na")
            options.stdout.write("%s\t%s\t%s\n" %
                                 ("\t".join(map(str, link)), qfamily, sfamily))

        E.Stop()
        return

    t = time.time()
    if options.graph_format == "alignments":
        map_vertex2id, map_id2vertex, G = readAlignmentGraph(options.stdin)

    E.info("graph read in %i seconds" % (time.time() - t))
    t = time.time()

    if options.method == "shortest-path":
        source = map_vertex2id[options.node1]
        target = map_vertex2id[options.node2]

        E.debug("shortest path between %s:%i and %s:%i" %
                (options.node1, source, options.node2, target))

        paths = G.get_shortest_paths(source, to=(target,))
        # NOTE(review): the result is indexed by the target vertex id;
        # confirm this matches the return shape of the graph library in use.
        p = paths[target]

        if len(p) == 0:
            # nothing to print; previously execution fell through to p[0]
            # and died with an IndexError
            E.info("no path between %s:%i and %s:%i" %
                   (options.node1, source, options.node2, target))
        else:
            l, last_node = p[0], map_id2vertex[p[0]]
            for x in p[1:]:
                node = map_id2vertex[x]
                ei = G.get_eid(x, l)
                options.stdout.write("%s\t%s\t%s\n" %
                                     (last_node, node, G.es[ei]["info"]))
                l, last_node = x, node

    elif options.method == "components":
        # written to options.stdout like all other output of this script
        options.stdout.write("component\tnode\n")
        for component_id, component in enumerate(nx.connected_components(G)):
            for c in component:
                options.stdout.write("%i\t%s\n" % (component_id, c))

    E.info("%s: %i seconds" % (options.method, time.time() - t))
    E.Stop()
def main():
    """Query the neighbour index and build or align profiles.

    The positional arguments are nids or domains (``nid_start_end``); the
    action is selected with ``--method``:

    * ``view``: write the neighbours of every nid given.
    * ``pileup``: write the multiple alignment built from the neighbours
      of the first argument.
    * ``profile``: write the profile built from that multiple alignment,
      restricted to the segment if a domain was given.
    * ``align``: align the profiles of the first two arguments (both must
      be domains) and write the alignment summary.

    Raises ValueError if a method is given fewer arguments than it needs.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method", dest="method", type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph, options.filename_index)

    toolkit = alignlib.getDefaultToolkit()
    toolkit.setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit.setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit.setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _requireArgs(count):
        """Fail with a readable message instead of a bare IndexError."""
        if len(args) < count:
            raise ValueError("method '%s' requires %i argument(s), %i given" %
                             (options.method, count, len(args)))

    def _parseTarget(arg):
        """Return (nid, start, end); start/end are None for a plain nid."""
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    def _buildProfile(nid, start, end):
        """Build the profile of nid, restricted to start-end if given."""
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        return prof

    if options.method == "view":
        # look up every argument; the loop used to re-read args[0] each time
        for arg in args:
            for n in index.getNeighbours(int(arg)):
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":
        _requireArgs(1)
        nid, start, end = _parseTarget(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        _requireArgs(1)
        nid, start, end = _parseTarget(args[0])
        prof = _buildProfile(nid, start, end)
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        _requireArgs(2)
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        prof1 = _buildProfile(nid1, start1, end1)
        seq1 = alignlib.makeSequence(fasta.getSequence(nid1))
        prof2 = _buildProfile(nid2, start2, end2)
        seq2 = alignlib.makeSequence(fasta.getSequence(nid2))

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %
            (nid1, nid2,
             result.getScore(),
             result.getLength(),
             result.getNumGaps(),
             result.getRowFrom(), result.getRowTo(),
             result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
def main( argv = sys.argv ):
    """Compare a domain table against a reference domain table.

    For every nid shared by the reference table (``--reference``) and the
    table under test (``--trees``, ``--parts`` or ``--bench``), each
    reference domain is matched to the best overlapping domain in the
    tested table. One row per match is written to ``options.stdout``.
    Unless ``--check-selection`` is given, summary statistics and
    histograms of overlap and coverage are written to the output files
    ``stats``, ``overlaps.histogram``, ``domain_coverage.histogram`` and
    ``reference_coverage.histogram``.

    argv -- command line, handed on to ``E.Start``.

    Exits with status 1 if no table to compare was given.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-D", "--database", dest="database", type="string",
                      help="tablename to use [default=%default].")
    parser.add_option("-t", "--trees", dest="table_name_trees", type="string",
                      help="tablename with trees [default=%default].")
    parser.add_option("-r", "--parts", dest="table_name_parts", type="string",
                      help="tablename with trees [default=%default].")
    parser.add_option("-b", "--bench", dest="table_name_bench", type="string",
                      help="domain table to be benchmarked (for example: nrdb40_domo_domains_nr) [default=%default].")
    parser.add_option("-f", "--reference", dest="table_name_reference", type="string",
                      help="table of reference table (for example: nrdb40_scop_domains_nr) [default=%default].")
    parser.add_option("--bin-size", dest="bin_size", type="int",
                      help="bin size [default=%default].")
    parser.add_option("-o", "--resolution", dest="resolution", type="float",
                      help="resolution for scaling of domains [default=%default].")
    parser.add_option("-s", "--switch", dest="switch", action="store_true",
                      help="switch between coverage of reference and size ratio if coverage is 1 [default=%default].")
    parser.add_option("-k", "--skip-repeats", dest="skip_repeats", action="store_true",
                      help="[default=%default].")
    # NOTE: --skip-tms is accepted but currently has no effect; the mask
    # lookup it relied on is disabled.
    parser.add_option("-m", "--skip-tms", dest="skip_tms", action="store_true",
                      help="discard domains which contain transmembrane regions [default=%default].")
    parser.add_option("-e", "--check-selection", dest="check_selection", action="store_true",
                      help="[default=%default].")
    parser.add_option("-q", "--quality", dest="quality", action="store_true",
                      help="take only sequences which are curated [default=%default].")
    parser.add_option("--no-full-length", dest="no_full_length", action="store_true",
                      help="[default=%default].")
    parser.add_option("--only-full-length", dest="only_full_length", action="store_true",
                      help="[default=%default].")
    parser.add_option("--check-if-comparable", dest="check_if_comparable", action="store_true",
                      help="perform comparable check according to Islam95 (default level 85%) [default=%default].")
    parser.add_option("--subset", dest="subset", type="string",
                      help="use only a subset of nids [default=%default]")

    parser.set_defaults(
        database="pairsdb",
        table_name_reference=None,
        table_name_trees=None,
        table_name_parts=None,
        table_name_bench=None,
        resolution=None,
        loglevel=1,
        min_overlap=1,
        switch=0,
        combine_repeats=1,
        skip_repeats=0,
        skip_tms=0,
        discard_full_length=0,
        check_selection=0,
        selection_threshold=0.9,
        quality=None,
        no_full_length=None,
        only_full_length=None,
        ## a full length domain should cover at least 90% of a sequence
        min_length_ratio=0.9,
        check_comparable=None,
        check_comparable_level=0.85,
        bin_size=1,
        subset=None)

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    dbhandle = Pairsdb()
    dbhandle.Connect(dbname=options.database)

    tbl_reference = TableDomains(dbhandle, "generic")
    tbl_reference.SetName(options.table_name_reference)

    tbl_nrdb = Table_nrdb(dbhandle)
    # todo: encapsulate this with a parameter
    tbl_nrdb.name = "nrdb40"

    # optional restriction of the nid list to curated sequences
    if options.quality:
        quality_tables = ", nrdb_quality AS q"
        quality_where = "AND q.nid = s.nid AND q.is_curated = 'T'"
    else:
        quality_tables, quality_where = "", ""

    # Best-match query. The leading columns depend on the kind of table:
    # tree tables carry node/parent/level, flat tables get constants.
    statement = """
    SELECT %(columns)s, t.start, t.end,
    ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
     (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
    ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
     (t.end - t.start)) AS cov_dom,
    ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
     (%(end)i - %(start)i)) AS cov_ref,
    ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
    FROM %(tablename)s AS t
    WHERE t.nid = %(nid)i
    AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
    ORDER BY ovl DESC
    LIMIT 1
    """

    if options.table_name_trees:
        nids_statement = "SELECT DISTINCT t.nid FROM %s AS t, %s AS s %s WHERE t.nid = s.nid %s" % \
            (options.table_name_trees, options.table_name_reference,
             quality_tables, quality_where)
        select_columns = "t.node, t.parent, t.level"
        tablename = options.table_name_trees

    elif options.table_name_parts or options.table_name_bench:
        if options.table_name_parts:
            table_name = options.table_name_parts
        else:
            table_name = options.table_name_bench

        if options.subset:
            # the subset statement has no slot for the quality filter
            nids_statement = "SELECT DISTINCT s.nid FROM %s AS s, %s AS t WHERE t.nid = s.nid" % \
                (options.subset, table_name)
        else:
            nids_statement = "SELECT DISTINCT s.nid FROM %s AS s, %s AS r %s WHERE r.nid = s.nid %s" % \
                (table_name, options.table_name_reference,
                 quality_tables, quality_where)
        select_columns = "1, 0, 0"
        tablename = table_name

    else:
        sys.stdout.write("what shall I compare?\n")
        sys.exit(1)

    if options.check_selection:
        # Fixed: a stray "i" after "(t.domain_to - t.domain_from)" made the
        # statement invalid SQL, and the WHERE clause compared against
        # %(start)i where the overlap computation needs %(end)i.
        selection_statement = """
        SELECT t.domain_from, t.domain_to,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
         (GREATEST( t.domain_to, %(end)i) - LEAST( t.domain_from, %(start)i))) AS ovl,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
         (t.domain_to - t.domain_from)) AS cov_dom,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
         (%(end)i - %(start)i)) AS cov_ref,
        ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(selection_tablename)s AS t
        WHERE t.domain_nid = %(nid)i
        AND (LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        selection_tablename = options.table_name_parts
        options.table_name_parts = None

        parts_same_as_trees = 0
        parts_larger_than_trees = 0
        parts_smaller_than_trees = 0
        parts_much_smaller_than_trees = 0

    min_overlap = options.min_overlap

    nids = [row[0] for row in dbhandle.Execute(nids_statement).fetchall()]

    overlaps = []
    cov_doms = []
    cov_refs = []
    touched = set()

    # NOTE(review): rows written with --check-selection carry more columns
    # than this header names; the header is the same in both modes.
    options.stdout.write(
        "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n")

    E.info("--> processing %i nids" % len(nids))

    nskipped_no_assignments = 0
    # NOTE(review): reported below but never incremented.
    nskipped_no_overlap = 0
    nskipped_wrong_domaintype = 0
    nfound = 0

    for nid in nids:

        E.debug("--> processing %i" % nid)

        domains = tbl_reference.GetDomainBoundariesForNid(nid)
        length = tbl_nrdb.GetLength(nid)

        if not domains:
            nskipped_no_assignments += 1
            continue

        if options.no_full_length and len(domains) == 1:
            ## check if domain is actually full length, otherwise keep
            domain_id, domain_from, domain_to = domains[0]
            if float(domain_to - domain_from) / float(length) >= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        if options.only_full_length:
            if len(domains) != 1:
                nskipped_wrong_domaintype += 1
                continue
            domain_id, domain_from, domain_to = domains[0]
            if float(domain_to - domain_from) / float(length) <= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        nfound += 1

        # iteration over domains in reference
        x = 0
        while x < len(domains):

            domain_id, domain_from, domain_to = domains[x]

            # process repeats: consecutive entries with the same id are
            # merged into one domain ending at the last entry
            is_repeat = -1
            while x < len(domains) and domains[x][0] == domain_id:
                domain_to = domains[x][2]
                x += 1
                is_repeat += 1

            if options.skip_repeats and is_repeat:
                continue

            ## apply resolution
            if options.resolution:
                start = int(float(domain_from - 1) / options.resolution)
                end = int(float(domain_to - 1) / options.resolution) + 1
            else:
                start = domain_from
                end = domain_to

            E.debug("processing domain %s_%i_%i (scaled: %i-%i)" %
                    (domain_id, domain_from, domain_to, start, end))

            ## get best matching domain
            s = statement % {"columns": select_columns,
                             "tablename": tablename,
                             "nid": nid,
                             "start": start,
                             "end": end,
                             "min_overlap": min_overlap}

            if options.loglevel >= 4:
                sys.stdout.write(s + "\n")

            result = dbhandle.Execute(s).fetchone()

            if not result:
                continue

            (node, parent, level, dom_start, dom_end,
             overlap, cov_dom, cov_ref, rat_ref) = result

            # report every matched domain only once per reference id
            key = "%i-%s-%i-%i" % (nid, domain_id, dom_start, dom_end)
            if key in touched:
                continue
            touched.add(key)

            # discard full length domains
            if options.discard_full_length:
                if options.table_name_trees:
                    if node == 0:
                        continue
                else:
                    if length == dom_end - dom_start:
                        continue

            if options.switch and cov_ref == 1.0:
                xcov_ref = rat_ref
            else:
                xcov_ref = cov_ref

            # check, if selection did take a domain lower or further up
            if options.check_selection:
                sel_start = (dom_start * 10) + 1
                sel_end = min(dom_end * 10 + 1, length)

                s = selection_statement % {"selection_tablename": selection_tablename,
                                           "nid": nid,
                                           "start": sel_start,
                                           "end": sel_end,
                                           "min_overlap": min_overlap}
                selected = dbhandle.Execute(s).fetchone()

                if selected:
                    parts_from, parts_to, ovl_parts, cov_parts, cov_tree, rat_parts = selected

                    if rat_parts > 1.0:
                        parts_larger_than_trees += 1
                        token = ">"
                    elif rat_parts == 1.0:
                        parts_same_as_trees += 1
                        token = "="
                    else:
                        parts_smaller_than_trees += 1
                        token = "<"
                        if rat_parts < options.selection_threshold:
                            parts_much_smaller_than_trees += 1

                    # NOTE(review): this row used two undefined names
                    # (yfrom, yto) and raised a NameError. The rescaled
                    # coordinates of the matched domain are written here
                    # as the most likely intent - please confirm.
                    options.stdout.write("\t".join(map(str, (
                        nid, domain_id, domain_from, domain_to, level,
                        sel_start, sel_end,
                        parts_from, parts_to,
                        overlap, cov_dom, cov_ref, rat_ref, xcov_ref,
                        ovl_parts, cov_parts, cov_tree, rat_parts,
                        token))) + "\n")
            else:
                # NOTE(review): the matched domain's coordinates are written
                # in both the DFROM/DTO and the RFROM/RTO columns, as before.
                options.stdout.write("\t".join(map(str, (
                    nid, node, parent, level, dom_start, dom_end,
                    domain_id, dom_start, dom_end,
                    overlap, cov_dom, cov_ref, rat_ref, xcov_ref))) + "\n")

            overlaps.append(int(overlap * 100))
            cov_doms.append(int(cov_dom * 100))
            cov_refs.append(int(xcov_ref * 100))

    E.info("skipped nids because of no overlap with reference: %i" % nskipped_no_overlap)
    E.info("skipped nids because of no assignments: %i" % nskipped_no_assignments)
    E.info("skipped nids because of wrong domain type: %i" % nskipped_wrong_domaintype)
    E.info("nids in comparison: %i" % nfound)

    if options.check_selection:
        # the counts are formatted into the message; they used to be
        # passed as a second positional argument to E.info
        E.info(" parts larger than trees=%i" % parts_larger_than_trees)
        E.info(" parts like trees=%i" % parts_same_as_trees)
        E.info(" parts smaller than trees=%i" % parts_smaller_than_trees)
        E.info(" parts much smaller than trees (<%f)=%i" %
               (options.selection_threshold, parts_much_smaller_than_trees))
    else:
        outfile_stats = E.openOutputFile("stats")
        outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader())
        outfile_stats.write("overlaps\t%s\n" % str(Stats.Summary(overlaps)))
        outfile_stats.write("domain_coverage\t%s\n" % str(Stats.Summary(cov_doms)))
        outfile_stats.write("reference_coverage\t%s\n" % str(Stats.Summary(cov_refs)))
        outfile_stats.close()

        outfile = E.openOutputFile("overlaps.histogram")
        outfile.write("bin\tcounts\n")
        Histogram.Write(outfile,
                        Histogram.Calculate(overlaps,
                                            min_value=0,
                                            increment=1,
                                            no_empty_bins=True))
        outfile.close()

        cumulative_header = "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n"

        outfile = E.openOutputFile("domain_coverage.histogram")
        outfile.write(cumulative_header)
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                            Histogram.Calculate(cov_doms,
                                                min_value=0,
                                                increment=options.bin_size,
                                                no_empty_bins=True)))
        outfile.close()

        outfile = E.openOutputFile("reference_coverage.histogram")
        outfile.write(cumulative_header)
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                            Histogram.Calculate(cov_refs,
                                                min_value=0,
                                                increment=options.bin_size,
                                                no_empty_bins=True)))
        outfile.close()

    E.Stop()
## delete old table if it exists try: cc = dbhandle.cursor() cc.execute("DROP TABLE %s" % options.tablename) cc.close() if options.loglevel >= 1: options.stdlog.write( "# existing table %s deleted\n" % options.tablename ) except error, msg: dbhandle.rollback() except error, msg: pass ## create new table statement = "CREATE TABLE %s ( %s );" % (options.tablename, ", ".join( columns)) E.debug( "table create:\n# %s" % (statement ) ) try: cc = dbhandle.cursor() cc.execute(statement) cc.close() except error, msg: options.stderr.write( "table creation failed: statement=\n %s\n" % (statement ) ) raise error, msg E.info("table %s created successfully." % options.tablename ) return take, map_column2type, ignored def main():
cc = dbhandle.cursor() cc.execute("DROP TABLE %s" % options.tablename) cc.close() if options.loglevel >= 1: options.stdlog.write("# existing table %s deleted\n" % options.tablename) except error, msg: dbhandle.rollback() except error, msg: pass ## create new table statement = "CREATE TABLE %s ( %s );" % (options.tablename, ", ".join(columns)) E.debug("table create:\n# %s" % (statement)) try: cc = dbhandle.cursor() cc.execute(statement) cc.close() except error, msg: options.stderr.write("table creation failed: statement=\n %s\n" % (statement)) raise error, msg E.info("table %s created successfully." % options.tablename) return take, map_column2type, ignored
def main():
    """Query the neighbour index and build or align profiles.

    The positional arguments are nids or domains (``nid_start_end``); the
    action is selected with ``--method``:

    * ``view``: write the neighbours of every nid given.
    * ``pileup``: write the multiple alignment built from the neighbours
      of the first argument.
    * ``profile``: write the profile built from that multiple alignment,
      restricted to the segment if a domain was given.
    * ``align``: align the profiles of the first two arguments (both must
      be domains) and write the alignment summary.

    Raises ValueError if a method is given fewer arguments than it needs.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method", dest="method", type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph, options.filename_index)

    toolkit = alignlib.getDefaultToolkit()
    toolkit.setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit.setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit.setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _requireArgs(count):
        """Fail with a readable message instead of a bare IndexError."""
        if len(args) < count:
            raise ValueError("method '%s' requires %i argument(s), %i given" %
                             (options.method, count, len(args)))

    def _parseTarget(arg):
        """Return (nid, start, end); start/end are None for a plain nid."""
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    def _buildProfile(nid, start, end):
        """Build the profile of nid, restricted to start-end if given."""
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        return prof

    if options.method == "view":
        # look up every argument; the loop used to re-read args[0] each time
        for arg in args:
            for n in index.getNeighbours(int(arg)):
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":
        _requireArgs(1)
        nid, start, end = _parseTarget(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        _requireArgs(1)
        nid, start, end = _parseTarget(args[0])
        prof = _buildProfile(nid, start, end)
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        _requireArgs(2)
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        prof1 = _buildProfile(nid1, start1, end1)
        seq1 = alignlib.makeSequence(fasta.getSequence(nid1))
        prof2 = _buildProfile(nid2, start2, end2)
        seq2 = alignlib.makeSequence(fasta.getSequence(nid2))

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %
            (nid1, nid2,
             result.getScore(),
             result.getLength(),
             result.getNumGaps(),
             result.getRowFrom(), result.getRowTo(),
             result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
def main(argv=sys.argv):
    """Compare a domain table against a reference domain table.

    For every nid shared by the reference table (``--reference``) and the
    table under test (``--trees``, ``--parts`` or ``--bench``), each
    reference domain is matched to the best overlapping domain in the
    tested table. One row per match is written to ``options.stdout``.
    Unless ``--check-selection`` is given, summary statistics and
    histograms of overlap and coverage are written to the output files
    ``stats``, ``overlaps.histogram``, ``domain_coverage.histogram`` and
    ``reference_coverage.histogram``.

    argv -- command line, handed on to ``E.Start``.

    Exits with status 1 if no table to compare was given.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-D", "--database", dest="database", type="string",
                      help="tablename to use [default=%default].")
    parser.add_option("-t", "--trees", dest="table_name_trees", type="string",
                      help="tablename with trees [default=%default].")
    parser.add_option("-r", "--parts", dest="table_name_parts", type="string",
                      help="tablename with trees [default=%default].")
    parser.add_option("-b", "--bench", dest="table_name_bench", type="string",
                      help="domain table to be benchmarked (for example: nrdb40_domo_domains_nr) [default=%default].")
    parser.add_option("-f", "--reference", dest="table_name_reference", type="string",
                      help="table of reference table (for example: nrdb40_scop_domains_nr) [default=%default].")
    parser.add_option("--bin-size", dest="bin_size", type="int",
                      help="bin size [default=%default].")
    parser.add_option("-o", "--resolution", dest="resolution", type="float",
                      help="resolution for scaling of domains [default=%default].")
    parser.add_option("-s", "--switch", dest="switch", action="store_true",
                      help="switch between coverage of reference and size ratio if coverage is 1 [default=%default].")
    parser.add_option("-k", "--skip-repeats", dest="skip_repeats", action="store_true",
                      help="[default=%default].")
    # NOTE: --skip-tms is accepted but currently has no effect; the mask
    # lookup it relied on is disabled.
    parser.add_option("-m", "--skip-tms", dest="skip_tms", action="store_true",
                      help="discard domains which contain transmembrane regions [default=%default].")
    parser.add_option("-e", "--check-selection", dest="check_selection", action="store_true",
                      help="[default=%default].")
    parser.add_option("-q", "--quality", dest="quality", action="store_true",
                      help="take only sequences which are curated [default=%default].")
    parser.add_option("--no-full-length", dest="no_full_length", action="store_true",
                      help="[default=%default].")
    parser.add_option("--only-full-length", dest="only_full_length", action="store_true",
                      help="[default=%default].")
    parser.add_option("--check-if-comparable", dest="check_if_comparable", action="store_true",
                      help="perform comparable check according to Islam95 (default level 85%) [default=%default].")
    parser.add_option("--subset", dest="subset", type="string",
                      help="use only a subset of nids [default=%default]")

    parser.set_defaults(
        database="pairsdb",
        table_name_reference=None,
        table_name_trees=None,
        table_name_parts=None,
        table_name_bench=None,
        resolution=None,
        loglevel=1,
        min_overlap=1,
        switch=0,
        combine_repeats=1,
        skip_repeats=0,
        skip_tms=0,
        discard_full_length=0,
        check_selection=0,
        selection_threshold=0.9,
        quality=None,
        no_full_length=None,
        only_full_length=None,
        ## a full length domain should cover at least 90% of a sequence
        min_length_ratio=0.9,
        check_comparable=None,
        check_comparable_level=0.85,
        bin_size=1,
        subset=None)

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    dbhandle = Pairsdb()
    dbhandle.Connect(dbname=options.database)

    tbl_reference = TableDomains(dbhandle, "generic")
    tbl_reference.SetName(options.table_name_reference)

    tbl_nrdb = Table_nrdb(dbhandle)
    # todo: encapsulate this with a parameter
    tbl_nrdb.name = "nrdb40"

    # optional restriction of the nid list to curated sequences
    if options.quality:
        quality_tables = ", nrdb_quality AS q"
        quality_where = "AND q.nid = s.nid AND q.is_curated = 'T'"
    else:
        quality_tables, quality_where = "", ""

    # Best-match query. The leading columns depend on the kind of table:
    # tree tables carry node/parent/level, flat tables get constants.
    statement = """
    SELECT %(columns)s, t.start, t.end,
    ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
     (GREATEST( t.end, %(end)i) - LEAST( t.start, %(start)i))) AS ovl,
    ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
     (t.end - t.start)) AS cov_dom,
    ((LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i)) /
     (%(end)i - %(start)i)) AS cov_ref,
    ((t.end - t.start) / (%(end)i - %(start)i)) AS rat_ref
    FROM %(tablename)s AS t
    WHERE t.nid = %(nid)i
    AND (LEAST(t.end, %(end)i) - GREATEST(t.start, %(start)i) > %(min_overlap)i)
    ORDER BY ovl DESC
    LIMIT 1
    """

    if options.table_name_trees:
        nids_statement = "SELECT DISTINCT t.nid FROM %s AS t, %s AS s %s WHERE t.nid = s.nid %s" % \
            (options.table_name_trees, options.table_name_reference,
             quality_tables, quality_where)
        select_columns = "t.node, t.parent, t.level"
        tablename = options.table_name_trees

    elif options.table_name_parts or options.table_name_bench:
        if options.table_name_parts:
            table_name = options.table_name_parts
        else:
            table_name = options.table_name_bench

        if options.subset:
            # the subset statement has no slot for the quality filter
            nids_statement = "SELECT DISTINCT s.nid FROM %s AS s, %s AS t WHERE t.nid = s.nid" % \
                (options.subset, table_name)
        else:
            nids_statement = "SELECT DISTINCT s.nid FROM %s AS s, %s AS r %s WHERE r.nid = s.nid %s" % \
                (table_name, options.table_name_reference,
                 quality_tables, quality_where)
        select_columns = "1, 0, 0"
        tablename = table_name

    else:
        sys.stdout.write("what shall I compare?\n")
        sys.exit(1)

    if options.check_selection:
        # Fixed: a stray "i" after "(t.domain_to - t.domain_from)" made the
        # statement invalid SQL, and the WHERE clause compared against
        # %(start)i where the overlap computation needs %(end)i.
        selection_statement = """
        SELECT t.domain_from, t.domain_to,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
         (GREATEST( t.domain_to, %(end)i) - LEAST( t.domain_from, %(start)i))) AS ovl,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
         (t.domain_to - t.domain_from)) AS cov_dom,
        ((LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i)) /
         (%(end)i - %(start)i)) AS cov_ref,
        ((t.domain_to - t.domain_from) / (%(end)i - %(start)i)) AS rat_ref
        FROM %(selection_tablename)s AS t
        WHERE t.domain_nid = %(nid)i
        AND (LEAST(t.domain_to, %(end)i) - GREATEST(t.domain_from, %(start)i) > %(min_overlap)i)
        ORDER BY ovl DESC
        LIMIT 1
        """
        selection_tablename = options.table_name_parts
        options.table_name_parts = None

        parts_same_as_trees = 0
        parts_larger_than_trees = 0
        parts_smaller_than_trees = 0
        parts_much_smaller_than_trees = 0

    min_overlap = options.min_overlap

    nids = [row[0] for row in dbhandle.Execute(nids_statement).fetchall()]

    overlaps = []
    cov_doms = []
    cov_refs = []
    touched = set()

    # NOTE(review): rows written with --check-selection carry more columns
    # than this header names; the header is the same in both modes.
    options.stdout.write(
        "NID\tDNODE\tDPARENT\tDLEVEL\tDFROM\tDTO\tRID\tRFROM\tRTO\tOVL\tDCOV\tRCOV\tRRCOV\tMRCOV\n")

    E.info("--> processing %i nids" % len(nids))

    nskipped_no_assignments = 0
    # NOTE(review): reported below but never incremented.
    nskipped_no_overlap = 0
    nskipped_wrong_domaintype = 0
    nfound = 0

    for nid in nids:

        E.debug("--> processing %i" % nid)

        domains = tbl_reference.GetDomainBoundariesForNid(nid)
        length = tbl_nrdb.GetLength(nid)

        if not domains:
            nskipped_no_assignments += 1
            continue

        if options.no_full_length and len(domains) == 1:
            ## check if domain is actually full length, otherwise keep
            domain_id, domain_from, domain_to = domains[0]
            if float(domain_to - domain_from) / float(length) >= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        if options.only_full_length:
            if len(domains) != 1:
                nskipped_wrong_domaintype += 1
                continue
            domain_id, domain_from, domain_to = domains[0]
            if float(domain_to - domain_from) / float(length) <= options.min_length_ratio:
                nskipped_wrong_domaintype += 1
                continue

        nfound += 1

        # iteration over domains in reference
        x = 0
        while x < len(domains):

            domain_id, domain_from, domain_to = domains[x]

            # process repeats: consecutive entries with the same id are
            # merged into one domain ending at the last entry
            is_repeat = -1
            while x < len(domains) and domains[x][0] == domain_id:
                domain_to = domains[x][2]
                x += 1
                is_repeat += 1

            if options.skip_repeats and is_repeat:
                continue

            ## apply resolution
            if options.resolution:
                start = int(float(domain_from - 1) / options.resolution)
                end = int(float(domain_to - 1) / options.resolution) + 1
            else:
                start = domain_from
                end = domain_to

            E.debug("processing domain %s_%i_%i (scaled: %i-%i)" %
                    (domain_id, domain_from, domain_to, start, end))

            ## get best matching domain
            s = statement % {"columns": select_columns,
                             "tablename": tablename,
                             "nid": nid,
                             "start": start,
                             "end": end,
                             "min_overlap": min_overlap}

            if options.loglevel >= 4:
                sys.stdout.write(s + "\n")

            result = dbhandle.Execute(s).fetchone()

            if not result:
                continue

            (node, parent, level, dom_start, dom_end,
             overlap, cov_dom, cov_ref, rat_ref) = result

            # report every matched domain only once per reference id
            key = "%i-%s-%i-%i" % (nid, domain_id, dom_start, dom_end)
            if key in touched:
                continue
            touched.add(key)

            # discard full length domains
            if options.discard_full_length:
                if options.table_name_trees:
                    if node == 0:
                        continue
                else:
                    if length == dom_end - dom_start:
                        continue

            if options.switch and cov_ref == 1.0:
                xcov_ref = rat_ref
            else:
                xcov_ref = cov_ref

            # check, if selection did take a domain lower or further up
            if options.check_selection:
                sel_start = (dom_start * 10) + 1
                sel_end = min(dom_end * 10 + 1, length)

                s = selection_statement % {"selection_tablename": selection_tablename,
                                           "nid": nid,
                                           "start": sel_start,
                                           "end": sel_end,
                                           "min_overlap": min_overlap}
                selected = dbhandle.Execute(s).fetchone()

                if selected:
                    parts_from, parts_to, ovl_parts, cov_parts, cov_tree, rat_parts = selected

                    if rat_parts > 1.0:
                        parts_larger_than_trees += 1
                        token = ">"
                    elif rat_parts == 1.0:
                        parts_same_as_trees += 1
                        token = "="
                    else:
                        parts_smaller_than_trees += 1
                        token = "<"
                        if rat_parts < options.selection_threshold:
                            parts_much_smaller_than_trees += 1

                    # NOTE(review): this row used two undefined names
                    # (yfrom, yto) and raised a NameError. The rescaled
                    # coordinates of the matched domain are written here
                    # as the most likely intent - please confirm.
                    options.stdout.write("\t".join(map(str, (
                        nid, domain_id, domain_from, domain_to, level,
                        sel_start, sel_end,
                        parts_from, parts_to,
                        overlap, cov_dom, cov_ref, rat_ref, xcov_ref,
                        ovl_parts, cov_parts, cov_tree, rat_parts,
                        token))) + "\n")
            else:
                # NOTE(review): the matched domain's coordinates are written
                # in both the DFROM/DTO and the RFROM/RTO columns, as before.
                options.stdout.write("\t".join(map(str, (
                    nid, node, parent, level, dom_start, dom_end,
                    domain_id, dom_start, dom_end,
                    overlap, cov_dom, cov_ref, rat_ref, xcov_ref))) + "\n")

            overlaps.append(int(overlap * 100))
            cov_doms.append(int(cov_dom * 100))
            cov_refs.append(int(xcov_ref * 100))

    E.info("skipped nids because of no overlap with reference: %i" % nskipped_no_overlap)
    E.info("skipped nids because of no assignments: %i" % nskipped_no_assignments)
    E.info("skipped nids because of wrong domain type: %i" % nskipped_wrong_domaintype)
    E.info("nids in comparison: %i" % nfound)

    if options.check_selection:
        # the counts are formatted into the message; they used to be
        # passed as a second positional argument to E.info
        E.info(" parts larger than trees=%i" % parts_larger_than_trees)
        E.info(" parts like trees=%i" % parts_same_as_trees)
        E.info(" parts smaller than trees=%i" % parts_smaller_than_trees)
        E.info(" parts much smaller than trees (<%f)=%i" %
               (options.selection_threshold, parts_much_smaller_than_trees))
    else:
        outfile_stats = E.openOutputFile("stats")
        outfile_stats.write("section\t%s\n" % Stats.Summary().getHeader())
        outfile_stats.write("overlaps\t%s\n" % str(Stats.Summary(overlaps)))
        outfile_stats.write("domain_coverage\t%s\n" % str(Stats.Summary(cov_doms)))
        outfile_stats.write("reference_coverage\t%s\n" % str(Stats.Summary(cov_refs)))
        outfile_stats.close()

        outfile = E.openOutputFile("overlaps.histogram")
        outfile.write("bin\tcounts\n")
        Histogram.Write(outfile,
                        Histogram.Calculate(overlaps,
                                            min_value=0,
                                            increment=1,
                                            no_empty_bins=True))
        outfile.close()

        cumulative_header = "bin\tcounts\tfreq\tcumul_counts\tcumul_freq\treverse_counts\treverse_freq\n"

        outfile = E.openOutputFile("domain_coverage.histogram")
        outfile.write(cumulative_header)
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                            Histogram.Calculate(cov_doms,
                                                min_value=0,
                                                increment=options.bin_size,
                                                no_empty_bins=True)))
        outfile.close()

        outfile = E.openOutputFile("reference_coverage.histogram")
        outfile.write(cumulative_header)
        Histogram.Write(outfile,
                        Histogram.AddRelativeAndCumulativeDistributions(
                            Histogram.Calculate(cov_refs,
                                                min_value=0,
                                                increment=options.bin_size,
                                                no_empty_bins=True)))
        outfile.close()

    E.Stop()