def mapDomains(query_id, matches, map_nid2domains, new_family_id, options):
    """Map domains annotated on matched sequences onto the query *query_id*.

    For every match whose ``mNid`` has an entry in *map_nid2domains*, each
    domain that overlaps the aligned region is projected through
    ``match.mMapSbjct2Query`` onto query coordinates.  The mapped domains
    are grouped by family, optionally merged with ``Intervals.combine``,
    filtered by size, and then selected greedily in order of decreasing
    ``pid * length``: a domain is dropped if ``Intervals.calculateOverlap``
    reports an overlap with the domains accepted so far.

    Arguments
    ---------
    query_id :
        Identifier of the query; only used in log messages.
    matches :
        Iterable of match objects providing ``mNid``, ``mQueryLength``,
        ``mPid``, ``buildAlignment()`` and ``mMapSbjct2Query``.  The query
        length is taken from the last match iterated.
    map_nid2domains :
        Mapping from nid to a collection of domains, each providing
        ``mStart``, ``mEnd`` and ``mFamily``.
    new_family_id :
        First family id to hand out to singletons; incremented for each
        singleton created.
    options :
        Object providing ``loglevel``, ``stdlog``, ``combine_overlaps``,
        ``min_length_domain``, ``min_coverage``, ``add_singletons`` and
        ``min_length_singletons``.

    Returns
    -------
    tuple
        ``(new_family_id, all_intervals, singletons)`` where
        ``all_intervals`` is the list of selected ``DomainMatch`` objects
        and ``singletons`` the list of ``Domain`` objects created for
        uncovered stretches (empty unless ``options.add_singletons``).
        If *matches* is empty, ``(new_family_id, [], [])`` is returned.

    Raises
    ------
    AssertionError
        If a mapped domain falls outside ``0..query_length``.
    """

    class DomainMatch:
        """A domain in query coordinates plus the pid of its source match."""

        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__(self):
            return "\t".join(
                map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    def processFamily(family_id, family_intervals, all_intervals,
                      min_length_domain, query_length):
        """Append the acceptable intervals of one family to *all_intervals*."""
        if not family_intervals:
            return

        # Always work on (start, end) tuples.  Previously the un-combined
        # branch iterated over DomainMatch objects directly, which cannot
        # be unpacked into ``start, end``.
        segments = [(x.mStart, x.mEnd) for x in family_intervals]
        if options.combine_overlaps:
            segments = Intervals.combine(segments)

        # note: this is overall pid, not per region.
        best_pid = max(x.mPid for x in family_intervals)

        for start, end in segments:
            coverage = 100.0 * (end - start) / query_length
            # A domain is only rejected if it is both short in absolute
            # terms and covers little of the query.
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                        % (family_id, start, end, coverage))
                continue
            all_intervals.append(DomainMatch(best_pid, start, end, family_id))

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id)

    if options.loglevel >= 3:
        for match in matches:
            options.stdlog.write("# match=%s\n" % str(match))
            nid = match.mNid
            if nid in map_nid2domains:
                for domain in map_nid2domains[nid]:
                    options.stdlog.write("# domain=%s\n" % str(domain))
            else:
                options.stdlog.write("# no domains for nid %s\n" % nid)

    ##########################################################
    ## map domains of each match onto the query
    mapped_domains = []
    query_length = None

    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains:
            continue

        match.buildAlignment()
        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:
            # skip domains that lie outside the aligned region
            if domain.mStart >= ali.getRowTo() or domain.mEnd < ali.getRowFrom():
                continue

            start = ali.mapRowToCol(domain.mStart, alignlib.RIGHT)
            end = ali.mapRowToCol(domain.mEnd, alignlib.LEFT)
            assert start >= 0 and end <= query_length, \
                "warning: unmapped coordinates: %i-%i" % (start, end)
            mapped_domains.append(
                DomainMatch(match.mPid, start, end, domain.mFamily))

    if query_length is None:
        # No matches at all: the query length is unknown, so there is
        # nothing to map and no way to compute singletons.
        return new_family_id, [], []

    if options.loglevel >= 1:
        options.stdlog.write("# nid=%s, length=%i, mapped domains=%i\n" %
                             (query_id, query_length, len(mapped_domains)))

    ##########################################################
    ## combine matches from different sources

    # sort matches by family so that each family forms one contiguous run
    mapped_domains.sort(key=lambda x: x.mFamily)

    last_family = None
    family_intervals = []
    all_intervals = []

    min_length_domain = min(options.min_length_domain, query_length - 10)

    for domain in mapped_domains:
        if last_family != domain.mFamily:
            processFamily(last_family, family_intervals, all_intervals,
                          min_length_domain, query_length)
            family_intervals = []
            last_family = domain.mFamily
        family_intervals.append(domain)

    # flush the last family
    processFamily(last_family, family_intervals, all_intervals,
                  min_length_domain, query_length)

    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## pick the best domains

    # Ascending stable sort followed by reverse() (rather than
    # reverse=True) keeps the tie order of the original cmp-based sort.
    all_intervals.sort(key=lambda x: x.mPid * float(x.mEnd - x.mStart))
    all_intervals.reverse()

    new_intervals = []
    for domain in all_intervals:
        overlap = Intervals.calculateOverlap(
            [(x.mStart, x.mEnd) for x in new_intervals],
            [(domain.mStart, domain.mEnd)])
        if overlap > 0:
            continue
        new_intervals.append(domain)

    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## add singletons
    singletons = []

    if options.add_singletons:
        all_singletons = Intervals.complement(
            [(x.mStart, x.mEnd) for x in all_intervals], 0, query_length)

        for first_res, last_res in all_singletons:
            if last_res - first_res > options.min_length_singletons:
                # NOTE(review): ``Domain`` is not defined in this function;
                # presumably a module-level class -- confirm it exists and
                # accepts (pid, start, end, family).
                singletons.append(
                    Domain(0, first_res, last_res, new_family_id))
                new_family_id += 1

    return new_family_id, all_intervals, singletons
def mapDomains(query_id, matches, map_nid2domains, new_family_id, options):
    """Map domains annotated on matched sequences onto the query *query_id*.

    For every match whose ``mNid`` has an entry in *map_nid2domains*, each
    domain that overlaps the aligned region is projected through
    ``match.mMapSbjct2Query`` onto query coordinates.  The mapped domains
    are grouped by family, optionally merged with ``Intervals.combine``,
    filtered by size, and then selected greedily in order of decreasing
    ``pid * length``: a domain is dropped if ``Intervals.calculateOverlap``
    reports an overlap with the domains accepted so far.

    Arguments
    ---------
    query_id :
        Identifier of the query; only used in log messages.
    matches :
        Iterable of match objects providing ``mNid``, ``mQueryLength``,
        ``mPid``, ``buildAlignment()`` and ``mMapSbjct2Query``.  The query
        length is taken from the last match iterated.
    map_nid2domains :
        Mapping from nid to a collection of domains, each providing
        ``mStart``, ``mEnd`` and ``mFamily``.
    new_family_id :
        First family id to hand out to singletons; incremented for each
        singleton created.
    options :
        Object providing ``loglevel``, ``stdlog``, ``combine_overlaps``,
        ``min_length_domain``, ``min_coverage``, ``add_singletons`` and
        ``min_length_singletons``.

    Returns
    -------
    tuple
        ``(new_family_id, all_intervals, singletons)`` where
        ``all_intervals`` is the list of selected ``DomainMatch`` objects
        and ``singletons`` the list of ``Domain`` objects created for
        uncovered stretches (empty unless ``options.add_singletons``).
        If *matches* is empty, ``(new_family_id, [], [])`` is returned.

    Raises
    ------
    AssertionError
        If a mapped domain falls outside ``0..query_length``.
    """

    class DomainMatch:
        """A domain in query coordinates plus the pid of its source match."""

        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__(self):
            return "\t".join(
                map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    def processFamily(family_id, family_intervals, all_intervals,
                      min_length_domain, query_length):
        """Append the acceptable intervals of one family to *all_intervals*."""
        if not family_intervals:
            return

        # Always work on (start, end) tuples.  Previously the un-combined
        # branch iterated over DomainMatch objects directly, which cannot
        # be unpacked into ``start, end``.
        segments = [(x.mStart, x.mEnd) for x in family_intervals]
        if options.combine_overlaps:
            segments = Intervals.combine(segments)

        # note: this is overall pid, not per region.
        best_pid = max(x.mPid for x in family_intervals)

        for start, end in segments:
            coverage = 100.0 * (end - start) / query_length
            # A domain is only rejected if it is both short in absolute
            # terms and covers little of the query.
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                        % (family_id, start, end, coverage))
                continue
            all_intervals.append(DomainMatch(best_pid, start, end, family_id))

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id)

    if options.loglevel >= 3:
        for match in matches:
            options.stdlog.write("# match=%s\n" % str(match))
            nid = match.mNid
            if nid in map_nid2domains:
                for domain in map_nid2domains[nid]:
                    options.stdlog.write("# domain=%s\n" % str(domain))
            else:
                options.stdlog.write("# no domains for nid %s\n" % nid)

    ##########################################################
    ## map domains of each match onto the query
    mapped_domains = []
    query_length = None

    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains:
            continue

        match.buildAlignment()
        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:
            # skip domains that lie outside the aligned region
            if domain.mStart >= ali.getRowTo() or domain.mEnd < ali.getRowFrom():
                continue

            start = ali.mapRowToCol(domain.mStart, alignlib.RIGHT)
            end = ali.mapRowToCol(domain.mEnd, alignlib.LEFT)
            assert start >= 0 and end <= query_length, \
                "warning: unmapped coordinates: %i-%i" % (start, end)
            mapped_domains.append(
                DomainMatch(match.mPid, start, end, domain.mFamily))

    if query_length is None:
        # No matches at all: the query length is unknown, so there is
        # nothing to map and no way to compute singletons.
        return new_family_id, [], []

    if options.loglevel >= 1:
        options.stdlog.write("# nid=%s, length=%i, mapped domains=%i\n" %
                             (query_id, query_length, len(mapped_domains)))

    ##########################################################
    ## combine matches from different sources

    # sort matches by family so that each family forms one contiguous run
    mapped_domains.sort(key=lambda x: x.mFamily)

    last_family = None
    family_intervals = []
    all_intervals = []

    min_length_domain = min(options.min_length_domain, query_length - 10)

    for domain in mapped_domains:
        if last_family != domain.mFamily:
            processFamily(last_family, family_intervals, all_intervals,
                          min_length_domain, query_length)
            family_intervals = []
            last_family = domain.mFamily
        family_intervals.append(domain)

    # flush the last family
    processFamily(last_family, family_intervals, all_intervals,
                  min_length_domain, query_length)

    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## pick the best domains

    # Ascending stable sort followed by reverse() (rather than
    # reverse=True) keeps the tie order of the original cmp-based sort.
    all_intervals.sort(key=lambda x: x.mPid * float(x.mEnd - x.mStart))
    all_intervals.reverse()

    new_intervals = []
    for domain in all_intervals:
        overlap = Intervals.calculateOverlap(
            [(x.mStart, x.mEnd) for x in new_intervals],
            [(domain.mStart, domain.mEnd)])
        if overlap > 0:
            continue
        new_intervals.append(domain)

    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## add singletons
    singletons = []

    if options.add_singletons:
        all_singletons = Intervals.complement(
            [(x.mStart, x.mEnd) for x in all_intervals], 0, query_length)

        for first_res, last_res in all_singletons:
            if last_res - first_res > options.min_length_singletons:
                # NOTE(review): ``Domain`` is not defined in this function;
                # presumably a module-level class -- confirm it exists and
                # accepts (pid, start, end, family).
                singletons.append(
                    Domain(0, first_res, last_res, new_family_id))
                new_family_id += 1

    return new_family_id, all_intervals, singletons