def getComponents(matches, max_distance=0, min_overlap=0, by_query=False):
    """Group matches into components of mutually linked alignments.

    Every match is first registered as its own component (so matches
    without any partner are still reported). Each pair of matches is then
    compared and the two are joined if they satisfy the active criterion.

    Arguments
    ---------
    matches :
        Indexable sequence of match objects. It is handed to
        ``addAlignments`` first, which is expected to provide the
        ``mMapQuery2Target`` / ``mMapTarget2Query`` alignments read here.
    max_distance :
        Join two matches if they are at most this many residues apart.
        Adjacent reads are 1 residue apart, overlapping reads are
        0 residues apart. Used whenever `min_overlap` is not positive.
    min_overlap :
        If positive, join two matches only if at least this many residues
        overlap. Mutually exclusive with a positive `max_distance`.
    by_query :
        Passed on to ``addAlignments``. If true the ``mMapQuery2Target``
        alignments are compared, otherwise ``mMapTarget2Query``.

    Returns
    -------
    The result of ``Components.IComponents.getComponents()`` --
    presumably groups of indices into `matches` (confirm against the
    Components module).

    Raises
    ------
    ValueError
        If both `min_overlap` and `max_distance` are positive.
    """
    # Reject contradictory arguments before doing any work, in particular
    # before addAlignments() gets to touch ``matches``.
    if min_overlap > 0 and max_distance > 0:
        raise ValueError(
            "both min_overlap (%i) and max_distance (%i) > 0" %
            (min_overlap, max_distance))

    addAlignments(matches, by_query=by_query)

    components = Components.IComponents()
    # Link each index to itself so that singletons form a component too.
    for x in range(len(matches)):
        components.add(x, x)

    # The only difference between the by_query and by-target variants is
    # which alignment attribute of the match is compared.
    map_attribute = "mMapQuery2Target" if by_query else "mMapTarget2Query"

    if min_overlap > 0:
        def is_linked(x, y):
            return alignlib_lite.py_getAlignmentOverlap(
                getattr(matches[x], map_attribute),
                getattr(matches[y], map_attribute),
                alignlib_lite.py_RR) >= min_overlap
    else:
        def is_linked(x, y):
            return alignlib_lite.py_getAlignmentShortestDistance(
                getattr(matches[x], map_attribute),
                getattr(matches[y], map_attribute),
                alignlib_lite.py_RR) <= max_distance

    # Compare every unordered pair exactly once (y < x).
    for x in range(len(matches)):
        for y in range(x):
            if is_linked(x, y):
                components.add(x, y)

    return components.getComponents()
def MapTranscripts2Genes(transcripts, map_transcript2location):
    """Cluster overlapping transcripts into genes.

    Two transcripts are linked when their locations share the same token
    and strand and their coordinate ranges overlap by more than zero
    positions. Linked transcripts are collected into components; each
    component becomes one gene whose identifier is the first transcript
    id of the component as returned by the component graph (i.e. the
    choice is arbitrary).

    Arguments
    ---------
    transcripts :
        Iterable of objects with ``mTranscript``, ``mSchema`` and
        ``mQuality`` attributes.
    map_transcript2location :
        Mapping of transcript id to a ``(token, strand, from, to)``
        tuple.

    Returns
    -------
    new_transcripts : list
        One freshly built ``Orthologs.Transcript`` per input transcript
        id, with ``mGene`` set to the gene chosen for its component.
    new_genes : dict
        Mapping of gene id to the list of its new transcript objects.
    """
    graph = Components.SComponents()
    map_id2info = {}
    seen = set()

    for entry in transcripts:
        current = entry.mTranscript
        map_id2info[current] = (entry.mSchema, entry.mQuality)
        token, strand, start, end = map_transcript2location[current]

        # A self link guarantees a component even without any overlap.
        graph.add(current, current)

        # Compare against every transcript registered so far.
        for other in seen:
            (other_token, other_strand,
             other_start, other_end) = map_transcript2location[other]
            if token != other_token or strand != other_strand:
                continue
            shared = min(end, other_end) - max(start, other_start)
            if shared > 0:
                graph.add(current, other)

        seen.add(current)

    new_genes = {}
    new_transcripts = []

    for members in graph.getComponents():
        # The first member lends its id to the whole gene.
        gene = members[0]
        gene_transcripts = new_genes[gene] = []
        for member in members:
            schema, quality = map_id2info[member]
            rebuilt = Orthologs.Transcript()
            rebuilt.mSchema = schema
            rebuilt.mTranscript = member
            rebuilt.mGene = gene
            rebuilt.mQuality = quality
            gene_transcripts.append(rebuilt)
            new_transcripts.append(rebuilt)

    return new_transcripts, new_genes
import CGAT.Components as Components

# Small demonstration / smoke test of the Components containers.
#
# Each print call below is given a single, pre-formatted string so that the
# script produces the same output under Python 2 (where ``print`` is a
# statement) and Python 3 (where it is a function).

# --- string-keyed components -------------------------------------------
c = Components.SComponents()

# Links to register; ("3", "3") and ("6", "6") are self links.
links = (("1", "2"),
         ("1", "3"),
         ("2", "3"),
         ("3", "3"),
         ("4", "5"),
         ("6", "6"))

for a, b in links:
    print("%s %s %s" % (a, b, c.add(a, b)))

# "0" and "7" were never added, so this also shows the lookup result for
# unknown nodes.
for x in "01234567":
    print("%s %s" % (x, c.get(x)))

print(c.getNumNodes())
print(c.getComponents())

# --- integer-keyed components, same links converted to int ---------------
c = Components.IComponents()

for a, b in links:
    print("%s %s %s" % (a, b, c.add(int(a), int(b))))

for x in range(0, 8):
    print("%s %s" % (x, c.get(x)))

print(c.getNumNodes())
print(c.getComponents())

# --- an empty container ---------------------------------------------------
c = Components.IComponents()
print(c.getComponents())