def __get_cds_rel_pos(self, cdss, cds_id, aln_loc):
    '''Classifies one CDS by its position relative to the alignment.

    @param [Cds] cdss List of CDSs
    @param (int) cds_id Index into cdss of the CDS we are looking at
    @param (Location) aln_loc Alignment Location
    @return (string)
        LEFT_OF_ALN  - CDS ends before the alignment starts
        RIGHT_OF_ALN - CDS starts after the alignment ends, or it reaches
                       the alignment but the previous CDS overlaps it too
        FIRST        - CDS reaches the alignment and is either at index 0
                       or its predecessor does not overlap the alignment
    '''
    current = Location.from_location_str(cdss[cds_id].location)

    # Disjoint cases are decided from the start/end coordinates alone.
    if current.end < aln_loc.start:
        return "LEFT_OF_ALN"
    if current.start > aln_loc.end:
        return "RIGHT_OF_ALN"

    # From here on the CDS reaches the alignment; decide whether it is
    # the first one to do so.
    if cds_id == 0:
        return "FIRST"

    previous = Location.from_location_str(cdss[cds_id - 1].location)
    if self.__overlap(previous, aln_loc):
        return "RIGHT_OF_ALN"
    return "FIRST"
def __get_cds_rel_pos(self, cdss, cds_id, aln_loc):
    '''Tells where the CDS at index cds_id lies relative to the alignment.

    @param [Cds] cdss List of CDSs
    @param (int) cds_id Index into cdss of the CDS we are looking at
    @param (Location) aln_loc Alignment Location
    @return (string)
        LEFT_OF_ALN  - fully left of alignment
        RIGHT_OF_ALN - fully right, or overlapping but not the first to
        FIRST        - first to overlap
    '''
    this_cds = Location.from_location_str(cdss[cds_id].location)

    ends_before_alignment = this_cds.end < aln_loc.start
    if ends_before_alignment:
        return "LEFT_OF_ALN"

    starts_after_alignment = this_cds.start > aln_loc.end
    if starts_after_alignment:
        return "RIGHT_OF_ALN"

    # The CDS reaches the alignment. It is the first such CDS when there
    # is no predecessor, or when the predecessor does not overlap.
    if cds_id == 0:
        return "FIRST"

    predecessor = Location.from_location_str(cdss[cds_id - 1].location)
    if self.__overlap(predecessor, aln_loc):
        return "RIGHT_OF_ALN"
    return "FIRST"
def testInstersectionWithoutComplementInformation(self):
    # The coordinate ranges share 5..10, but only the second location is
    # marked as complement.
    plain = Location.from_location_str('1..10')
    complemented = Location.from_location_str('complement(5..15)')

    ignoring_complement = plain.intersects(complemented, use_complement=False)
    self.assertTrue(ignoring_complement, 'The locations should intersect')

    honouring_complement = plain.intersects(complemented, use_complement=True)
    self.assertFalse(honouring_complement, "The locations should't intersect")
def testSingleLocation(self):
    # Each join() contains a single-point segment; the probe is that point.
    cases = (
        ('join(311,400..854)', 311),
        ('join(14424..14857,1)', 1),
    )
    for location_str, point in cases:
        location = Location.from_location_str(location_str)
        probe = Location.from_location((point, ))
        self.assertTrue(location.intersects(probe),
                        "Location doesn't contain target point")
def testSingleLocationWithTolerance(self):
    # Probe points lie outside the written coordinates of the open-ended
    # ('<' / '>') segments; both locations are parsed with tolerance=10.
    cases = (
        ('join(<10,61..86,162..203,264..318,388..>495)', 5),
        ('join(<1..129,>657)', 660),
    )
    for location_str, point in cases:
        location = Location.from_location_str(location_str, tolerance=10)
        probe = Location.from_location((point, ))
        self.assertTrue(location.intersects(probe),
                        "Location doesn't contain target point")
def testLocationMinimum(self):
    # (location string, expected min(), failure message)
    expectations = (
        ('join(1..10,11..50)', 1, 'Minimum should be 1'),
        ('complement(15..20)', 15, 'Minimum should be 15'),
        ('REF2:5..10', 5, 'Minimum should be 5'),
        ('complement(join(1..10,11..50))', 1, 'Minimum should be 1'),
        ('complement(join(15..20,1..2))', 1, 'Minimum should be 1'),
    )

    # Parse every location before asserting anything, so a parsing
    # problem surfaces ahead of the first assertion.
    parsed = [Location.from_location_str(text)
              for text, _, _ in expectations]

    for location, (_, minimum, message) in zip(parsed, expectations):
        self.assertEqual(location.min(), minimum, message)
def testLocationOverlap(self):
    joined = Location.from_location_str('join(1..10,11..50)')
    complemented = Location.from_location_str('complement(join(1..2,15..20))')

    # start/end are expected to span all sub-intervals of a join().
    self.assertEqual(1, joined.start, "Start should be 1")
    self.assertEqual(50, joined.end, "End should be 50")
    self.assertEqual(1, complemented.start, "Start should be 1")
    self.assertEqual(20, complemented.end, "End should be 20")

    without_complement = joined.overlaps(complemented, use_complement=False)
    self.assertTrue(without_complement, 'Locations should overlap')

    # NOTE(review): this asserts that the locations DO overlap, while the
    # failure message says they should not - confirm which one is intended.
    with_complement = joined.overlaps(complemented, use_complement=True)
    self.assertTrue(with_complement, "Locations shouldn't overlap")
def __find_first_overlapping_CDS_id(self, aln_location, cdss):
    ''' Binary search for the first CDS that overlaps the alignment.

    @param (Location) aln_location Alignment Location
    @param [Cds] cdss List of CDSs, sorted by start
    @returns (int|None) Index in cdss of that CDS, None if no overlap
    '''
    low = 0
    high = len(cdss) - 1

    # Nothing to search in an empty list.
    if high < 0:
        return None

    while low < high:
        middle = low + (high - low) // 2  # '//' for python 3 compatibility
        verdict = self.__get_cds_rel_pos(cdss, middle, aln_location)
        if verdict == "FIRST":
            return middle
        elif verdict == "LEFT_OF_ALN":
            # Everything up to and including middle is left of the alignment.
            low = middle + 1
        elif verdict == "RIGHT_OF_ALN":
            # The first overlapping CDS, if any, is before middle.
            high = middle - 1

    # The loop ended without a "FIRST" verdict; cdss[low] is the only
    # remaining candidate.
    candidate = Location.from_location_str(cdss[low].location)
    if self.__overlap(candidate, aln_location):
        return low
    return None
def __find_first_overlapping_CDS_id(self, aln_location, cdss):
    ''' Finds the index of the first CDS overlapping the given alignment,
    using binary search.

    @param (Location) aln_location Alignment Location
    @param [Cds] cdss List of CDSs, sorted by start
    @returns (int|None) Index in cdss of described CDS, None if no overlap
    '''
    left, right = 0, len(cdss) - 1

    # Empty cdss -> no overlap possible.
    if right < 0:
        return None

    while left < right:
        # '//' for python 3 compatibility
        probe = left + (right - left) // 2
        position = self.__get_cds_rel_pos(cdss, probe, aln_location)

        if position == "FIRST":
            return probe
        elif position == "RIGHT_OF_ALN":
            right = probe - 1
        elif position == "LEFT_OF_ALN":
            left = probe + 1

    # Check the single candidate the search narrowed down to.
    remaining = Location.from_location_str(cdss[left].location)
    return left if self.__overlap(remaining, aln_location) else None
def get_cds_location(self):
    '''Builds a Location from the location string of the associated CDS.

    Args:
        None

    Returns:
        (Location): Result of parsing self.cds.location with
            Location.from_location_str.
    '''
    location_str = self.cds.location
    return Location.from_location_str(location_str)
def testIntersectionLocation(self):
    # --- No intersection: every candidate lies outside 50..100 ---
    base = Location.from_location_str('complement(50..100)')
    disjoint_candidates = (
        'complement(200..500)',
        'complement(1..40)',
        'complement(1..49)',
        'complement(101..150)',
    )
    for candidate in disjoint_candidates:
        self.assertEqual(
            None,
            base.find_intersection(Location.from_location_str(candidate)))

    # --- Single interval: the intersection equals the contained range ---
    wide = Location.from_location_str('10..100')
    narrow = Location.from_location_str('50..100')
    common = wide.find_intersection(narrow)
    self.assertEqual(narrow.start, common.start,
                     "Start intersection position doesn't match")
    self.assertEqual(narrow.end, common.end,
                     "End intersection position doesn't match")
    self.assertEqual(narrow.complement, common.complement,
                     "Complement intersection information doesn't match")

    # --- Multiple intervals: compared sublocation by sublocation ---
    joined = Location.from_location_str('join(1..40,60..80,120..200)')
    aln_location = Location.from_location_str('30..130')
    expected = Location.from_location_str('join(30..40,60..80,120..130)')
    common = joined.find_intersection(aln_location)
    pairs = zip(common.sublocations, expected.sublocations)
    for actual_sub, expected_sub in pairs:
        self.assertEqual(expected_sub.start, actual_sub.start,
                         "Start intersection position doesn't match")
        self.assertEqual(expected_sub.end, actual_sub.end,
                         "End intersection position doesn't match")
    self.assertEqual(expected.complement, common.complement,
                     "Complement intersection information doesn't match")
def _calc_coverage(self, cds_aln):
    """ Computes the coverage of a cds alignment.

    The lengths of all aligned regions are added up and the total is
    divided by the length of the cds location.

    @param (CdsAlignment) cds_aln
    @return (float) coverage
    """
    # Each aligned region (CdsAlnSublocation) carries a Location; only
    # its length matters here.
    aligned_length = sum(region.location.length()
                         for region in cds_aln.aligned_regions.values())
    cds_length = Location.from_location_str(cds_aln.cds.location).length()
    return aligned_length / float(cds_length)
def calc_cds_coverage(cds_aln):
    """ Computes the coverage of a cds.

    Coverage is the summed length of all aligned regions divided by the
    length of the cds, i.e. the average number of reads per base of cds.

    @param (CdsAlignment) cds_aln
    @return (float) Cds coverage.
    """
    total_aligned = 0
    # Values of aligned_regions are of type CdsAlnSublocation; their
    # .location is of type Location.
    for region in cds_aln.aligned_regions.values():
        total_aligned += region.location.length()

    cds_location = Location.from_location_str(cds_aln.cds.location)
    return total_aligned / float(cds_location.length())
def testIntersectionsWithTolerance(self):
    location = Location.from_location_str(
        "complement(join(<197..1301,2070..>2451))", tolerance=100)

    # (probe coordinates, expected result for the complemented probe).
    # The non-complemented probe is expected to miss in every case.
    probes = (
        ((100, ), True),
        ((300, ), True),
        ((50, 100), True),
        ((300, 400), True),
        ((1200, 1400), True),
        ((2500, 2600), True),
        ((3000, 4000), False),
    )

    for coordinates, hit_when_complemented in probes:
        plain_probe = Location.from_location(coordinates)
        self.assertFalse(location.intersects(plain_probe))

        complemented_probe = Location.from_location(coordinates,
                                                    complement=True)
        if hit_when_complemented:
            self.assertTrue(location.intersects(complemented_probe))
        else:
            self.assertFalse(location.intersects(complemented_probe))
def _calc_coverage(self, cds_aln):
    """ Returns the coverage of the given cds alignment.

    Coverage = (sum of lengths of aligned regions) / (length of cds).

    @param (CdsAlignment) cds_aln
    @return (float) coverage
    """
    # An aligned region is the part of a read that intersects with the
    # cds; collect the length of each region's Location.
    region_lengths = [aln_reg.location.length()
                      for aln_reg in cds_aln.aligned_regions.values()]

    cds_location = Location.from_location_str(cds_aln.cds.location)
    denominator = float(cds_location.length())
    return sum(region_lengths) / denominator
def testParsesMultisegmentLocation(self):
    # Three segments carry an explicit reference; the last one does not.
    location_str = ('join(AF178221.1:<1..60,AF178222.1:1..63,'
                    'AF178223.1:1..42, 1..>90)')
    location = Location.from_location_str(location_str)

    probe = Location.from_location((80, ))
    self.assertTrue(location.intersects(probe),
                    "Location doesn't contain target point")

    for accession in ('AF178221.1', 'AF178222.1', 'AF178223.1'):
        self.assertIn(accession, location.references(),
                      'Reference %s not parsed' % accession)

    reference_count = len(location.references())
    self.assertTrue(reference_count == 3, 'Wrong number of references')
def determine_coding_seqs(self, record_container):
    ''' Determines which of the CDSs in the record aligned_regions
    aligned to the read.

    Resets self.aligned_cdss to an empty list, fetches the record for
    self.nucleotide_accession and parses self.location_span into a
    Location.

    @param record_container object providing fetch_record(accession)
    @return list of tuples (cds, intersecting_location) if such exist,
            None if record is not available from the database,
            empty list if the alignment span cannot be parsed
    '''
    self.aligned_cdss = []
    record = record_container.fetch_record(self.nucleotide_accession)

    # if not possible to fetch a record from the db, return None
    if not record:
        return None

    (start, stop) = self.location_span
    try:
        # NOTE(review): nothing in this method uses `location` after it is
        # parsed - confirm whether a CDS loop is meant to follow.
        location = Location.from_location_str("%d..%d" % (start, stop))
    except LoactionParsingException as e:
        # 'except ... as' and a single-argument print() are valid on both
        # Python 2.6+ and Python 3; the printed text is unchanged.
        print("ReadAlignment/determine_coding_seqs: %s" % e)
        self.aligned_cdss = []
        return self.aligned_cdss
def determine_coding_seqs(self, record_container):
    ''' Determines which of the CDSs in the record aligned_regions
    aligned to the read.

    Resets self.aligned_cdss to an empty list, fetches the record for
    self.nucleotide_accession and parses self.location_span into a
    Location.

    @param record_container object providing fetch_record(accession)
    @return list of tuples (cds, intersecting_location) if such exist,
            None if record is not available from the database,
            empty list if the alignment span cannot be parsed
    '''
    self.aligned_cdss = []
    record = record_container.fetch_record(self.nucleotide_accession)

    # if not possible to fetch a record from the db, return None
    if not record:
        return None

    (start, stop) = self.location_span
    try:
        # NOTE(review): nothing in this method uses `location` after it is
        # parsed - confirm whether a CDS loop is meant to follow.
        location = Location.from_location_str("%d..%d" % (start, stop))
    except LoactionParsingException as e:
        # 'except ... as' and a single-argument print() are valid on both
        # Python 2.6+ and Python 3; the printed text is unchanged.
        print("ReadAlignment/determine_coding_seqs: %s" % e)
        self.aligned_cdss = []
        return self.aligned_cdss
def add_cds_to_organism(organism, read, target_alignment):
    # Registers the read with one CDS of the alignment on the organism:
    # the only aligned CDS, or - when there are several - the one whose
    # location is longest.
    candidates = target_alignment.aligned_cdss
    assert len(candidates) >= 1

    binned_read = resdata.BinnedRead(read.id)

    if len(candidates) == 1:
        # Entries are (cds, intersection) pairs; only the cds is needed.
        target_cds, _ = candidates[0]
    else:
        only_cdss = [cds for cds, _ in candidates]
        by_length = sorted(
            only_cdss,
            key=lambda cds: Location.from_location_str(cds.location).length())
        # Ascending sort, so the longest CDS is the last element.
        target_cds = by_length[-1]

    if organism.contains_identified_coding_region(target_cds):
        # CDS already known for this organism: just attach the read.
        identified_cds = organism.identified_coding_regions[target_cds]
        identified_cds.add_binned_read(binned_read)
        return

    # First read for this CDS: create the entry, attach the read, register.
    identified_cds = resdata.IdentifiedCds(target_cds)
    identified_cds.add_binned_read(binned_read)
    organism.add_identified_coding_region(identified_cds)
def determine_coding_seqs_optimal(self, record):
    ''' Determines which of the CDSs in the record aligned_regions
    aligned to the read.

    Resets self.aligned_cdss to an empty list and parses
    self.location_span into the alignment Location.

    @param (UnityRecord) record Record that is used
    @return list of tuples (cds, intersecting_location) if such exist,
            None if record is not available from the database,
            empty list if the alignment span cannot be parsed
    '''
    self.aligned_cdss = []

    # If not possible to fetch a record from the db, return None
    if not record:
        return None

    # Acquire alignment Location
    (start, stop) = self.location_span
    try:
        # NOTE(review): nothing in this method uses `aln_location` after it
        # is parsed - confirm whether the CDS search is meant to follow.
        aln_location = Location.from_location_str("%d..%d" % (start, stop))
    except LoactionParsingException as e:
        # 'except ... as' and a single-argument print() are valid on both
        # Python 2.6+ and Python 3; the printed text is unchanged.
        print("ReadAlignment/determine_coding_seqs: %s" % e)
        self.aligned_cdss = []
        return self.aligned_cdss
def determine_coding_seqs_optimal(self, record):
    ''' Determines which of the CDSs in the record aligned_regions
    aligned to the read.

    Resets self.aligned_cdss to an empty list and parses
    self.location_span into the alignment Location.

    @param (UnityRecord) record Record that is used
    @return list of tuples (cds, intersecting_location) if such exist,
            None if record is not available from the database,
            empty list if the alignment span cannot be parsed
    '''
    self.aligned_cdss = []

    # If not possible to fetch a record from the db, return None
    if not record:
        return None

    # Acquire alignment Location
    (start, stop) = self.location_span
    try:
        # NOTE(review): nothing in this method uses `aln_location` after it
        # is parsed - confirm whether the CDS search is meant to follow.
        aln_location = Location.from_location_str("%d..%d" % (start, stop))
    except LoactionParsingException as e:
        # 'except ... as' and a single-argument print() are valid on both
        # Python 2.6+ and Python 3; the printed text is unchanged.
        print("ReadAlignment/determine_coding_seqs: %s" % e)
        self.aligned_cdss = []
        return self.aligned_cdss
class ReadAlnLocation(object):
    """ Contains information on alignment location on an NT nucleotide string
    """

    def __init__(self, read_id, nucleotide_accession, db_source, genome_index,
                 score, location_span, complement, active=True):
        ''' Stores the alignment attributes exactly as given.

        @param read_id              stored as self.read_id
        @param nucleotide_accession accession passed to fetch_record() by
                                    determine_coding_seqs()
        @param db_source            stored as self.db_source
        @param genome_index         stored as self.genome_index
        @param score                stored as self.score
        @param location_span        (start, stop) pair; formatted with %d
                                    by determine_coding_seqs()
        @param complement           stored as self.complement
        @param active (boolean)     active status, True by default
        '''
        self.read_id = read_id
        self.nucleotide_accession = nucleotide_accession
        self.db_source = db_source
        self.genome_index = genome_index
        self.score = score
        self.location_span = location_span
        self.complement = complement
        self.active = active
        # self.determine_coding_seqs()
        # NOTE: .aligned_cdss is not created here; it only exists after
        # determine_coding_seqs() has been called. Likewise
        # .potential_host only exists after set_potential_host_status().

    def set_active(self, active):
        ''' Sets active status for the read alignment.
        Inactive reads do not go into CDS alignments.
        '''
        self.active = active

    def set_potential_host_status(self, potential_host):
        ''' Set to true if organism is potential host
        [child of animalia kingdom]
        @param potential_host (boolean)
        '''
        self.potential_host = potential_host

    def is_potential_host(self):
        """ Returns true if organism is potential host
        (child of animalia kingdom), false otherwise.

        The value is whatever was last passed to
        set_potential_host_status(); the attribute does not exist before
        that call.
        @return (boolean)
        """
        return self.potential_host

    def determine_coding_seqs(self, record_container):
        ''' Determines which of the CDSs in the record aligned_regions
        aligned to the read.

        Every CDS of the fetched record whose location intersects the
        alignment span is stored in self.aligned_cdss together with the
        intersection. CDSs whose location string cannot be parsed are
        reported on stdout and skipped.

        @param record_container object providing fetch_record(accession)
        @return list of tuples (cds, intersecting_location) if such exist
                (empty list if none intersect or the alignment span cannot
                be parsed), None if record is not available from the
                database
        '''
        self.aligned_cdss = []
        record = record_container.fetch_record(self.nucleotide_accession)

        # if not possible to fetch a record from the db, return None
        if not record:
            return None

        (start, stop) = self.location_span
        try:
            location = Location.from_location_str("%d..%d" % (start, stop))
        except LoactionParsingException as e:
            # 'except ... as' and a single-argument print() are valid on
            # both Python 2.6+ and Python 3; the printed text is unchanged.
            print("ReadAlignment/determine_coding_seqs: %s" % e)
            self.aligned_cdss = []
            return self.aligned_cdss

        for cds in record.cds:
            try:
                cds_location = Location.from_location_str(cds.location)
            except LoactionParsingException as e:
                # Best effort: report the unparsable CDS and move on.
                print("ReadAlignment/determine_coding_seqs: %s" % e)
                continue
            location_intersection = cds_location.find_intersection(location)
            if location_intersection is not None:
                self.aligned_cdss.append((cds, location_intersection))

        # Return the collected list, as documented and as the error path
        # above already does.
        return self.aligned_cdss
except LoactionParsingException, e: print "ReadAlignment/determine_coding_seqs:", e self.aligned_cdss = [] return self.aligned_cdss # Determine first overlapping CDS - binary search first_ovp_id = self.__find_first_overlapping_CDS_id (aln_location, record.cds) # No CDS from the list overlaps - return [] if (first_ovp_id == None): return self.aligned_cdss # Determine following overlapping CDSs - loop while overlaps for i in range(first_ovp_id, len(record.cds)): cds = record.cds[i] cds_location = Location.from_location_str(cds.location) # If this one does not overlap, the others also won't because it's sorted if not self.__overlap(cds_location, aln_location): break location_intersection = cds_location.find_intersection (aln_location) if location_intersection is not None: self.aligned_cdss.append ((cds, location_intersection)) return self.aligned_cdss # ---------------------------------------------------------------------------- # def set_type (self): """ Location can be coding or non-coding
def testParseReferenceLocation(self):
    # A location string prefixed with a reference name must still parse
    # and intersect a probe that shares coordinates 5..10 with it.
    referenced = Location.from_location_str('REFERENCE:1..10')
    probe = Location.from_location((5, 15))
    self.assertTrue(referenced.intersects(probe))
def matches(self, location, complement, tolerance):
    # Parse this object's own location string with the given tolerance,
    # build a Location from the supplied coordinates/complement flag, and
    # report whether the two intersect.
    own_location = Location.from_location_str(self.location, tolerance)
    probe = Location.from_location(location, complement)
    return own_location.intersects(probe)
def testParseOrderLocation(self):
    # order(...) strings must parse; the probe 3..4 touches both segments.
    ordered = Location.from_location_str('order(1..3,4..6)')
    probe = Location.from_location((3, 4))
    self.assertTrue(ordered.intersects(probe),
                    "Location doesn't contain target point")
def testParsesLocationWithSpaces(self):
    # The segment after the embedded space must still be parsed: the
    # probe is its first coordinate.
    spaced = Location.from_location_str(
        'join(620..987, 1010..1170,1194..1443)')
    probe = Location.from_location((1010, ))
    self.assertTrue(spaced.intersects(probe),
                    "Location doesn't contain target point")
def testLocationContains(self):
    # Each case: (outer, inner, expected contains() result,
    #             extra keyword arguments, failure message).
    # The failure messages of the last two cases used to read
    # "should contain" although the assertion expects False; they now
    # state the expectation that is actually checked.
    cases = (
        ('1..10', '5..10', True, {},
         '1..10 should contain 5..10'),
        ('complement(<23..50)', 'complement(24..50)', True, {},
         'complement(<23..50) should contain '
         'complement(24..50)'),
        ('join(1..10,20..30)', 'join(2..8,25..28)', True, {},
         'join(1..10,20..30) should contain '
         'join(2..8,25..28)'),
        ('join(complement(1..10),complement(20..30))',
         'join(complement(2..8),complement(25..28))', True, {},
         'join(complement(1..10),complement(20..30)) should contain '
         'join(complement(2..8),complement(25..28))'),
        ('1..10', '4..20', False, {},
         '1..10 should not contain 4..20'),
        ('1..10', '15..20', False, {},
         '1..10 should not contain 15..20'),
        ('complement(1..10)', 'complement(4..20)', False, {},
         'complement(1..10) should not contain '
         'complement(4..20)'),
        ('complement(1..10)', 'complement(15..20)', False, {},
         'complement(1..10) should not contain '
         'complement(15..20)'),
        ('1..10', 'complement(1..10)', False, {},
         '1..10 should not contain complement(1..10)'),
        ('1..10', 'complement(15..20)', False, {},
         '1..10 should not contain complement(15..20)'),
        ('join(1..10,11..50)', '15..20', True, {},
         'join(1..10,11..50) should contain 15..20'),
        ('join(1..10,11..50)', 'complement(15..20)', True,
         {'use_complement': False},
         'join(1..10,11..50) should contain '
         'complement(15..20) without complement information'),
        # Locations with differing reference information are expected
        # not to contain one another.
        ('REF1:1..10', '5..10', False, {},
         'REF1:1..10 should not contain 5..10'),
        ('REF1:1..10', 'REF2:5..10', False, {},
         'REF1:1..10 should not contain REF2:5..10'),
    )

    for outer_str, inner_str, expected, options, message in cases:
        outer = Location.from_location_str(outer_str)
        inner = Location.from_location_str(inner_str)
        contained = outer.contains(inner, **options)
        if expected:
            self.assertTrue(contained, message)
        else:
            self.assertFalse(contained, message)
print "ReadAlignment/determine_coding_seqs:", e self.aligned_cdss = [] return self.aligned_cdss # Determine first overlapping CDS - binary search first_ovp_id = self.__find_first_overlapping_CDS_id( aln_location, record.cds) # No CDS from the list overlaps - return [] if (first_ovp_id == None): return self.aligned_cdss # Determine following overlapping CDSs - loop while overlaps for i in range(first_ovp_id, len(record.cds)): cds = record.cds[i] cds_location = Location.from_location_str(cds.location) # If this one does not overlap, the others also won't because it's sorted if not self.__overlap(cds_location, aln_location): break location_intersection = cds_location.find_intersection( aln_location) if location_intersection is not None: self.aligned_cdss.append((cds, location_intersection)) return self.aligned_cdss # ---------------------------------------------------------------------------- # def set_type(self):