def _get_repeat_features(self, db, klass, target_coord, query_coord,
                         where_feature):
    """Yield one ``klass`` instance per matching repeat_feature record.

    Records are selected from the ``repeat_feature`` table of ``db`` for
    the seq_region of ``query_coord``; the select is then passed through
    ``location_query`` together with the Ensembl start/end of
    ``query_coord`` and ``where_feature``.

    Arguments:
        - db: database object providing ``getTable``
        - klass: class instantiated for every record
        - target_coord: coordinate whose CoordName/CoordType decide
          whether the record location is converted
        - query_coord: coordinate used to select the records
        - where_feature: forwarded unchanged to ``location_query``
    """
    # Repeats are built using coordinates from the repeat_feature table;
    # the repeat_consensus_id is required to get the repeat name, class
    # and type.
    table = db.getTable('repeat_feature')
    on_seq_region = table.c.seq_region_id == query_coord.seq_region_id
    base_query = sql.select([table], on_seq_region)
    query = location_query(table, query_coord.EnsemblStart,
                           query_coord.EnsemblEnd, query=base_query,
                           where=where_feature)

    for row in query.execute():
        location = Coordinate(self,
                              CoordName=query_coord.CoordName,
                              Start=row['seq_region_start'],
                              End=row['seq_region_end'],
                              seq_region_id=row['seq_region_id'],
                              Strand=row['seq_region_strand'],
                              ensembl_coord=True)
        if query_coord.CoordName != target_coord.CoordName:
            # exactly one conversion is expected; element [1] of that
            # conversion becomes the reported location
            converted = get_coord_conversion(location,
                                             target_coord.CoordType,
                                             self.CoreDb)
            location = asserted_one(converted)[1]

        # TODO: fix here if query_coord and target_coord have different
        # coordName (candidate: coord.makeRelativeTo)
        yield klass(self, db, Location=location, Score=row['score'],
                    data=row)
def _get_simple_features(self, db, klass, target_coord, query_coord,
                         where_feature):
    """Yield ``klass`` instances for simple_feature records.

    Returns feature_type records for the query_coord from the
    simple_feature table. The returned coord is referenced to
    target_coord. At present, only CpG islands are queried.

    Arguments:
        - db: database object providing ``getTable``
        - klass: class instantiated for every record
        - target_coord: coordinate the yielded location refers to
        - query_coord: coordinate used to select the records
        - where_feature: forwarded unchanged to ``location_query``
    """
    table = db.getTable('simple_feature')
    wanted_types = ['CpGisland']
    # NOTE(review): the ids are stringified before being handed to in_();
    # a missing type becomes the string 'None' -- confirm this is intended.
    analysis_ids = [str(self._feature_type_ids.get(name))
                    for name in wanted_types]

    # fix the following
    matches = sql.and_(
        table.c.analysis_id.in_(analysis_ids),
        table.c.seq_region_id == query_coord.seq_region_id)
    base_query = sql.select([table], matches)
    query = location_query(table, query_coord.EnsemblStart,
                           query_coord.EnsemblEnd, query=base_query,
                           where=where_feature)

    for row in query.execute():
        location = Coordinate(self,
                              CoordName=query_coord.CoordName,
                              Start=row['seq_region_start'],
                              End=row['seq_region_end'],
                              seq_region_id=row['seq_region_id'],
                              Strand=row['seq_region_strand'],
                              ensembl_coord=True)
        if query_coord.CoordName != target_coord.CoordName:
            # exactly one conversion is expected; element [1] of that
            # conversion becomes the reported location
            converted = get_coord_conversion(location,
                                             target_coord.CoordType,
                                             self.CoreDb)
            location = asserted_one(converted)[1]

        # TODO: fix here if query_coord and target_coord have different
        # coordName (candidate: coord.makeRelativeTo)
        yield klass(self, db, Location=location, Score=row['score'])
def _get_gene_features(self, db, klass, target_coord, query_coord,
                       where_feature):
    """Yield one ``klass`` instance per gene record found for query_coord.

    Arguments:
        - db: database object providing ``getTable`` and ``Type``
        - klass: class instantiated for every record
        - target_coord: coordinate whose CoordName/CoordType decide
          whether the record location is converted
        - query_coord: coordinate used to select the records
        - where_feature: forwarded unchanged to ``location_query``

    When ``query_coord`` and ``target_coord`` have different CoordName
    values, the location built from the record is converted with
    ``get_coord_conversion`` (exactly one conversion is asserted) before
    being handed to ``klass``.
    """
    # the xref table is only used for core databases
    xref_table = db.getTable('xref') if db.Type == 'core' else None
    gene_table = db.getTable('gene')

    # after release 65, the gene_id_table is removed. The following is to
    # maintain support for earlier releases.
    if self.GeneralRelease >= 65:
        gene_id_table = None
    else:
        gene_id_table = db.getTable('gene_stable_id')

    # note gene records are at chromosome, not contig, level
    condition = gene_table.c.seq_region_id == query_coord.seq_region_id
    query = self._build_gene_query(db, condition, gene_table,
                                   gene_id_table, xref_table)
    query = location_query(gene_table, query_coord.EnsemblStart,
                           query_coord.EnsemblEnd, query=query,
                           where=where_feature)

    for record in query.execute():
        new = Coordinate(self, CoordName=query_coord.CoordName,
                         Start=record['seq_region_start'],
                         End=record['seq_region_end'],
                         Strand=record['seq_region_strand'],
                         seq_region_id=record['seq_region_id'],
                         ensembl_coord=True)

        if query_coord.CoordName != target_coord.CoordName:
            # The conversion operates on `new`, the coordinate built from
            # this record, and its result is what gets yielded.
            converted = get_coord_conversion(new, target_coord.CoordType,
                                             self.CoreDb)
            new = asserted_one(converted)[1]

        yield klass(self, db, Location=new, data=record)
def _get_repeat_features(self, db, klass, target_coord, query_coord,
                         where_feature):
    """Generate ``klass`` objects for the repeats located on query_coord.

    The ``repeat_feature`` table of ``db`` is queried for rows on the
    seq_region of ``query_coord``; ``location_query`` receives that
    select plus the Ensembl start/end of ``query_coord`` and
    ``where_feature``. Every resulting row is turned into a Coordinate
    and yielded wrapped in ``klass`` along with its score and the row.
    """
    # Coordinates come from the repeat_feature table; the
    # repeat_consensus_id is required to get the repeat name, class and
    # type.
    repeats = db.getTable("repeat_feature")
    selection = sql.select(
        [repeats], repeats.c.seq_region_id == query_coord.seq_region_id
    )
    selection = location_query(
        repeats,
        query_coord.EnsemblStart,
        query_coord.EnsemblEnd,
        query=selection,
        where=where_feature,
    )

    for hit in selection.execute():
        loc = Coordinate(
            self,
            CoordName=query_coord.CoordName,
            Start=hit["seq_region_start"],
            End=hit["seq_region_end"],
            seq_region_id=hit["seq_region_id"],
            Strand=hit["seq_region_strand"],
            ensembl_coord=True,
        )
        names_differ = query_coord.CoordName != target_coord.CoordName
        if names_differ:
            # a single conversion is asserted; its element [1] replaces
            # the location built from the row
            conversions = get_coord_conversion(
                loc, target_coord.CoordType, self.CoreDb
            )
            loc = asserted_one(conversions)[1]

        # TODO: fix here if query_coord and target_coord have different
        # coordName (candidate: coord.makeRelativeTo)
        yield klass(self, db, Location=loc, Score=hit["score"], data=hit)
def getFeatures(self, region=None, feature_types=None, where_feature=None,
                CoordName=None, Start=None, End=None, Strand=None,
                ensembl_coord=False):
    """Yield feature instances of the requested types for a location.

    Arguments:
        - region: the location to query. If None, a Coordinate is built
          from CoordName, Start, End, Strand and ensembl_coord. If the
          object has a ``Location`` attribute, that attribute is used.
        - feature_types: a string or a series of strings (compared
          case-insensitively). Valid values are 'cpg', 'repeat', 'gene',
          'est' and 'variation'.
        - where_feature: forwarded to ``get_coord_conversion`` and to the
          per-type query method.
        - CoordName, Start, End, Strand, ensembl_coord: only used when
          region is None.

    Raises RuntimeError if any requested feature type is not one of the
    valid values.
    """
    if isinstance(feature_types, str):
        feature_types = [feature_types]

    feature_types = [ft.lower() for ft in feature_types]
    feature_coord_levels = self._get_feature_coord_levels(feature_types)

    if region is None:
        seq_region_id = self._get_seq_region_id(CoordName)
        region = Coordinate(self, CoordName=CoordName, Start=Start,
                            End=End, Strand=convert_strand(Strand),
                            seq_region_id=seq_region_id,
                            ensembl_coord=ensembl_coord)
    elif hasattr(region, 'Location'):
        region = region.Location

    coord = region
    # the coordinate system at which locations are to be referenced, and
    # the processing function
    target_coords_funcs = dict(
        cpg=(self._get_simple_features, CpGisland),
        repeat=(self._get_repeat_features, Repeat),
        gene=(self._get_gene_features, Gene),
        est=(self._get_gene_features, Est),
        variation=(self._get_variation_features, Variation))

    known_types = set(target_coords_funcs.keys())
    # only the requested types that are not known belong in the message
    unknown_types = set(feature_types) - known_types
    if unknown_types:
        raise RuntimeError(
            'Unknown feature[%s], valid feature_types are: %s'
            % (unknown_types, known_types))

    for feature_type in feature_types:
        target_func, target_class = target_coords_funcs[feature_type]
        # est features are queried from the other-features database
        db = self.CoreDb
        if feature_type == 'est':
            db = self.OtherFeaturesDb

        feature_coords = feature_coord_levels[feature_type].levels
        for feature_coord in feature_coords:
            chrom_other_coords = get_coord_conversion(coord, feature_coord,
                                                      db,
                                                      where=where_feature)
            for chrom_coord, other_coord in chrom_other_coords:
                for feature in target_func(db, target_class, chrom_coord,
                                           other_coord, where_feature):
                    yield feature
def test_get_coord_conversion(self):
    """should correctly map between different coordinate levels"""
    # not really testing the contig coordinates are correct
    CoordName, Start, End, Strand = '1', 1000, 1000000, 1
    human_loc = Coordinate(CoordName=CoordName, Start=Start, End=End,
                           Strand=Strand, genome=human)
    conversions = get_coord_conversion(human_loc, 'contig', human.CoreDb)
    for conversion in conversions:
        # element 0 of each conversion must lie within the queried span
        query_side = conversion[0]
        self.assertEqual(query_side.CoordName, CoordName)
        self.assertGreaterEqual(query_side.Start, Start)
        self.assertLessEqual(query_side.End, End)
        self.assertEqual(query_side.Strand, Strand)
def test_get_coord_conversion(self):
    """should correctly map between different coordinate levels"""
    # not really testing the contig coordinates are correct
    name = '1'
    begin = 1000
    finish = 1000000
    strand = 1
    human_loc = Coordinate(CoordName=name, Start=begin, End=finish,
                           Strand=strand, genome=human)
    for mapping in get_coord_conversion(human_loc, 'contig', human.CoreDb):
        # the first member of each mapping is checked against the query
        first = mapping[0]
        self.assertTrue(first.CoordName == name)
        self.assertTrue(first.Start >= begin)
        self.assertTrue(first.End <= finish)
        self.assertTrue(first.Strand == strand)
def getFeatures(self, region=None, feature_types=None, where_feature=None,
                CoordName=None, Start=None, End=None, Strand=None,
                ensembl_coord=False):
    """Yield feature instances of the requested types for a location.

    Arguments:
        - region: the location to query. If None, a Coordinate is built
          from CoordName, Start, End, Strand and ensembl_coord. If the
          object has a ``Location`` attribute, that attribute is used.
        - feature_types: a string or a series of strings (compared
          case-insensitively). Valid values are 'cpg', 'repeat', 'gene',
          'est' and 'variation'.
        - where_feature: forwarded to ``get_coord_conversion`` and to the
          per-type query method.
        - CoordName, Start, End, Strand, ensembl_coord: only used when
          region is None.

    Raises RuntimeError if any requested feature type is not one of the
    valid values.
    """
    if isinstance(feature_types, str):
        feature_types = [feature_types]

    feature_types = [ft.lower() for ft in feature_types]
    feature_coord_levels = self._get_feature_coord_levels(feature_types)

    if region is None:
        seq_region_id = self._get_seq_region_id(CoordName)
        region = Coordinate(self, CoordName=CoordName, Start=Start,
                            End=End, Strand=convert_strand(Strand),
                            seq_region_id=seq_region_id,
                            ensembl_coord=ensembl_coord)
    elif hasattr(region, 'Location'):
        region = region.Location

    coord = region
    # the coordinate system at which locations are to be referenced, and
    # the processing function
    target_coords_funcs = dict(
        cpg=(self._get_simple_features, CpGisland),
        repeat=(self._get_repeat_features, Repeat),
        gene=(self._get_gene_features, Gene),
        est=(self._get_gene_features, Est),
        variation=(self._get_variation_features, Variation))

    known_types = set(target_coords_funcs.keys())
    # only the requested types that are not known belong in the message
    unknown_types = set(feature_types) - known_types
    if unknown_types:
        raise RuntimeError(
            'Unknown feature[%s], valid feature_types are: %s'
            % (unknown_types, known_types))

    for feature_type in feature_types:
        target_func, target_class = target_coords_funcs[feature_type]
        # est features are queried from the other-features database
        db = self.CoreDb
        if feature_type == 'est':
            db = self.OtherFeaturesDb

        feature_coords = feature_coord_levels[feature_type].levels
        for feature_coord in feature_coords:
            chrom_other_coords = get_coord_conversion(coord, feature_coord,
                                                      db,
                                                      where=where_feature)
            for chrom_coord, other_coord in chrom_other_coords:
                for feature in target_func(db, target_class, chrom_coord,
                                           other_coord, where_feature):
                    yield feature
def _get_simple_features(self, db, klass, target_coord, query_coord,
                         where_feature):
    """Generate ``klass`` objects for simple_feature rows on query_coord.

    Returns feature_type records for the query_coord from the
    simple_feature table. The returned coord is referenced to
    target_coord. At present, only CpG islands are queried.
    """
    features = db.getTable('simple_feature')
    type_names = ['CpGisland']
    type_ids = [self._feature_type_ids.get(name) for name in type_names]

    # fix the following
    has_wanted_type = features.c.analysis_id.in_(type_ids)
    on_seq_region = features.c.seq_region_id == query_coord.seq_region_id
    selection = sql.select([features],
                           sql.and_(has_wanted_type, on_seq_region))
    selection = location_query(features, query_coord.EnsemblStart,
                               query_coord.EnsemblEnd, query=selection,
                               where=where_feature)

    for hit in selection.execute():
        loc = Coordinate(self,
                         CoordName=query_coord.CoordName,
                         Start=hit['seq_region_start'],
                         End=hit['seq_region_end'],
                         seq_region_id=hit['seq_region_id'],
                         Strand=hit['seq_region_strand'],
                         ensembl_coord=True)
        if query_coord.CoordName != target_coord.CoordName:
            # a single conversion is asserted; its element [1] replaces
            # the location built from the row
            conversions = get_coord_conversion(loc, target_coord.CoordType,
                                               self.CoreDb)
            loc = asserted_one(conversions)[1]

        # TODO: fix here if query_coord and target_coord have different
        # coordName (candidate: coord.makeRelativeTo)
        yield klass(self, db, Location=loc, Score=hit['score'])
def _get_sequence_from_direct_assembly(coord=None, DEBUG=False):
    """Return the sequence for coord assembled from sequence-level records.

    ``coord`` is converted to the sequence-level coordinate type of its
    genome; the sequence of every converted piece is fetched from the
    ``dna`` table and the pieces are combined by ``_assemble_seq``.

    Note that ``coord.Strand`` is set to 1 on the object passed in.

    Raises NoItemError if the conversion produces nothing.
    """
    # TODO clean up use of a coord
    genome = coord.genome
    # no matter what strand user provide, we get the + sequence first
    coord.Strand = 1
    seq_level_type = CoordSystem(species=genome.Species,
                                 core_db=genome.CoreDb, seq_level=True)

    if DEBUG:
        print('Created Coordinate:', coord, coord.EnsemblStart, coord.EnsemblEnd)
        print(coord.CoordType, seq_level_type)

    assemblies = get_coord_conversion(coord, seq_level_type, genome.CoreDb)
    if not assemblies:
        raise NoItemError('no assembly for %s' % coord)

    dna_table = genome.CoreDb.getTable('dna')
    pieces = []
    spans = []
    for query_loc, target_loc in assemblies:
        assert query_loc.Strand == 1
        # get MySQL to do the string slicing via substr function
        sliced = substr(dna_table.c.sequence, target_loc.EnsemblStart,
                        len(target_loc)).label('sequence')
        query = sql.select(
            [sliced], dna_table.c.seq_region_id == target_loc.seq_region_id)
        row = asserted_one(query.execute().fetchall())
        piece = DNA.makeSequence(row['sequence'])
        if target_loc.Strand == -1:
            # reverse complement pieces stored on the minus strand
            piece = piece.rc()
        pieces.append(str(piece))
        spans.append((query_loc.Start, query_loc.End))

    return _assemble_seq(pieces, coord.Start, coord.End, spans)
def _get_sequence_from_direct_assembly(coord=None, DEBUG=False):
    """Return the sequence for coord assembled from sequence-level records.

    ``coord`` is converted to the sequence-level coordinate type of its
    genome; the sequence of every converted piece is fetched from the
    ``dna`` table and the pieces are combined by ``_assemble_seq``.

    Arguments:
        - coord: the location whose sequence is wanted. Its ``Strand``
          attribute is set to 1 by this function.
        - DEBUG: if True, prints the coordinate and coordinate types.

    Raises NoItemError if the conversion produces nothing.
    """
    # TODO clean up use of a coord
    genome = coord.genome
    # no matter what strand user provide, we get the + sequence first
    coord.Strand = 1
    species = genome.Species
    coord_type = CoordSystem(species=species, core_db=genome.CoreDb,
                             seq_level=True)

    if DEBUG:
        # a single pre-formatted argument prints identically whether
        # print is a statement or a function
        print('Created Coordinate: %s %s %s'
              % (coord, coord.EnsemblStart, coord.EnsemblEnd))
        print('%s %s' % (coord.CoordType, coord_type))

    assemblies = get_coord_conversion(coord, coord_type, genome.CoreDb)
    if not assemblies:
        raise NoItemError('no assembly for %s' % coord)

    dna = genome.CoreDb.getTable('dna')
    seqs, positions = [], []
    for q_loc, t_loc in assemblies:
        assert q_loc.Strand == 1
        length = len(t_loc)
        # get MySQL to do the string slicing via substr function
        query = sql.select(
            [substr(dna.c.sequence, t_loc.EnsemblStart,
                    length).label('sequence')],
            dna.c.seq_region_id == t_loc.seq_region_id)
        record = asserted_one(query.execute().fetchall())
        seq = DNA.makeSequence(record['sequence'])
        if t_loc.Strand == -1:
            # reverse complement pieces stored on the minus strand
            seq = seq.rc()
        seqs.append(str(seq))
        positions.append((q_loc.Start, q_loc.End))

    sequence = _assemble_seq(seqs, coord.Start, coord.End, positions)
    return sequence
def get_lower_coord_conversion(coord, species, core_db):
    """Convert coord to the first lower-ranked coordinate type that works.

    Ranks strictly between that of ``coord.CoordType`` and that of the
    sequence-level coordinate type are tried in increasing order. For
    each rank that has a coordinate system, ``get_coord_conversion`` is
    called and the first truthy result is returned. If none is truthy,
    the result of the last attempt is returned (None if no attempt was
    made).
    """
    systems = CoordSystem(species=species, core_db=core_db)
    seq_level_type = CoordSystem(species=species, core_db=core_db,
                                 seq_level=True)
    start_rank = systems[coord.CoordType].rank
    stop_rank = systems[seq_level_type].rank

    converted = None
    for rank in range(start_rank + 1, stop_rank):
        # name of the first coordinate system holding this rank, if any
        type_name = next((systems[key].name for key in systems.keys()
                          if systems[key].rank == rank), None)
        if type_name is None:
            continue

        converted = get_coord_conversion(coord, type_name, core_db)
        if converted:
            break

    return converted
def get_lower_coord_conversion(coord, species, core_db):
    """Return the conversion of coord to a lower-ranked coordinate type.

    Every rank strictly between the rank of ``coord.CoordType`` and the
    rank of the sequence-level coordinate type is visited in increasing
    order. Ranks without a coordinate system are skipped. As soon as
    ``get_coord_conversion`` gives a truthy result it is returned;
    otherwise the outcome of the final attempt (or None when nothing was
    attempted) is returned.
    """
    coord_system = CoordSystem(species=species, core_db=core_db)
    seq_level_name = CoordSystem(species=species, core_db=core_db,
                                 seq_level=True)
    query_rank = coord_system[coord.CoordType].rank
    seq_level_rank = coord_system[seq_level_name].rank

    outcome = None
    rank = query_rank + 1
    while rank < seq_level_rank:
        # look up the coordinate type holding the current rank
        found = None
        for key in coord_system.keys():
            entry = coord_system[key]
            if entry.rank == rank:
                found = entry.name
                break

        rank += 1
        if found is not None:
            outcome = get_coord_conversion(coord, found, core_db)
            if outcome:
                return outcome

    return outcome