def from_location(
    location: Location,
    cds_frames: List[Union[CDSFrame, CDSPhase]],
    sequence_guid: Optional[UUID] = None,
    sequence_name: Optional[str] = None,
    protein_id: Optional[str] = None,
    product: Optional[str] = None,
    qualifiers: Optional[Dict[Hashable, QualifierValue]] = None,
    guid: Optional[UUID] = None,
) -> "CDSInterval":
    """Construct a :class:`CDSInterval` from a location that is not chunk-relative.

    Args:
        location: Location whose blocks provide the CDS start/end coordinates. Its strand and
            parent are forwarded to the constructor. Must not have a ``sequence_chunk`` ancestor.
        cds_frames: Frames or phases, forwarded as ``frames_or_phases``
            (presumably one entry per block -- confirm against the constructor).
        sequence_guid: Optional metadata, forwarded unchanged.
        sequence_name: Optional metadata, forwarded unchanged.
        protein_id: Optional metadata, forwarded unchanged.
        product: Optional metadata, forwarded unchanged.
        qualifiers: Optional metadata, forwarded unchanged.
        guid: Optional metadata, forwarded unchanged.

    Returns:
        The newly constructed :class:`CDSInterval`.

    Raises:
        NoSuchAncestorException: If ``location`` has a ``sequence_chunk`` ancestor. Use
            ``from_chunk_relative_location()`` for such locations.
    """
    if location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
        raise NoSuchAncestorException(
            "Cannot call from_location with a chunk-relative location. Use from_chunk_relative_location()."
        )

    # Collect the coordinates of every block in one pass.
    block_starts = []
    block_ends = []
    for block in location.blocks:
        block_starts.append(block.start)
        block_ends.append(block.end)

    return CDSInterval(
        cds_starts=block_starts,
        cds_ends=block_ends,
        strand=location.strand,
        frames_or_phases=cds_frames,
        sequence_guid=sequence_guid,
        sequence_name=sequence_name,
        protein_id=protein_id,
        product=product,
        qualifiers=qualifiers,
        guid=guid,
        parent_or_seq_chunk_parent=location.parent,
    )
def liftover_to_parent_or_seq_chunk_parent(
    self,
    parent_or_seq_chunk_parent: Parent,
) -> "AbstractInterval":
    """Return a copy of this interval lifted over to a new coordinate system.

    The copy is built by round-tripping this interval through ``to_dict()`` and ``from_dict()``
    with the supplied parent.

    Validation has to happen here in addition to in ``liftover_location_to_seq_chunk_parent()``,
    because at this point the parent of the current interval is still known. Once ``to_dict()``
    has been called that information is lost, and the new parent is applied under the assumption
    that it is valid.

    Args:
        parent_or_seq_chunk_parent: The parent to attach to the lifted copy.

    Returns:
        Whatever ``self.from_dict()`` builds from this interval's dictionary form and the new parent.

    Raises:
        NoSuchAncestorException: If this interval's location has a ``sequence_chunk`` ancestor but
            no ``chromosome`` ancestor, so there is no chromosome coordinate system to lift through.
            NOTE(review): ``parent_or_seq_chunk_parent.first_ancestor_of_type()`` presumably raises
            this too when the new parent has no chromosome ancestor -- confirm.

    The ``ObjectValidation`` checks are expected to raise when the two chromosome parents do not
    match; the exception type is not visible from this method.
    """
    current_location = self.chunk_relative_location
    is_chunk_relative = current_location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK)
    has_chromosome = current_location.has_ancestor_of_type(SequenceType.CHROMOSOME)

    if is_chunk_relative and not has_chromosome:
        raise NoSuchAncestorException(
            "This location does not have a chromosome ancestor of its sequence chunk, "
            "which means it is not possible to lift to a new a chunk through the chromosome coordinates."
        )

    if has_chromosome:
        loc_chrom = current_location.first_ancestor_of_type(SequenceType.CHROMOSOME)
        par_chrom = parent_or_seq_chunk_parent.first_ancestor_of_type(SequenceType.CHROMOSOME)
        # Sequences can only be compared when both chromosome parents actually carry one.
        if loc_chrom.sequence and par_chrom.sequence:
            require_match = ObjectValidation.require_parents_equal_except_location
        else:
            require_match = ObjectValidation.require_parents_equal_except_location_and_sequence
        require_match(loc_chrom, par_chrom)

    return self.from_dict(self.to_dict(), parent_or_seq_chunk_parent)
def first_ancestor_of_type(self, sequence_type: Union[str, SequenceType]) -> Parent:
    """Return the closest ancestor Parent of this location with the given sequence type.

    The search is delegated to this location's parent with ``include_self=True``, so the
    immediate parent itself is a candidate.

    Args:
        sequence_type: The sequence type to look for.

    Returns:
        The Parent found by ``self.parent.first_ancestor_of_type()``.

    Raises:
        NoSuchAncestorException: If this location has no parent. The delegated lookup is
            documented to raise the same exception when no ancestor of the given type exists.
    """
    direct_parent = self.parent
    if direct_parent:
        return direct_parent.first_ancestor_of_type(sequence_type, include_self=True)
    raise NoSuchAncestorException("Location has no parent")
def lift_over_to_sequence(self, sequence: Sequence) -> "Location":
    """Lift this Location up its ancestor hierarchy until it sits on ``sequence``.

    Args:
        sequence: Target sequence. It must be the sequence of some Parent in this
            Location's ancestry (as reported by ``has_ancestor_sequence()``).

    Returns:
        This Location itself when its immediate parent already holds ``sequence``;
        otherwise the result of lifting one level up and repeating the process.

    Raises:
        ValueError: If this Location is not contiguous.
        NoSuchAncestorException: If ``sequence`` is not an ancestor sequence of this Location.
    """
    if not self.is_contiguous:
        raise ValueError("Location must be contiguous")

    if not self.has_ancestor_sequence(sequence):
        message = "\nLocation:\n{}\nDoes not have ancestor:\n{}".format(str(self), sequence.summary())
        raise NoSuchAncestorException(message)

    direct_parent = self.parent
    parent_sequence = direct_parent.sequence
    if parent_sequence and parent_sequence == sequence:
        # Already expressed relative to the requested sequence.
        return self

    # Move one level up; the recursive call re-validates the lifted location.
    one_level_up = direct_parent.lift_child_location_to_parent()
    return one_level_up.lift_over_to_sequence(sequence)
def from_chunk_relative_location(
    location: Location,
    cds_frames: List[Union[CDSFrame, CDSPhase]],
    sequence_guid: Optional[UUID] = None,
    sequence_name: Optional[str] = None,
    protein_id: Optional[str] = None,
    product: Optional[str] = None,
    qualifiers: Optional[Dict[Hashable, QualifierValue]] = None,
    guid: Optional[UUID] = None,
) -> "CDSInterval":
    """Construct a :class:`CDSInterval` from a chunk-relative location.

    A chunk-relative location is a location present on a sequence chunk. It should be built
    by something like this:

    .. code-block:: python

        from inscripta.biocantor.io.parser import seq_chunk_to_parent
        parent = seq_chunk_to_parent('AANAAATGGCGAGCACCTAACCCCCNCC', "NC_000913.3", 222213, 222241)
        loc = SingleInterval(5, 20, Strand.PLUS, parent=parent)

    And then, this can be lifted back to chromosomal coordinates like such:

    .. code-block:: python

        loc.lift_over_to_first_ancestor_of_type("chromosome")

    The CDS coordinates and strand handed to the constructor come from that chromosome-level
    liftover, while the parent handed to the constructor is the parent of the original
    chunk-relative ``location``.

    Args:
        location: Location with a ``sequence_chunk`` ancestor.
        cds_frames: Frames or phases, forwarded as ``frames_or_phases``.
        sequence_guid: Optional metadata, forwarded unchanged.
        sequence_name: Optional metadata, forwarded unchanged.
        protein_id: Optional metadata, forwarded unchanged.
        product: Optional metadata, forwarded unchanged.
        qualifiers: Optional metadata, forwarded unchanged.
        guid: Optional metadata, forwarded unchanged.

    Returns:
        The newly constructed :class:`CDSInterval`.

    Raises:
        NoSuchAncestorException: If ``location`` has no ``sequence_chunk`` ancestor.
    """
    if not location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
        raise NoSuchAncestorException("Must have a sequence chunk in the parent hierarchy.")

    on_chromosome = location.lift_over_to_first_ancestor_of_type(SequenceType.CHROMOSOME)

    # Collect the chromosome-level coordinates of every block in one pass.
    block_starts = []
    block_ends = []
    for block in on_chromosome.blocks:
        block_starts.append(block.start)
        block_ends.append(block.end)

    return CDSInterval(
        cds_starts=block_starts,
        cds_ends=block_ends,
        strand=on_chromosome.strand,
        frames_or_phases=cds_frames,
        sequence_guid=sequence_guid,
        sequence_name=sequence_name,
        protein_id=protein_id,
        product=product,
        qualifiers=qualifiers,
        guid=guid,
        # The chunk-relative parent is kept so the interval stays tied to the chunk.
        parent_or_seq_chunk_parent=location.parent,
    )
def lift_over_to_first_ancestor_of_type(self, sequence_type: Union[str, SequenceType]) -> "Location":
    """Lift this Location to its closest ancestor sequence of the given type.

    Args:
        sequence_type: The sequence type of the ancestor to lift to.

    Returns:
        This Location itself if ``self.parent_type`` equals ``sequence_type``; otherwise the
        result of lifting one level up and repeating the process from there.

    Raises:
        NoSuchAncestorException: If no ancestor with the given type exists.
    """
    # Probe for the ancestor up front so a missing one is reported with a type-specific message.
    try:
        self.first_ancestor_of_type(sequence_type)
    except NoSuchAncestorException:
        message = "Location has no ancestor of type {}".format(sequence_type)
        raise NoSuchAncestorException(message)

    if self.parent_type == sequence_type:
        return self

    one_level_up = self.parent.lift_child_location_to_parent()
    return one_level_up.lift_over_to_first_ancestor_of_type(sequence_type)
def from_location(
    location: Location,
    cds: Optional[CDSInterval] = None,
    qualifiers: Optional[Dict[Hashable, QualifierValue]] = None,
    is_primary_tx: Optional[bool] = None,
    transcript_id: Optional[str] = None,
    transcript_symbol: Optional[str] = None,
    transcript_type: Optional[Biotype] = None,
    sequence_guid: Optional[UUID] = None,
    sequence_name: Optional[str] = None,
    protein_id: Optional[str] = None,
    product: Optional[str] = None,
    guid: Optional[UUID] = None,
    transcript_guid: Optional[UUID] = None,
) -> "TranscriptInterval":
    """Construct a :class:`TranscriptInterval` from a location that is not chunk-relative.

    Args:
        location: Location whose blocks provide the exon start/end coordinates. Its strand and
            parent are forwarded to the constructor. Must not have a ``sequence_chunk`` ancestor.
        cds: Optional CDS. When truthy, its ``_genomic_starts``, ``_genomic_ends`` and ``frames``
            are forwarded as the CDS coordinates and frames; otherwise ``None`` is forwarded.
        qualifiers: Optional metadata, forwarded unchanged.
        is_primary_tx: Optional metadata, forwarded unchanged.
        transcript_id: Optional metadata, forwarded unchanged.
        transcript_symbol: Optional metadata, forwarded unchanged.
        transcript_type: Either a :class:`Biotype` member, which is forwarded as-is, or a
            biotype name, which is looked up with ``Biotype[name]``. Falsy values become ``None``.
        sequence_guid: Optional metadata, forwarded unchanged.
        sequence_name: Optional metadata, forwarded unchanged.
        protein_id: Optional metadata, forwarded unchanged.
        product: Optional metadata, forwarded unchanged.
        guid: Optional metadata, forwarded unchanged.
        transcript_guid: Optional metadata, forwarded unchanged.

    Returns:
        The newly constructed :class:`TranscriptInterval`.

    Raises:
        NoSuchAncestorException: If ``location`` has a ``sequence_chunk`` ancestor. Use
            ``from_chunk_relative_location()`` for such locations.
        KeyError: If ``transcript_type`` is a name that ``Biotype`` does not define
            (assuming standard ``Enum`` lookup semantics).
    """
    if location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
        raise NoSuchAncestorException(
            "Cannot call from_location with a chunk-relative location. Use from_chunk_relative_location()."
        )

    # ``Biotype[...]`` is a lookup by member name. Feeding it an actual Biotype member (which is
    # what the annotation asks callers to pass) fails on a standard Enum, so members are passed
    # through untouched and only names are looked up.
    if not transcript_type:
        biotype = None
    elif isinstance(transcript_type, Biotype):
        biotype = transcript_type
    else:
        biotype = Biotype[transcript_type]

    blocks = location.blocks
    return TranscriptInterval(
        exon_starts=[block.start for block in blocks],
        exon_ends=[block.end for block in blocks],
        strand=location.strand,
        cds_starts=cds._genomic_starts if cds else None,
        cds_ends=cds._genomic_ends if cds else None,
        cds_frames=cds.frames if cds else None,
        guid=guid,
        transcript_guid=transcript_guid,
        qualifiers=qualifiers,
        is_primary_tx=is_primary_tx,
        transcript_id=transcript_id,
        transcript_symbol=transcript_symbol,
        transcript_type=biotype,
        sequence_name=sequence_name,
        sequence_guid=sequence_guid,
        protein_id=protein_id,
        product=product,
        parent_or_seq_chunk_parent=location.parent,
    )
def to_gff(
    self,
    parent: Optional[str] = None,
    parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
    chromosome_relative_coordinates: bool = True,
    raise_on_reserved_attributes: Optional[bool] = True,
) -> Iterable[GFFRow]:
    """Yield one GFF row per block of this CDS.

    The additional qualifiers are used when writing a hierarchical relationship back to files.
    GFF files are easier to work with if the children features have the qualifiers of their parents.

    Each row gets the ID ``<guid>-<n>`` (``n`` counting blocks from 1), ``protein_id`` as its name,
    a start shifted by +1 relative to the stored block start, and the phase derived from the
    block's frame via ``to_phase()``.

    Args:
        parent: ID of the parent feature of this CDS.
        parent_qualifiers: Passed to ``export_qualifiers()`` to pull in the parent's qualifiers.
        chromosome_relative_coordinates: If ``True``, use ``_genomic_starts`` / ``_genomic_ends``
            and ``frames``. If ``False``, use ``chunk_relative_blocks`` and
            ``chunk_relative_frames``, which requires a ``sequence_chunk`` ancestor.
        raise_on_reserved_attributes: Forwarded to :class:`GFFAttributes`. If ``True``, then GFF3
            reserved attributes such as ``ID`` and ``Name`` present in the qualifiers will lead to
            an exception and not a warning.

    Yields:
        :class:`~biocantor.io.gff3.rows.GFFRow`

    Raises:
        NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
            ``sequence_chunk`` ancestor type. As this is a generator, the error surfaces when
            iteration starts.

    NOTE(review): earlier documentation also listed ``GFF3MissingSequenceNameError``, but this
    method does not check ``sequence_name`` itself -- confirm whether a callee enforces it.
    """
    if not (chromosome_relative_coordinates or self.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK)):
        raise NoSuchAncestorException(
            "Cannot export GFF in relative coordinates without a sequence_chunk ancestor."
        )

    shared_qualifiers = self.export_qualifiers(parent_qualifiers)
    id_prefix = str(self.guid)

    if chromosome_relative_coordinates:
        coordinates = zip(self._genomic_starts, self._genomic_ends)
        block_frames = self.frames
    else:
        coordinates = [[x.start, x.end] for x in self.chunk_relative_blocks]
        block_frames = self.chunk_relative_frames

    for block_number, ((block_start, block_end), block_frame) in enumerate(zip(coordinates, block_frames), 1):
        block_attributes = GFFAttributes(
            id=f"{id_prefix}-{block_number}",
            qualifiers=shared_qualifiers,
            name=self.protein_id,
            parent=parent,
            raise_on_reserved_attributes=raise_on_reserved_attributes,
        )
        yield GFFRow(
            self.sequence_name,
            GFF_SOURCE,
            BioCantorFeatureTypes.CDS,
            block_start + 1,
            block_end,
            NULL_COLUMN,
            self.strand,
            block_frame.to_phase(),
            block_attributes,
        )
def to_gff(
    self,
    parent: Optional[str] = None,
    parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
    chromosome_relative_coordinates: bool = True,
    raise_on_reserved_attributes: Optional[bool] = True,
) -> Iterable[GFFRow]:
    """Yield the GFF rows for this transcript: the transcript, its exons, then its CDS.

    The additional qualifiers are used when writing a hierarchical relationship back to files.
    GFF files are easier to work with if the children features have the qualifiers of their parents.

    The transcript row uses this transcript's guid as its ID. Exon rows use
    ``exon-<guid>-<n>`` (``n`` counting from 1) with the transcript as their parent. If a CDS is
    present, its own ``to_gff()`` rows follow, also parented to the transcript.

    Args:
        parent: ID of the Parent of this transcript.
        parent_qualifiers: Passed to ``export_qualifiers()`` to pull in the parent's qualifiers.
        chromosome_relative_coordinates: If ``True``, output chromosome-relative coordinates. If
            ``False``, output chunk-relative coordinates, which requires a ``sequence_chunk``
            ancestor.
        raise_on_reserved_attributes: If ``True``, then GFF3 reserved attributes such as ``ID`` and
            ``Name`` present in the qualifiers will lead to an exception and not a warning.

    Yields:
        :class:`~biocantor.io.gff3.rows.GFFRow`

    Raises:
        GFF3MissingSequenceNameError: If there is no sequence name associated with this transcript.
        NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
            ``sequence_chunk`` ancestor type.

    As this is a generator, both errors surface when iteration starts.
    """
    if not self.sequence_name:
        raise GFF3MissingSequenceNameError("Must have sequence names to export to GFF3.")

    if not (chromosome_relative_coordinates or self.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK)):
        raise NoSuchAncestorException(
            "Cannot export GFF in relative coordinates without a sequence_chunk ancestor."
        )

    shared_qualifiers = self.export_qualifiers(parent_qualifiers)
    tx_id = str(self.guid)

    # transcript feature
    tx_attributes = GFFAttributes(
        id=tx_id,
        qualifiers=shared_qualifiers,
        name=self.transcript_symbol,
        parent=parent,
        raise_on_reserved_attributes=raise_on_reserved_attributes,
    )
    if chromosome_relative_coordinates:
        tx_start, tx_end = self.start, self.end
    else:
        tx_start, tx_end = self.chunk_relative_start, self.chunk_relative_end
    yield GFFRow(
        self.sequence_name,
        GFF_SOURCE,
        BioCantorFeatureTypes.TRANSCRIPT,
        tx_start + 1,
        tx_end,
        NULL_COLUMN,
        self.strand,
        CDSPhase.NONE,
        tx_attributes,
    )

    # exon features: the qualifiers are shared, only the ID changes from row to row
    if chromosome_relative_coordinates:
        exon_coordinates = zip(self._genomic_starts, self._genomic_ends)
    else:
        exon_coordinates = [[x.start, x.end] for x in self.relative_blocks]

    for exon_number, (exon_start, exon_end) in enumerate(exon_coordinates, 1):
        exon_attributes = GFFAttributes(
            id=f"exon-{tx_id}-{exon_number}",
            qualifiers=shared_qualifiers,
            name=self.transcript_symbol,
            parent=tx_id,
            raise_on_reserved_attributes=raise_on_reserved_attributes,
        )
        yield GFFRow(
            self.sequence_name,
            GFF_SOURCE,
            BioCantorFeatureTypes.EXON,
            exon_start + 1,
            exon_end,
            NULL_COLUMN,
            self.strand,
            CDSPhase.NONE,
            exon_attributes,
        )

    # CDS rows, if any, are produced by the CDS itself and parented to this transcript
    if self.cds:
        yield from self.cds.to_gff(
            parent=tx_id,
            parent_qualifiers=shared_qualifiers,
            chromosome_relative_coordinates=chromosome_relative_coordinates,
            raise_on_reserved_attributes=raise_on_reserved_attributes,
        )
def liftover_location_to_seq_chunk_parent(
    location: Location,
    parent_or_seq_chunk_parent: Optional[Parent] = None,
) -> Location:
    """
    Lift a location on to the coordinate system of a chromosome parent or a sequence chunk parent.

    BioCantor supports constructing any of the interval classes from a subset of the chromosome. In order to be able
    to set up the coordinate relationship and successfully pull down sequence, this function lifts the coordinates
    from the original annotation object on to this new coordinate system.

    .. code:: python

        parent_1_15 = Parent(
            sequence=Sequence(
                genome2[1:15],
                Alphabet.NT_EXTENDED_GAPPED,
                type=SequenceType.SEQUENCE_CHUNK,
                parent=Parent(
                    location=SingleInterval(1, 15, Strand.PLUS,
                                            parent=Parent(id="genome_1_15", sequence_type=SequenceType.CHROMOSOME))
                ),
            )
        )

    Alternatively, if the sequence is coming straight from a file, it will be a :class:`Parent` with a
    :class:`Sequence` attached:

    .. code:: python

        parent = Parent(id="chr1", sequence=Sequence(genome, Alphabet.NT_STRICT, type=SequenceType.CHROMOSOME))

    This convenience function detects which kind of parent is given, and sets up the appropriate location.

    This function also handles the case where the ``location`` argument is already chunk-relative. If this is the
    case, the ``location`` object is first lifted back to its chromosomal coordinates, then lifted back down
    on to this new chunk.

    Args:
        location: A location object, likely produced by :meth:`initialize_location()`. Could also be the location
            of an existing AbstractInterval subclass, such as when the method
            ``liftover_interval_to_parent_or_seq_chunk_parent()`` is called.
        parent_or_seq_chunk_parent: An optional parent, either as a full chromosome or as a sequence chunk. If
            not provided, this function is a no-op.

    Returns:
        A :class:`Location` object: ``location`` itself when no parent is given, an ``EmptyLocation`` when
        the location does not overlap the supplied sequence chunk, and otherwise the location expressed
        relative to ``parent_or_seq_chunk_parent``.

    Raises:
        NoSuchAncestorException: If ``location`` has a ``sequence_chunk`` ancestor, but no ``chromosome`` ancestor.
            Such a relationship is required to lift from one chunk to a new chunk. Also raised if
            ``parent_or_seq_chunk_parent`` has a ``sequence_chunk`` ancestor but no ``chromosome`` ancestor.
        NullSequenceException: If the ``sequence_chunk`` ancestor of ``parent_or_seq_chunk_parent`` has no
            sequence.

    The ``ObjectValidation`` checks are expected to raise when the chromosome of ``location`` and the
    chromosome of ``parent_or_seq_chunk_parent`` do not match; the exception type is not visible from
    this function.
    """
    if parent_or_seq_chunk_parent is None:
        return location

    # if we are already a subset, we need to first lift back to genomic coordinates before lifting to this chunk
    if location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
        if not location.has_ancestor_of_type(SequenceType.CHROMOSOME):
            raise NoSuchAncestorException(
                "This location does not have a chromosome ancestor of its sequence chunk, "
                "which means it is not possible to lift to a new a chunk through the chromosome coordinates."
            )
        # ensure that both chromosomes are the same chromosome
        loc_chrom = location.first_ancestor_of_type(SequenceType.CHROMOSOME)
        par_chrom = parent_or_seq_chunk_parent.first_ancestor_of_type(SequenceType.CHROMOSOME)
        # sequences are only part of the comparison when both chromosome parents carry one
        if loc_chrom.sequence and par_chrom.sequence:
            ObjectValidation.require_parents_equal_except_location(loc_chrom, par_chrom)
        else:
            ObjectValidation.require_parents_equal_except_location_and_sequence(loc_chrom, par_chrom)

        # from here on, ``location`` is in chromosome coordinates and no longer tied to its old chunk
        location = location.lift_over_to_first_ancestor_of_type(SequenceType.CHROMOSOME).reset_parent(
            parent_or_seq_chunk_parent.parent
        )

    if parent_or_seq_chunk_parent.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
        if not parent_or_seq_chunk_parent.has_ancestor_of_type(SequenceType.CHROMOSOME):
            raise NoSuchAncestorException(
                "Must have a chromosome in the hierarchy if a sequence chunk is provided."
            )

        chunk_parent = parent_or_seq_chunk_parent.first_ancestor_of_type(SequenceType.SEQUENCE_CHUNK)
        if not chunk_parent.sequence:
            raise NullSequenceException("Must have a sequence if a sequence chunk parent is provided.")

        # re-parent the location to the chunk's own parent before mapping it on to the chunk
        location = location.reset_parent(chunk_parent.parent)
        sequence_chunk = chunk_parent.sequence
        # do not optimize blocks here -- this retains adjacent CDS intervals
        try:
            interval_location_rel_to_chunk = sequence_chunk.location_on_parent.parent_to_relative_location(
                location, optimize_blocks=False
            )
        except LocationOverlapException:
            # the positions associated with this Location do not overlap the sequence chunk. However,
            # the chromosome location information can still be retained, but there is inherently no sequence
            # information.
            return EmptyLocation()
        interval_rel_to_chunk = interval_location_rel_to_chunk.reset_parent(parent_or_seq_chunk_parent)
        return interval_rel_to_chunk

    # since this is a whole genome (or something unknown), we don't need to lift anything up
    return location.reset_parent(parent_or_seq_chunk_parent)