示例#1
0
    def from_location(
        location: Location,
        cds_frames: List[Union[CDSFrame, CDSPhase]],
        sequence_guid: Optional[UUID] = None,
        sequence_name: Optional[str] = None,
        protein_id: Optional[str] = None,
        product: Optional[str] = None,
        qualifiers: Optional[Dict[Hashable, QualifierValue]] = None,
        guid: Optional[UUID] = None,
    ) -> "CDSInterval":
        """Build a :class:`CDSInterval` from a location object plus frame/phase information.

        Args:
            location: Location whose blocks supply the CDS start and end coordinates. It must not have a
                ``sequence_chunk`` ancestor.
            cds_frames: CDSFrame or CDSPhase values, forwarded as ``frames_or_phases``.
            sequence_guid: Optional metadata forwarded unchanged to the constructor.
            sequence_name: Optional metadata forwarded unchanged to the constructor.
            protein_id: Optional metadata forwarded unchanged to the constructor.
            product: Optional metadata forwarded unchanged to the constructor.
            qualifiers: Optional metadata forwarded unchanged to the constructor.
            guid: Optional metadata forwarded unchanged to the constructor.

        Returns:
            A new :class:`CDSInterval` whose parent is ``location.parent``.

        Raises:
            NoSuchAncestorException: If ``location`` has a ``sequence_chunk`` ancestor; such locations must go
                through ``from_chunk_relative_location()`` instead.
        """
        is_chunk_relative = location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK)
        if is_chunk_relative:
            raise NoSuchAncestorException(
                "Cannot call from_location with a chunk-relative location. Use from_chunk_relative_location()."
            )

        blocks = location.blocks
        block_starts = [block.start for block in blocks]
        block_ends = [block.end for block in blocks]
        return CDSInterval(
            cds_starts=block_starts,
            cds_ends=block_ends,
            strand=location.strand,
            frames_or_phases=cds_frames,
            sequence_guid=sequence_guid,
            sequence_name=sequence_name,
            protein_id=protein_id,
            product=product,
            qualifiers=qualifiers,
            guid=guid,
            parent_or_seq_chunk_parent=location.parent,
        )
示例#2
0
    def liftover_to_parent_or_seq_chunk_parent(
        self,
        parent_or_seq_chunk_parent: Parent,
    ) -> "AbstractInterval":
        """
        Return a copy of this interval lifted over to a new coordinate system.

        If this interval is already in chunk-relative coordinates, there *must* be a Parent of type
        "chromosome" somewhere in its ancestry, and that Parent must match the chromosome ancestor of the
        supplied parent except for location information (and except for sequence, when either side carries
        no sequence).

        Validation has to happen here in addition to in ``liftover_location_to_seq_chunk_parent()``, because
        at this point the parent of this current interval is still known. Once the ``to_dict()`` operation is
        performed, this information is lost, and the new parent is applied under the assumption that it is valid.

        Args:
            parent_or_seq_chunk_parent: The parent defining the new coordinate system.

        Returns:
            The result of ``from_dict(to_dict(), parent_or_seq_chunk_parent)``.

        Raises:
            NoSuchAncestorException: If this interval has a ``sequence_chunk`` ancestor but no ``chromosome``
                ancestor.
        """
        current_location = self.chunk_relative_location
        on_chunk = current_location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK)
        on_chromosome = current_location.has_ancestor_of_type(SequenceType.CHROMOSOME)

        if on_chunk and not on_chromosome:
            raise NoSuchAncestorException(
                "This location does not have a chromosome ancestor of its sequence chunk, "
                "which means it is not possible to lift to a new a chunk through the chromosome coordinates."
            )

        if on_chromosome:
            own_chromosome = current_location.first_ancestor_of_type(SequenceType.CHROMOSOME)
            new_chromosome = parent_or_seq_chunk_parent.first_ancestor_of_type(SequenceType.CHROMOSOME)
            # Sequences can only be compared when both chromosomes actually carry one.
            require_matching = (
                ObjectValidation.require_parents_equal_except_location
                if own_chromosome.sequence and new_chromosome.sequence
                else ObjectValidation.require_parents_equal_except_location_and_sequence
            )
            require_matching(own_chromosome, new_chromosome)

        serialized = self.to_dict()
        return self.from_dict(serialized, parent_or_seq_chunk_parent)
示例#3
0
 def first_ancestor_of_type(
         self, sequence_type: Union[str, SequenceType]) -> Parent:
     """Find the closest ancestor (parent, parent of parent, etc.) of this location with the given type.

     The search is delegated to this location's parent, with the parent itself included as a candidate.

     Args:
         sequence_type: The sequence type to look for.

     Returns:
         The Parent object representing the closest matching ancestor.

     Raises:
         NoSuchAncestorException: If this location has no parent, or if no ancestor with the given type
             exists.
     """
     parent = self.parent
     if parent:
         return parent.first_ancestor_of_type(sequence_type, include_self=True)
     raise NoSuchAncestorException("Location has no parent")
示例#4
0
 def lift_over_to_sequence(self, sequence: Sequence) -> "Location":
     """Lift this Location up its ancestor hierarchy until it sits directly on ``sequence``.

     Args:
         sequence: Must be equal to the sequence attribute of some Parent in the ancestor hierarchy of
             this Location.

     Returns:
         This Location itself when its direct parent already carries ``sequence``; otherwise the result of
         lifting to the grandparent and repeating the lookup from there.

     Raises:
         ValueError: If this Location is not contiguous.
         NoSuchAncestorException: If ``sequence`` is not an ancestor sequence of this Location.
     """
     if not self.is_contiguous:
         raise ValueError("Location must be contiguous")
     if not self.has_ancestor_sequence(sequence):
         message = f"\nLocation:\n{self!s}\nDoes not have ancestor:\n{sequence.summary()}"
         raise NoSuchAncestorException(message)

     parent_sequence = self.parent.sequence
     if parent_sequence and parent_sequence == sequence:
         return self
     # Not there yet: move one level up the hierarchy and try again from that location.
     return self.parent.lift_child_location_to_parent().lift_over_to_sequence(sequence)
示例#5
0
    def from_chunk_relative_location(
        location: Location,
        cds_frames: List[Union[CDSFrame, CDSPhase]],
        sequence_guid: Optional[UUID] = None,
        sequence_name: Optional[str] = None,
        protein_id: Optional[str] = None,
        product: Optional[str] = None,
        qualifiers: Optional[Dict[Hashable, QualifierValue]] = None,
        guid: Optional[UUID] = None,
    ) -> "CDSInterval":
        """
        Allows construction of a :class:`CDSInterval` from a chunk-relative location, i.e. a location whose
        parent hierarchy contains a sequence chunk.

        Such a location could be built by something like this:

        .. code-block:: python

            from inscripta.biocantor.io.parser import seq_chunk_to_parent
            parent = seq_chunk_to_parent('AANAAATGGCGAGCACCTAACCCCCNCC', "NC_000913.3", 222213, 222241)
            loc = SingleInterval(5, 20, Strand.PLUS, parent=parent)

        And then, this can be lifted back to chromosomal coordinates like such:

        .. code-block:: python

            loc.lift_over_to_first_ancestor_of_type("chromosome")

        Args:
            location: A location with a ``sequence_chunk`` ancestor.
            cds_frames: CDSFrame or CDSPhase values, forwarded as ``frames_or_phases``.
            sequence_guid: Optional metadata forwarded unchanged to the constructor.
            sequence_name: Optional metadata forwarded unchanged to the constructor.
            protein_id: Optional metadata forwarded unchanged to the constructor.
            product: Optional metadata forwarded unchanged to the constructor.
            qualifiers: Optional metadata forwarded unchanged to the constructor.
            guid: Optional metadata forwarded unchanged to the constructor.

        Returns:
            A new :class:`CDSInterval`.

        Raises:
            NoSuchAncestorException: If ``location`` has no ``sequence_chunk`` ancestor.
        """
        if not location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
            raise NoSuchAncestorException("Must have a sequence chunk in the parent hierarchy.")

        lifted = location.lift_over_to_first_ancestor_of_type(SequenceType.CHROMOSOME)
        lifted_blocks = lifted.blocks
        # Coordinates and strand come from the chromosome-level location, while the parent handed to the
        # constructor is the parent of the original (chunk-relative) location.
        return CDSInterval(
            cds_starts=[block.start for block in lifted_blocks],
            cds_ends=[block.end for block in lifted_blocks],
            strand=lifted.strand,
            frames_or_phases=cds_frames,
            sequence_guid=sequence_guid,
            sequence_name=sequence_name,
            protein_id=protein_id,
            product=product,
            qualifiers=qualifiers,
            guid=guid,
            parent_or_seq_chunk_parent=location.parent,
        )
示例#6
0
 def lift_over_to_first_ancestor_of_type(
         self, sequence_type: Union[str, SequenceType]) -> "Location":
     """Lift this Location to its closest ancestor sequence (parent, parent of parent, etc.) of a given type.

     Args:
         sequence_type: The sequence type of the ancestor to lift to.

     Returns:
         This Location itself if the immediate parent already has the given type; otherwise a new Location
         obtained by lifting one level at a time until that is the case.

     Raises:
         NoSuchAncestorException: If no ancestor with the given type exists.
     """
     # Probe for the ancestor first so that a missing one is reported with a message naming the type.
     try:
         self.first_ancestor_of_type(sequence_type)
     except NoSuchAncestorException:
         raise NoSuchAncestorException(f"Location has no ancestor of type {sequence_type}")

     if self.parent_type == sequence_type:
         return self
     return self.parent.lift_child_location_to_parent().lift_over_to_first_ancestor_of_type(sequence_type)
示例#7
0
    def from_location(
        location: Location,
        cds: Optional[CDSInterval] = None,
        qualifiers: Optional[Dict[Hashable, QualifierValue]] = None,
        is_primary_tx: Optional[bool] = None,
        transcript_id: Optional[str] = None,
        transcript_symbol: Optional[str] = None,
        transcript_type: Optional[Biotype] = None,
        sequence_guid: Optional[UUID] = None,
        sequence_name: Optional[str] = None,
        protein_id: Optional[str] = None,
        product: Optional[str] = None,
        guid: Optional[UUID] = None,
        transcript_guid: Optional[UUID] = None,
    ) -> "TranscriptInterval":
        """Build a :class:`TranscriptInterval` from a location object and optional CDS and metadata.

        Args:
            location: Location whose blocks supply the exon start and end coordinates. It must not have a
                ``sequence_chunk`` ancestor.
            cds: Optional CDS; when given, its genomic starts, genomic ends and frames are used for the
                CDS portion of the transcript.
            qualifiers: Optional metadata forwarded unchanged to the constructor.
            is_primary_tx: Optional metadata forwarded unchanged to the constructor.
            transcript_id: Optional metadata forwarded unchanged to the constructor.
            transcript_symbol: Optional metadata forwarded unchanged to the constructor.
            transcript_type: Either a :class:`Biotype` member, which is used as-is, or the name of a member,
                which is looked up with ``Biotype[name]``. Falsy values result in no transcript type.
            sequence_guid: Optional metadata forwarded unchanged to the constructor.
            sequence_name: Optional metadata forwarded unchanged to the constructor.
            protein_id: Optional metadata forwarded unchanged to the constructor.
            product: Optional metadata forwarded unchanged to the constructor.
            guid: Optional metadata forwarded unchanged to the constructor.
            transcript_guid: Optional metadata forwarded unchanged to the constructor.

        Returns:
            A new :class:`TranscriptInterval` whose parent is ``location.parent``.

        Raises:
            NoSuchAncestorException: If ``location`` has a ``sequence_chunk`` ancestor; such locations must go
                through ``from_chunk_relative_location()`` instead.
            KeyError: If ``transcript_type`` is a name that ``Biotype`` does not define.
        """
        if location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
            raise NoSuchAncestorException(
                "Cannot call from_location with a chunk-relative location. Use from_chunk_relative_location()."
            )

        # The signature advertises a Biotype, but ``Biotype[...]`` is a lookup by member *name*; handing it
        # an actual member would fail. Accept members directly and keep the name lookup for everything else.
        if not transcript_type:
            biotype = None
        elif isinstance(transcript_type, Biotype):
            biotype = transcript_type
        else:
            biotype = Biotype[transcript_type]

        return TranscriptInterval(
            exon_starts=[x.start for x in location.blocks],
            exon_ends=[x.end for x in location.blocks],
            strand=location.strand,
            cds_starts=cds._genomic_starts if cds else None,
            cds_ends=cds._genomic_ends if cds else None,
            cds_frames=cds.frames if cds else None,
            guid=guid,
            transcript_guid=transcript_guid,
            qualifiers=qualifiers,
            is_primary_tx=is_primary_tx,
            transcript_id=transcript_id,
            transcript_symbol=transcript_symbol,
            transcript_type=biotype,
            sequence_name=sequence_name,
            sequence_guid=sequence_guid,
            protein_id=protein_id,
            product=product,
            parent_or_seq_chunk_parent=location.parent,
        )
示例#8
0
    def to_gff(
        self,
        parent: Optional[str] = None,
        parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
        chromosome_relative_coordinates: bool = True,
        raise_on_reserved_attributes: Optional[bool] = True,
    ) -> Iterable[GFFRow]:
        """Writes a GFF format list of lists for this CDS, one row per CDS block.

        The additional qualifiers are used when writing a hierarchical relationship back to files. GFF files
        are easier to work with if the children features have the qualifiers of their parents.

        This is a generator, so the exceptions below are raised when iteration starts, not when the method
        is called.

        Args:
            parent: ID of the Parent of this CDS.
            parent_qualifiers: Directly pull qualifiers in from this dictionary.
            chromosome_relative_coordinates: Output GFF in chromosome-relative coordinates? If ``False``,
                chunk-relative blocks and frames are used, which requires a ``sequence_chunk`` ancestor.
            raise_on_reserved_attributes: If ``True``, then GFF3 reserved attributes such as ``ID`` and ``Name`` present
                in the qualifiers will lead to an exception and not a warning.

        Yields:
            :class:`~biocantor.io.gff3.rows.GFFRow`

        Raises:
            NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
            ``sequence_chunk`` ancestor type.
            GFF3MissingSequenceNameError: If there is no sequence name associated with this CDS.
        """

        # A GFF3 row cannot be written without a sequence name; fail up front instead of emitting rows
        # with an empty first column.
        if not self.sequence_name:
            raise GFF3MissingSequenceNameError("Must have sequence names to export to GFF3.")

        if not chromosome_relative_coordinates and not self.has_ancestor_of_type(
                SequenceType.SEQUENCE_CHUNK):
            raise NoSuchAncestorException(
                "Cannot export GFF in relative coordinates without a sequence_chunk ancestor."
            )

        qualifiers = self.export_qualifiers(parent_qualifiers)

        cds_guid = str(self.guid)

        if chromosome_relative_coordinates:
            cds_blocks = zip(self._genomic_starts, self._genomic_ends)
            frames = self.frames
        else:
            cds_blocks = [[x.start, x.end] for x in self.chunk_relative_blocks]
            frames = self.chunk_relative_frames

        # Blocks are numbered from 1 so that every row gets a distinct ID derived from the CDS guid.
        for i, ((start, end), frame) in enumerate(zip(cds_blocks, frames), 1):
            attributes = GFFAttributes(
                id=f"{cds_guid}-{i}",
                qualifiers=qualifiers,
                name=self.protein_id,
                parent=parent,
                raise_on_reserved_attributes=raise_on_reserved_attributes,
            )
            row = GFFRow(
                self.sequence_name,
                GFF_SOURCE,
                BioCantorFeatureTypes.CDS,
                # GFF3 start coordinates are 1-based; presumably the stored starts are 0-based.
                start + 1,
                end,
                NULL_COLUMN,
                self.strand,
                frame.to_phase(),
                attributes,
            )
            yield row
示例#9
0
    def to_gff(
        self,
        parent: Optional[str] = None,
        parent_qualifiers: Optional[Dict[Hashable, Set[str]]] = None,
        chromosome_relative_coordinates: bool = True,
        raise_on_reserved_attributes: Optional[bool] = True,
    ) -> Iterable[GFFRow]:
        """Yield the GFF rows describing this transcript.

        The rows come in a fixed order: one transcript row, then one exon row per block, then (if this
        transcript has a CDS) whatever rows the CDS produces with this transcript as its parent.

        The additional qualifiers are used when writing a hierarchical relationship back to files. GFF files
        are easier to work with if the children features have the qualifiers of their parents.

        This is a generator, so the exceptions below are raised when iteration starts, not when the method
        is called.

        Args:
            parent: ID of the Parent of this transcript.
            parent_qualifiers: Directly pull qualifiers in from this dictionary.
            chromosome_relative_coordinates: Output GFF in chromosome-relative coordinates? If ``False``,
                chunk-relative coordinates are used, which requires a ``sequence_chunk`` ancestor.
            raise_on_reserved_attributes: If ``True``, then GFF3 reserved attributes such as ``ID`` and ``Name`` present
                in the qualifiers will lead to an exception and not a warning.

        Yields:
            :class:`~biocantor.io.gff3.rows.GFFRow`

        Raises:
            NoSuchAncestorException: If ``chromosome_relative_coordinates`` is ``False`` but there is no
            ``sequence_chunk`` ancestor type.
            GFF3MissingSequenceNameError: If there are no sequence names associated with this transcript.
        """

        if not self.sequence_name:
            raise GFF3MissingSequenceNameError("Must have sequence names to export to GFF3.")

        if not (chromosome_relative_coordinates or self.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK)):
            raise NoSuchAncestorException(
                "Cannot export GFF in relative coordinates without a sequence_chunk ancestor."
            )

        exported_qualifiers = self.export_qualifiers(parent_qualifiers)
        transcript_row_id = str(self.guid)

        # --- transcript feature ---
        transcript_attributes = GFFAttributes(
            id=transcript_row_id,
            qualifiers=exported_qualifiers,
            name=self.transcript_symbol,
            parent=parent,
            raise_on_reserved_attributes=raise_on_reserved_attributes,
        )
        if chromosome_relative_coordinates:
            transcript_start = self.start
            transcript_end = self.end
        else:
            transcript_start = self.chunk_relative_start
            transcript_end = self.chunk_relative_end

        yield GFFRow(
            self.sequence_name,
            GFF_SOURCE,
            BioCantorFeatureTypes.TRANSCRIPT,
            transcript_start + 1,
            transcript_end,
            NULL_COLUMN,
            self.strand,
            CDSPhase.NONE,
            transcript_attributes,
        )

        # --- exon features: same qualifiers, a fresh ID per exon, parented to the transcript row ---
        if chromosome_relative_coordinates:
            exon_bounds = zip(self._genomic_starts, self._genomic_ends)
        else:
            exon_bounds = [[block.start, block.end] for block in self.relative_blocks]

        for exon_number, (exon_start, exon_end) in enumerate(exon_bounds, 1):
            exon_attributes = GFFAttributes(
                id=f"exon-{transcript_row_id}-{exon_number}",
                qualifiers=exported_qualifiers,
                name=self.transcript_symbol,
                parent=transcript_row_id,
                raise_on_reserved_attributes=raise_on_reserved_attributes,
            )
            yield GFFRow(
                self.sequence_name,
                GFF_SOURCE,
                BioCantorFeatureTypes.EXON,
                exon_start + 1,
                exon_end,
                NULL_COLUMN,
                self.strand,
                CDSPhase.NONE,
                exon_attributes,
            )

        # --- CDS features, if any, are delegated to the CDS itself ---
        if not self.cds:
            return
        yield from self.cds.to_gff(
            chromosome_relative_coordinates=chromosome_relative_coordinates,
            parent_qualifiers=exported_qualifiers,
            parent=transcript_row_id,
            raise_on_reserved_attributes=raise_on_reserved_attributes,
        )
示例#10
0
    def liftover_location_to_seq_chunk_parent(
        location: Location,
        parent_or_seq_chunk_parent: Optional[Parent] = None,
    ) -> Location:
        """
        BioCantor supports constructing any of the interval classes from a subset of the chromosome. In order to
        be able to set up the coordinate relationship and successfully pull down sequence, this function
        lifts the coordinates from the original annotation object on to this new coordinate system.

        A parent that represents a subset (chunk) of a chromosome could be built like this:

        .. code:: python

            parent_1_15 = Parent(
                sequence=Sequence(
                    genome2[1:15],
                    Alphabet.NT_EXTENDED_GAPPED,
                    type=SequenceType.SEQUENCE_CHUNK,
                    parent=Parent(
                        location=SingleInterval(1, 15, Strand.PLUS,
                                               parent=Parent(id="genome_1_15", sequence_type=SequenceType.CHROMOSOME))
                    ),
                )
            )

        Alternatively, if the sequence is coming straight from a file, it will be a :class:`Parent` with a
        :class:`Sequence` attached:

        .. code:: python

            parent = Parent(id="chr1", sequence=Sequence(genome, Alphabet.NT_STRICT, type=SequenceType.CHROMOSOME))

        This convenience function detects which kind of parent is given, and sets up the appropriate location.

        This function also handles the case where the ``location`` argument is already chunk-relative. If this is the
        case, the ``location`` object is first lifted back to its chromosomal coordinates, then lifted back down
        on to this new chunk.

        Args:
            location: A location object, likely produced by :meth:`initialize_location()`. Could also be the location
                of an existing AbstractInterval subclass, such as when the method
                ``liftover_interval_to_parent_or_seq_chunk_parent()`` is called.
            parent_or_seq_chunk_parent: An optional parent, either as a full chromosome or as a sequence chunk. If
                not provided, this function is a no-op.

        Returns:
            A :class:`Location` object. This is an :class:`EmptyLocation` when the new parent is a sequence
            chunk that ``location`` does not overlap.

        Raises:
            NoSuchAncestorException: If ``location`` has a ``sequence_chunk`` ancestor, but no ``chromosome``
                ancestor (such a relationship is required to lift from one chunk to a new chunk), or if
                ``parent_or_seq_chunk_parent`` has a ``sequence_chunk`` ancestor but no ``chromosome`` ancestor.
            NullSequenceException: If the ``sequence_chunk`` ancestor of ``parent_or_seq_chunk_parent`` has
                no sequence.

        Additionally, when ``location`` is already chunk-relative, its chromosome is compared with the chromosome
        of the new parent through ``ObjectValidation``; presumably that raises a validation error on mismatch.
        """
        if parent_or_seq_chunk_parent is None:
            return location

        new_parent = parent_or_seq_chunk_parent

        # A location that already lives on a chunk must first travel back up to genomic coordinates.
        if location.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
            if not location.has_ancestor_of_type(SequenceType.CHROMOSOME):
                raise NoSuchAncestorException(
                    "This location does not have a chromosome ancestor of its sequence chunk, "
                    "which means it is not possible to lift to a new a chunk through the chromosome coordinates."
                )

            # Both hierarchies have to describe the same chromosome; sequences are only compared when
            # both sides actually carry one.
            old_chromosome = location.first_ancestor_of_type(SequenceType.CHROMOSOME)
            new_chromosome = new_parent.first_ancestor_of_type(SequenceType.CHROMOSOME)
            require_matching = (
                ObjectValidation.require_parents_equal_except_location
                if old_chromosome.sequence and new_chromosome.sequence
                else ObjectValidation.require_parents_equal_except_location_and_sequence
            )
            require_matching(old_chromosome, new_chromosome)

            chromosome_level = location.lift_over_to_first_ancestor_of_type(SequenceType.CHROMOSOME)
            location = chromosome_level.reset_parent(new_parent.parent)

        if not new_parent.has_ancestor_of_type(SequenceType.SEQUENCE_CHUNK):
            # A whole genome (or something unknown): nothing to lift, only the parent changes.
            return location.reset_parent(new_parent)

        if not new_parent.has_ancestor_of_type(SequenceType.CHROMOSOME):
            raise NoSuchAncestorException(
                "Must have a chromosome in the hierarchy if a sequence chunk is provided."
            )

        chunk_parent = new_parent.first_ancestor_of_type(SequenceType.SEQUENCE_CHUNK)
        chunk_sequence = chunk_parent.sequence
        if not chunk_sequence:
            raise NullSequenceException("Must have a sequence if a sequence chunk parent is provided.")

        location = location.reset_parent(chunk_parent.parent)
        try:
            # Blocks are deliberately not optimized so that adjacent CDS intervals stay separate.
            relative_to_chunk = chunk_sequence.location_on_parent.parent_to_relative_location(
                location, optimize_blocks=False
            )
        except LocationOverlapException:
            # The positions of this Location do not overlap the sequence chunk. The chromosome location
            # information can still be retained elsewhere, but there is inherently no sequence information.
            return EmptyLocation()
        else:
            return relative_to_chunk.reset_parent(new_parent)