예제 #1
0
    def __init__(
        self,
        cds_starts: List[int],
        cds_ends: List[int],
        strand: Strand,
        frames_or_phases: List[Union[CDSFrame, CDSPhase]],
        sequence_guid: Optional[UUID] = None,
        sequence_name: Optional[str] = None,
        protein_id: Optional[str] = None,
        product: Optional[str] = None,
        qualifiers: Optional[Dict[Hashable, QualifierValue]] = None,
        guid: Optional[UUID] = None,
        parent_or_seq_chunk_parent: Optional[Parent] = None,
    ):
        """Build a CDS interval from parallel lists of starts, ends and frames/phases.

        Args:
            cds_starts: Start coordinate of each CDS block; the first entry becomes ``self.start``.
            cds_ends: End coordinate of each CDS block; the last entry becomes ``self.end``.
            strand: Strand handed to ``initialize_location``.
            frames_or_phases: One entry per CDS block, either all ``CDSFrame`` or all ``CDSPhase``.
                Phases are converted with ``to_frame()`` before being stored on ``self.frames``.
            sequence_guid: Stored as-is.
            sequence_name: Stored as-is.
            protein_id: Stored as-is; also part of the computed guid.
            product: Stored as-is; also part of the computed guid.
            qualifiers: Forwarded to ``_import_qualifiers_from_list``.
            guid: Explicit identifier. When ``None`` one is derived with ``digest_object``.
            parent_or_seq_chunk_parent: Optional parent handed to ``initialize_location``.

        Raises:
            MismatchedFrameException: If the number of frame/phase entries differs from the number
                of starts, or if frames and phases are mixed.
            InvalidCDSIntervalError: If the resulting chromosome location has length zero.
        """
        self._location = self.initialize_location(
            cds_starts,
            cds_ends,
            strand,
            parent_or_seq_chunk_parent,
        )
        self._genomic_starts = cds_starts
        self._genomic_ends = cds_ends
        self.start = cds_starts[0]
        self.end = cds_ends[-1]
        self._strand = strand
        self._parent_or_seq_chunk_parent = parent_or_seq_chunk_parent
        self.sequence_guid = sequence_guid
        self.sequence_name = sequence_name
        self.product = product
        self.protein_id = protein_id
        self._import_qualifiers_from_list(qualifiers)

        expected_entries = len(self._genomic_starts)
        if len(frames_or_phases) != expected_entries:
            raise MismatchedFrameException("Number of frame or phase entries must match number of exons")

        location_length = len(self.chromosome_location)
        if location_length == 0:
            raise InvalidCDSIntervalError("Cannot have an empty CDS interval")

        # The first entry decides the flavour; any later entry of the other flavour is rejected.
        uses_frames = isinstance(frames_or_phases[0], CDSFrame)
        disallowed_type = CDSPhase if uses_frames else CDSFrame
        if any(isinstance(entry, disallowed_type) for entry in frames_or_phases[1:]):
            raise MismatchedFrameException("Cannot mix frame and phase")

        # Frames are always what gets stored, whichever flavour was supplied.
        self.frames = frames_or_phases if uses_frames else [entry.to_frame() for entry in frames_or_phases]

        if guid is not None:
            self.guid = guid
        else:
            # No identifier supplied: derive one from the defining content of this interval.
            self.guid = digest_object(
                self._genomic_starts,
                self._genomic_ends,
                self.frames,
                self.product,
                self.protein_id,
                self.qualifiers,
            )
예제 #2
0
class TestBedWriter:
    """Checks the tab-joined text produced by ``to_bed12()`` for transcript and feature intervals.

    Expected rows are written as space-separated strings and split into column lists at
    definition time; each test joins the columns with tabs before comparing.
    """

    # Single-exon coding transcript on a named sequence.
    tx1 = {
        "exon_starts": [2],
        "exon_ends": [18],
        "strand": Strand.PLUS.name,
        "cds_starts": [5],
        "cds_ends": [9],
        "cds_frames": [CDSFrame.ZERO.name],
        "sequence_name": "chr1",
    }
    # Three-exon coding transcript with a symbol but no sequence name.
    tx2 = {
        "exon_starts": [2, 7, 12],
        "exon_ends": [6, 10, 15],
        "strand": Strand.PLUS.name,
        "cds_starts": [4, 7, 12],
        "cds_ends": [6, 10, 13],
        "cds_frames": [CDSFrame.ZERO.name, CDSFrame.TWO.name, CDSFrame.TWO.name],
        "transcript_interval_guid": digest_object(123),
        "transcript_symbol": "name",
    }
    # Single-block feature on a named sequence.
    feat1 = {
        "interval_starts": [2],
        "interval_ends": [5],
        "strand": Strand.PLUS.name,
        "sequence_name": "chr10",
    }
    # Three-block feature without a sequence name.
    feat2 = {
        "interval_starts": [2, 7, 12],
        "interval_ends": [6, 10, 15],
        "strand": Strand.PLUS.name,
    }
    # Single-block feature on the minus strand.
    feat3 = {
        "interval_starts": [25],
        "interval_ends": [30],
        "strand": Strand.MINUS.name,
    }

    @pytest.mark.parametrize(
        "tx,expected",
        [
            (tx1, "chr1 2 18 None 0 + 5 9 0,0,0 1 16 0".split()),
            (tx2, "None 2 15 name 0 + 4 13 0,0,0 3 4,3,3 0,5,10".split()),
        ],
    )
    def test_tx(self, tx, expected):
        """A loaded transcript model renders the expected BED12 row with default metadata."""
        interval = TranscriptIntervalModel.Schema().load(tx).to_transcript_interval()
        bed_line = str(interval.to_bed12())
        assert bed_line == "\t".join(expected)

    @pytest.mark.parametrize(
        "feat,expected",
        [
            (feat1, "chr10 2 5 None 0 + 0 0 0,0,0 1 3 0".split()),
            (feat2, "None 2 15 None 0 + 0 0 0,0,0 3 4,3,3 0,5,10".split()),
            (feat3, "None 25 30 None 0 - 0 0 0,0,0 1 5 0".split()),
        ],
    )
    def test_feat(self, feat, expected):
        """A loaded feature model renders the expected BED12 row with default metadata."""
        interval = FeatureIntervalModel.Schema().load(feat).to_feature_interval()
        bed_line = str(interval.to_bed12())
        assert bed_line == "\t".join(expected)

    @pytest.mark.parametrize(
        "tx,score,rgb,name,expected",
        [
            (
                tx2,
                10,
                RGB(128, 128, 128),
                "transcript_symbol",
                "None 2 15 name 10 + 4 13 128,128,128 3 4,3,3 0,5,10".split(),
            ),
            (
                # if name is not an attribute, just pass it along
                tx2,
                10,
                RGB(128, 128, 128),
                "test",
                "None 2 15 test 10 + 4 13 128,128,128 3 4,3,3 0,5,10".split(),
            ),
        ],
    )
    def test_changed_metadata(self, tx, score, rgb, name, expected):
        """Explicit score, colour and name arguments show up in the rendered BED12 row."""
        interval = TranscriptIntervalModel.Schema().load(tx).to_transcript_interval()
        bed_line = str(interval.to_bed12(score, rgb, name))
        assert bed_line == "\t".join(expected)
예제 #3
0
    def __init__(
        self,
        exon_starts: List[int],
        exon_ends: List[int],
        strand: Strand,
        cds_starts: Optional[List[int]] = None,
        cds_ends: Optional[List[int]] = None,
        cds_frames: Optional[List[CDSFrame]] = None,
        qualifiers: Optional[Dict[Hashable, QualifierValue]] = None,
        is_primary_tx: Optional[bool] = None,
        transcript_id: Optional[str] = None,
        transcript_symbol: Optional[str] = None,
        transcript_type: Optional[Biotype] = None,
        sequence_guid: Optional[UUID] = None,
        sequence_name: Optional[str] = None,
        protein_id: Optional[str] = None,
        product: Optional[str] = None,
        guid: Optional[UUID] = None,
        transcript_guid: Optional[UUID] = None,
        parent_or_seq_chunk_parent: Optional[Parent] = None,
    ):
        """Build a transcript interval from exon coordinates and an optional CDS definition.

        Args:
            exon_starts: Start coordinate of each exon; the first entry becomes ``self.start``.
            exon_ends: End coordinate of each exon; the last entry becomes ``self.end``.
            strand: Strand handed to ``initialize_location`` and to the CDS interval.
            cds_starts: Start coordinate of each CDS block. Must be given together with ``cds_ends``.
            cds_ends: End coordinate of each CDS block. Must be given together with ``cds_starts``.
            cds_frames: One frame per CDS block; required whenever a CDS is defined.
            qualifiers: Forwarded to ``_import_qualifiers_from_list``.
            is_primary_tx: Stored on ``self._is_primary_feature``.
            transcript_id: Stored as-is; part of the computed guid.
            transcript_symbol: Stored as-is; part of the computed guid.
            transcript_type: Stored as-is; part of the computed guid.
            sequence_guid: Stored as-is and forwarded to the CDS interval.
            sequence_name: Stored as-is and forwarded to the CDS interval; part of the computed guid.
            protein_id: Stored as-is and forwarded to the CDS interval; part of the computed guid.
            product: Stored as-is and forwarded to the CDS interval.
            guid: Explicit identifier. When ``None`` one is derived with ``digest_object``.
            transcript_guid: Stored as-is.
            parent_or_seq_chunk_parent: Optional parent handed to ``initialize_location`` and to the
                CDS interval.

        Raises:
            InvalidCDSIntervalError: If only one of ``cds_starts``/``cds_ends`` is given, their lengths
                differ, the CDS extends past the first exon start or last exon end, ``cds_frames`` is
                missing, or the number of frames differs from the number of CDS starts.
        """
        self._location = TranscriptInterval.initialize_location(
            exon_starts,
            exon_ends,
            strand,
            parent_or_seq_chunk_parent=parent_or_seq_chunk_parent,
        )

        if cds_starts is not None and cds_ends is None:
            raise InvalidCDSIntervalError("If CDS start is defined, CDS end must be defined")
        elif cds_starts is None and cds_ends is not None:
            raise InvalidCDSIntervalError("If CDS end is defined, CDS start must be defined")
        elif cds_starts is not None and cds_ends is not None:  # must be coding
            if len(cds_starts) != len(cds_ends):
                raise InvalidCDSIntervalError("Number of CDS starts does not match number of CDS ends")
            elif cds_starts[0] < exon_starts[0]:
                raise InvalidCDSIntervalError("CDS start must be greater than or equal to exon start")
            elif cds_ends[-1] > exon_ends[-1]:
                raise InvalidCDSIntervalError("CDS end must be less than or equal to exon end")
            elif cds_frames is None:
                raise InvalidCDSIntervalError("If CDS interval is defined, CDS frames must be defined")
            elif len(cds_frames) != len(cds_starts):
                raise InvalidCDSIntervalError("Number of CDS frames must match number of CDS starts/ends")

            # as a result of a parent or seq chunk parent constructor, it may be the case that this CDS is entirely
            # sliced out. Check this case, and then void out the CDS.
            try:
                self.cds = CDSInterval(
                    cds_starts,
                    cds_ends,
                    strand,
                    cds_frames,
                    sequence_guid,
                    sequence_name,
                    protein_id,
                    product,
                    parent_or_seq_chunk_parent=parent_or_seq_chunk_parent,
                )
            except LocationOverlapException:
                self.cds = None

            # The frames are kept even when the CDS itself was voided above, so they still feed the guid.
            self._cds_frames = cds_frames

        else:
            # NOTE(review): within this constructor _cds_start/_cds_end are only assigned on this
            # non-coding path; confirm nothing reads them on coding transcripts.
            self.cds = self._cds_frames = self._cds_start = self._cds_end = None

        self._genomic_starts = exon_starts
        self._genomic_ends = exon_ends
        self._strand = strand
        self._parent_or_seq_chunk_parent = parent_or_seq_chunk_parent
        self.start = self.genomic_start = exon_starts[0]
        self.end = self.genomic_end = exon_ends[-1]

        self._is_primary_feature = is_primary_tx
        self.transcript_id = transcript_id
        self.transcript_symbol = transcript_symbol
        self.transcript_type = transcript_type
        self.protein_id = protein_id
        self.product = product
        self.sequence_guid = sequence_guid
        self.sequence_name = sequence_name
        self.bin = bins(self.start, self.end, fmt="bed")
        # qualifiers come in as a List, convert to Set
        self._import_qualifiers_from_list(qualifiers)

        if guid is None:
            # No identifier supplied: derive one from the defining content of this transcript,
            # including the guid of the CDS when one exists.
            self.guid = digest_object(
                self._genomic_starts,
                self._genomic_ends,
                self._cds_frames,
                self.qualifiers,
                self.transcript_id,
                self.transcript_symbol,
                self.transcript_type,
                self.protein_id,
                self.sequence_name,
                self.is_primary_tx,
                self.cds.guid if self.cds else None,
            )
        else:
            self.guid = guid
        self.transcript_guid = transcript_guid
예제 #4
0
def test_nested_kwargs(kwargs, str_rep, uuid):
    """Keyword arguments must encode to ``str_rep`` and digest to ``uuid``."""
    encoded = list(_encode_object_for_digest(**kwargs))
    assert encoded == str_rep

    digest = digest_object(**kwargs)
    assert digest == uuid
예제 #5
0
def test_nested(val, str_rep, uuid):
    """A single value must encode to ``str_rep`` and digest to ``uuid``."""
    encoded = list(_encode_object_for_digest(val))
    assert encoded == str_rep

    digest = digest_object(val)
    assert digest == uuid
예제 #6
0
def test_dicts_of_sets(val, str_rep, uuid):
    """The supplied value must encode to ``str_rep`` and digest to ``uuid``."""
    encoded = list(_encode_object_for_digest(val))
    assert encoded == str_rep

    digest = digest_object(val)
    assert digest == uuid
예제 #7
0
def test_sets(val, str_rep, uuid):
    """The supplied value must encode to ``str_rep`` and digest to ``uuid``."""
    encoded = list(_encode_object_for_digest(val))
    assert str_rep == encoded

    digest = digest_object(val)
    assert digest == uuid
예제 #8
0
def test_same_values_different_keys(args, kwargs, str_rep, uuid):
    """Positional plus keyword arguments must encode to ``str_rep`` and digest to ``uuid``."""
    encoded = list(_encode_object_for_digest(*args, **kwargs))
    assert encoded == str_rep

    digest = digest_object(*args, **kwargs)
    assert digest == uuid