예제 #1
0
    def test_to_gff(self, test_data_dir, tmp_path):
        """Round-trip a GFF3 file through the writer and check nothing changes.

        The source GFF3 is parsed and each record is converted to an annotation
        collection. Those collections are written to a temporary file with
        ``collection_to_gff3``, the written file is parsed again, and the two
        sets of collections are compared through their ``to_dict()`` output.
        """
        gff = test_data_dir / "INSC1006_chrI.gff3"
        parsed = list(parse_standard_gff3(gff))
        original = [rec.to_annotation_collection() for rec in parsed]

        tmp_gff = tmp_path / "tmp.gff"
        with open(tmp_gff, "w") as fh:
            collection_to_gff3(original, fh)
        reparsed = list(parse_standard_gff3(tmp_gff))
        round_tripped = [rec.to_annotation_collection() for rec in reparsed]

        # zip() stops at the shorter input, so without these two checks a writer
        # that dropped records (or a parse that yielded nothing) would let the
        # comparison loop below pass without comparing anything.
        assert original
        assert len(original) == len(round_tripped)

        for before, after in zip(original, round_tripped):
            assert before.to_dict() == after.to_dict()
예제 #2
0
 def test_parse_insc1006(self, test_data_dir):
     """Compare the parsed INSC1006_chrI annotation with its stored JSON model.

     INSC1006_chrI is a 4-gene manually built file from INSC1006. It uses the default naive parser.
     """
     parsed_records = list(parse_standard_gff3(test_data_dir / "INSC1006_chrI.gff3"))
     annotation = parsed_records[0].annotation
     expected_path = test_data_dir / "INSC1006_chrI.json"
     with open(expected_path) as handle:
         expected = AnnotationCollectionModel.Schema().load(json.load(handle))
     assert expected == annotation
예제 #3
0
 def test_parse_peg10(self, test_data_dir):
     """Compare the first parsed PEG10 record's annotation with its stored JSON model.

     PEG10 is a gene with a -1 frameshift in one isoform, that is parsed using the RefSeq parser.
     """
     records = list(parse_standard_gff3(test_data_dir / "PEG10_minus1frameshift.gff3"))
     first_record = records[0]
     with open(test_data_dir / "PEG10_minus1frameshift.json") as handle:
         expected = AnnotationCollectionModel.Schema().load(json.load(handle))
         assert expected == first_record.annotation
예제 #4
0
 def test_parse_non_gene_locus_tag(self, test_data_dir):
     """Regression test for non-gene features that carry a locus tag.

     A non-gene feature with a locus tag used to lead to an error; the input
     cases come from the S288C gff3 that surfaced the issue. The first two
     features of the first record are checked for their ``locus_tag`` values.
     """
     gff_path = test_data_dir / "feature_test_non_gene_locus_tag.gff"
     first_record = list(parse_standard_gff3(gff_path))[0]
     collection = first_record.annotation.to_annotation_collection()
     features = list(collection)

     first_tag = features[0].to_dict()["locus_tag"]
     assert first_tag is None

     second_tag = features[1].to_dict()["locus_tag"]
     assert second_tag == "YFL007W"
예제 #5
0
 def test_transitive(self, test_data_dir):
     """Check that collection -> dict -> model -> collection -> dict is lossless.

     Runs the loop for the first record of each listed GFF3 file.
     """
     file_names = ["SGCE.gff3", "FRG2B.gff3", "PEG10_minus1frameshift.gff3", "INSC1006_chrI.gff3"]
     for file_name in file_names:
         records = list(parse_standard_gff3(test_data_dir / file_name))
         collection = records[0].to_annotation_collection()
         # Serialize, load through the schema, convert back and serialize again.
         model = AnnotationCollectionModel.Schema().load(collection.to_dict())
         round_tripped = model.to_annotation_collection().to_dict()
         assert round_tripped == collection.to_dict()
예제 #6
0
 def test_transcript_inference(self, test_data_dir):
     """Check gene counts and gene biotypes parsed from feature_test_1.gff."""
     records = list(parse_standard_gff3(test_data_dir / "feature_test_1.gff"))
     collection = records[0].annotation.to_annotation_collection()

     # 4 total genes
     assert len(collection.genes) == 4

     # two different types of pseudogene transcripts were inferred, one without exons and one with exons
     pseudogene_count = sum(1 for gene in collection.genes if gene.gene_type == Biotype.pseudogene)
     assert pseudogene_count == 2

     lncrna_count = sum(1 for gene in collection.genes if gene.gene_type == Biotype.lncRNA)
     assert lncrna_count == 1

     # one gene with an invalid biotype who was set to None
     untyped_genes = [gene for gene in collection.genes if not gene.gene_type]
     assert len(untyped_genes) == 1
     provided_biotypes = list(untyped_genes[0].qualifiers["provided_biotype"])
     assert provided_biotypes[0] == "invalid"
예제 #7
0
    def test_gff3_strand_error(self, tmp_path):
        """Expect GFF3ChildParentMismatchError after overwriting one interval's strand.

        The strand of the first feature interval in the first feature collection
        is set to ``"MINUS"``, the modified collection is written to GFF3, and
        parsing that file must raise.
        """
        out_path = tmp_path / "tmp.gff3"

        as_dict = self.annot.to_annotation_collection().to_dict()
        first_interval = as_dict["feature_collections"][0]["feature_intervals"][0]
        first_interval["strand"] = "MINUS"
        mutated = AnnotationCollectionModel.Schema().load(as_dict).to_annotation_collection()

        with open(out_path, "w") as handle:
            collection_to_gff3([mutated], handle)

        with pytest.raises(GFF3ChildParentMismatchError):
            # list() forces the parser to run to completion.
            list(parse_standard_gff3(out_path))
예제 #8
0
 def test_parse_sgce(self, test_data_dir):
     """Check record, gene and transcript counts for SGCE and compare with its stored JSON model.

     SGCE is an example of a protein coding gene with multiple isoforms, that is parsed using the RefSeq parser.
     """
     records = list(parse_standard_gff3(test_data_dir / "SGCE.gff3"))
     assert len(records) == 1
     record = records[0]

     gene_list = record.annotation.genes
     assert len(gene_list) == 1

     transcripts = gene_list[0].transcripts
     assert len(transcripts) == 22

     with open(test_data_dir / "SGCE.json") as handle:
         expected = AnnotationCollectionModel.Schema().load(json.load(handle))
         assert expected == record.annotation
예제 #9
0
 def test_parse_frg2b(self, test_data_dir):
     """Check the single FRG2B transcript attribute-by-attribute.

     FRG2B is an example of a standard coding gene.
     """
     records = list(parse_standard_gff3(test_data_dir / "FRG2B.gff3"))
     assert len(records) == 1
     record = records[0]

     gene_list = record.annotation.genes
     assert len(gene_list) == 1
     assert record.annotation.feature_collections is None

     transcripts = gene_list[0].transcripts
     assert len(transcripts) == 1
     transcript = transcripts[0]

     # Full expected attribute dictionary of the parsed transcript.
     expected_attributes = {
         "exon_starts": [133623894, 133625921, 133626228, 133626564],
         "exon_ends": [133625604, 133625999, 133626303, 133626795],
         "strand": Strand.MINUS,
         "cds_starts": [133625098, 133625921, 133626228, 133626564],
         "cds_ends": [133625604, 133625999, 133626303, 133626742],
         "cds_frames": [CDSFrame.ONE, CDSFrame.ONE, CDSFrame.ONE, CDSFrame.ZERO],
         "qualifiers": {
             "Dbxref": ["Ensembl:ENST00000425520.2", "Genbank:NM_001080998.2", "GeneID:441581", "HGNC:HGNC:33518"],
             "gbkey": ["mRNA"],
             "gene": ["FRG2B"],
             "tag": ["MANE Select"],
         },
         "is_primary_tx": False,
         "transcript_id": "NM_001080998.2",
         "protein_id": "NP_001074467.1",
         "product": "protein FRG2-like-1",
         "transcript_symbol": None,
         "transcript_type": Biotype.protein_coding,
         "sequence_name": "NC_000010.11",
         "sequence_guid": None,
         "transcript_interval_guid": None,
         "transcript_guid": None,
     }
     assert vars(transcript) == expected_attributes
예제 #10
0
 def test_gff3_locus_tag_error(self, test_data_dir):
     """Parsing feature_test_locus_tag_error.gff must raise GFF3LocusTagError."""
     bad_gff = test_data_dir / "feature_test_locus_tag_error.gff"
     with pytest.raises(GFF3LocusTagError):
         # list() forces the parser to run to completion.
         list(parse_standard_gff3(bad_gff))
예제 #11
0
 def test_direct_cds_exon(self, test_data_dir):
     """Compare the parsed gene_cds_direct_child.gff3 annotation with its stored JSON model."""
     gff_path = test_data_dir / "gene_cds_direct_child.gff3"
     first_record = list(parse_standard_gff3(gff_path))[0]
     annotation = first_record.annotation
     with open(test_data_dir / "gene_cds_direct_child.json") as handle:
         expected = AnnotationCollectionModel.Schema().load(json.load(handle))
     assert expected == annotation
예제 #12
0
 def test_parse_feature_tests(self, test_data_dir, gff3, json_file):
     """Compare the first parsed record's annotation with the model loaded from a JSON file.

     ``gff3`` and ``json_file`` are file names resolved relative to ``test_data_dir``.
     """
     parsed_records = list(parse_standard_gff3(test_data_dir / gff3))
     annotation = parsed_records[0].annotation
     with open(test_data_dir / json_file) as handle:
         raw_model = json.load(handle)
         expected = AnnotationCollectionModel.Schema().load(raw_model)
         assert expected == annotation