def test_to_gff(self, test_data_dir, tmp_path):
    """Round-trip a GFF3 file through disk and check that nothing changes.

    Parses the INSC1006 fixture, converts every parsed record to an
    annotation collection, writes those collections out as GFF3, parses the
    written file again, and compares the dictionary form of each pair of
    collections.
    """
    gff = test_data_dir / "INSC1006_chrI.gff3"
    parsed = list(parse_standard_gff3(gff))
    a = [x.to_annotation_collection() for x in parsed]

    tmp_gff = tmp_path / "tmp.gff"
    with open(tmp_gff, "w") as fh:
        collection_to_gff3(a, fh)

    gff2 = list(parse_standard_gff3(tmp_gff))
    a2 = [x.to_annotation_collection() for x in gff2]

    # zip() stops at the shorter input, so without this check a round trip
    # that silently dropped collections would still pass.
    assert len(a) == len(a2)
    for c1, c2 in zip(a, a2):
        assert c1.to_dict() == c2.to_dict()
def test_parse_insc1006(self, test_data_dir):
    """Compare the parsed INSC1006_chrI GFF3 against its stored JSON model.

    INSC1006_chrI is a 4-gene manually built file from INSC1006. It uses the
    default naive parser.
    """
    gff_path = test_data_dir / "INSC1006_chrI.gff3"
    records = list(parse_standard_gff3(gff_path))
    observed = records[0].annotation

    with open(test_data_dir / "INSC1006_chrI.json") as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))

    assert expected == observed
def test_parse_peg10(self, test_data_dir):
    """Compare the parsed PEG10 GFF3 against its stored JSON model.

    PEG10 is a gene with a -1 frameshift in one isoform, that is parsed
    using the RefSeq parser.
    """
    gff_path = test_data_dir / "PEG10_minus1frameshift.gff3"
    first_record = list(parse_standard_gff3(gff_path))[0]

    with open(test_data_dir / "PEG10_minus1frameshift.json") as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))
        assert expected == first_record.annotation
def test_parse_non_gene_locus_tag(self, test_data_dir):
    """Check locus tags on the first two features of the regression fixture.

    Regression test: a non-gene feature carrying a locus tag used to raise
    an error. The fixture uses the cases from the S288C GFF3 that surfaced
    the issue.
    """
    gff_path = test_data_dir / "feature_test_non_gene_locus_tag.gff"
    records = list(parse_standard_gff3(gff_path))
    collection = records[0].annotation.to_annotation_collection()
    features = list(collection)

    first_locus_tag = features[0].to_dict()["locus_tag"]
    assert first_locus_tag is None

    second_locus_tag = features[1].to_dict()["locus_tag"]
    assert second_locus_tag == "YFL007W"
def test_transitive(self, test_data_dir):
    """Test transitive loops.

    For each fixture, the first record's annotation collection is dumped to
    a dict, loaded through the model schema, converted back to a collection,
    and dumped again; both dict forms must be equal.
    """
    fixture_names = (
        "SGCE.gff3",
        "FRG2B.gff3",
        "PEG10_minus1frameshift.gff3",
        "INSC1006_chrI.gff3",
    )
    for fixture_name in fixture_names:
        records = list(parse_standard_gff3(test_data_dir / fixture_name))
        collection = records[0].to_annotation_collection()

        reloaded_model = AnnotationCollectionModel.Schema().load(collection.to_dict())
        round_tripped = reloaded_model.to_annotation_collection().to_dict()

        assert round_tripped == collection.to_dict()
def test_transcript_inference(self, test_data_dir):
    """Check the gene count and gene types parsed from feature_test_1.gff."""
    records = list(parse_standard_gff3(test_data_dir / "feature_test_1.gff"))
    collection = records[0].annotation.to_annotation_collection()
    genes = collection.genes

    # The fixture holds four genes in total.
    assert len(genes) == 4

    # Two different kinds of pseudogene transcripts were inferred: one
    # without exons and one with exons.
    pseudogenes = [gene for gene in genes if gene.gene_type == Biotype.pseudogene]
    assert len(pseudogenes) == 2

    lncrnas = [gene for gene in genes if gene.gene_type == Biotype.lncRNA]
    assert len(lncrnas) == 1

    # One gene had an invalid biotype, so its gene type was set to None.
    untyped_genes = [gene for gene in genes if not gene.gene_type]
    assert len(untyped_genes) == 1

    provided_biotypes = list(untyped_genes[0].qualifiers["provided_biotype"])
    assert provided_biotypes[0] == "invalid"
def test_gff3_strand_error(self, tmp_path):
    """Expect a child/parent mismatch error when parsing an altered strand.

    The strand of the first feature interval in the first feature collection
    of ``self.annot`` is set to ``"MINUS"``. The altered collection is
    written out as GFF3, and parsing that file must raise
    ``GFF3ChildParentMismatchError``.
    """
    output_path = tmp_path / "tmp.gff3"

    as_dict = self.annot.to_annotation_collection().to_dict()
    first_interval = as_dict["feature_collections"][0]["feature_intervals"][0]
    first_interval["strand"] = "MINUS"

    altered_model = AnnotationCollectionModel.Schema().load(as_dict)
    altered_collection = altered_model.to_annotation_collection()

    with open(output_path, "w") as handle:
        collection_to_gff3([altered_collection], handle)

    with pytest.raises(GFF3ChildParentMismatchError):
        # The parser is consumed fully so that the error can surface.
        _ = list(parse_standard_gff3(output_path))
def test_parse_sgce(self, test_data_dir):
    """Check the parsed SGCE GFF3 structure and compare it to its JSON model.

    SGCE is an example of a protein coding gene with multiple isoforms, that
    is parsed using the RefSeq parser.
    """
    records = list(parse_standard_gff3(test_data_dir / "SGCE.gff3"))
    assert len(records) == 1
    record = records[0]

    parsed_genes = record.annotation.genes
    assert len(parsed_genes) == 1

    transcripts = parsed_genes[0].transcripts
    assert len(transcripts) == 22

    with open(test_data_dir / "SGCE.json") as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))
        assert expected == record.annotation
def test_parse_frg2b(self, test_data_dir):
    """Check every attribute of the single transcript parsed from FRG2B.

    FRG2B is an example of a standard coding gene.
    """
    records = list(parse_standard_gff3(test_data_dir / "FRG2B.gff3"))
    assert len(records) == 1
    record = records[0]

    parsed_genes = record.annotation.genes
    assert len(parsed_genes) == 1
    assert record.annotation.feature_collections is None

    transcripts = parsed_genes[0].transcripts
    assert len(transcripts) == 1
    transcript = transcripts[0]

    expected_attributes = {
        "exon_starts": [133623894, 133625921, 133626228, 133626564],
        "exon_ends": [133625604, 133625999, 133626303, 133626795],
        "strand": Strand.MINUS,
        "cds_starts": [133625098, 133625921, 133626228, 133626564],
        "cds_ends": [133625604, 133625999, 133626303, 133626742],
        "cds_frames": [CDSFrame.ONE, CDSFrame.ONE, CDSFrame.ONE, CDSFrame.ZERO],
        "qualifiers": {
            "Dbxref": [
                "Ensembl:ENST00000425520.2",
                "Genbank:NM_001080998.2",
                "GeneID:441581",
                "HGNC:HGNC:33518",
            ],
            "gbkey": ["mRNA"],
            "gene": ["FRG2B"],
            "tag": ["MANE Select"],
        },
        "is_primary_tx": False,
        "transcript_id": "NM_001080998.2",
        "protein_id": "NP_001074467.1",
        "product": "protein FRG2-like-1",
        "transcript_symbol": None,
        "transcript_type": Biotype.protein_coding,
        "sequence_name": "NC_000010.11",
        "sequence_guid": None,
        "transcript_interval_guid": None,
        "transcript_guid": None,
    }
    # The comparison is against the raw instance dict, so every attribute
    # of the transcript has to match.
    assert transcript.__dict__ == expected_attributes
def test_gff3_locus_tag_error(self, test_data_dir):
    """Expect ``GFF3LocusTagError`` when parsing the locus tag error fixture."""
    bad_gff = test_data_dir / "feature_test_locus_tag_error.gff"
    with pytest.raises(GFF3LocusTagError):
        # The parser is consumed fully so that the error can surface.
        _ = list(parse_standard_gff3(bad_gff))
def test_direct_cds_exon(self, test_data_dir):
    """Compare the parsed gene_cds_direct_child GFF3 to its stored JSON model."""
    gff_path = test_data_dir / "gene_cds_direct_child.gff3"
    records = list(parse_standard_gff3(gff_path))
    observed = records[0].annotation

    with open(test_data_dir / "gene_cds_direct_child.json") as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))

    assert expected == observed
def test_parse_feature_tests(self, test_data_dir, gff3, json_file):
    """Compare a parsed GFF3 fixture to its expected JSON model.

    ``gff3`` and ``json_file`` are file names resolved against
    ``test_data_dir``.
    NOTE(review): they are presumably supplied by a parametrize decorator
    that is not visible here; confirm against the full file.
    """
    records = list(parse_standard_gff3(test_data_dir / gff3))
    observed = records[0].annotation

    with open(test_data_dir / json_file) as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))

    assert expected == observed