def test_parse_insc1006(self, test_data_dir):
    """Parse ``INSC1006_chrI.gff3`` and compare it to the stored JSON model.

    INSC1006_chrI is a 4-gene manually built file from INSC1006. It uses the
    default naive parser.
    """
    gff_path = test_data_dir / "INSC1006_chrI.gff3"
    json_path = test_data_dir / "INSC1006_chrI.json"

    records = list(parse_standard_gff3(gff_path))
    parsed = records[0].annotation

    with open(json_path) as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))
    assert expected == parsed
def default_parse_func(db: FeatureDB, chroms: List[str]) -> Iterable[AnnotationCollectionModel]:
    """
    Default parser function: build one annotation collection model per sequence.

    Documented attribute mappings (source attribute -> model field):

        gene_id -> gene_id
        gene_name, falling back to gene_symbol -> gene_symbol
        gene_biotype, falling back to gene_type -> gene_biotype
        transcript_id -> transcript_id
        transcript_name -> transcript_symbol
        transcript_biotype, falling back to transcript_type -> transcript_biotype
        if neither transcript_biotype nor transcript_type is present, the gene result is used

    The list of chromosomes is required so that results are produced in a caller-specified order;
    otherwise they come back unordered from the database.

    Args:
        db: Database from :mod:`gffutils`.
        chroms: List of sequence names to iterate over.

    Yields:
        :class:`~biocantor.io.models.AnnotationCollectionModel`
    """
    # Computed once up front; an empty result means no feature collections are parsed at all.
    non_gene_types = _find_non_gene_feature_types(db)

    for sequence_name in chroms:
        gene_records = _parse_genes(sequence_name, db)
        feature_records = _parse_features(sequence_name, db, non_gene_types) if non_gene_types else None
        payload = {
            "genes": gene_records,
            "feature_collections": feature_records,
            "sequence_name": sequence_name,
        }
        yield AnnotationCollectionModel.Schema().load(payload)
def test_parse_peg10(self, test_data_dir):
    """
    Parse the PEG10 GFF3 and compare it to the stored JSON model.

    PEG10 is a gene with a -1 frameshift in one isoform, that is parsed using the RefSeq parser.
    """
    records = list(parse_standard_gff3(test_data_dir / "PEG10_minus1frameshift.gff3"))
    first_record = records[0]

    with open(test_data_dir / "PEG10_minus1frameshift.json") as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))
    assert expected == first_record.annotation
def test_transitive(self, test_data_dir):
    """Test transitive loops: collection -> dict -> model -> collection -> dict is stable."""
    gff_names = ("SGCE.gff3", "FRG2B.gff3", "PEG10_minus1frameshift.gff3", "INSC1006_chrI.gff3")
    for gff_name in gff_names:
        records = list(parse_standard_gff3(test_data_dir / gff_name))
        collection = records[0].to_annotation_collection()

        reloaded = AnnotationCollectionModel.Schema().load(collection.to_dict())
        round_tripped = reloaded.to_annotation_collection().to_dict()

        assert round_tripped == collection.to_dict()
def test_gff3_strand_error(self, tmp_path):
    """Set the first feature interval's strand to MINUS, export to GFF3, and expect parsing to fail."""
    out_path = tmp_path / "tmp.gff3"

    as_dict = self.annot.to_annotation_collection().to_dict()
    first_interval = as_dict["feature_collections"][0]["feature_intervals"][0]
    first_interval["strand"] = "MINUS"

    model = AnnotationCollectionModel.Schema().load(as_dict)
    collection = model.to_annotation_collection()

    with open(out_path, "w") as handle:
        collection_to_gff3([collection], handle)

    # The parser is lazy, so it has to be consumed for the error to surface.
    with pytest.raises(GFF3ChildParentMismatchError):
        list(parse_standard_gff3(out_path))
def test_parse_sgce(self, test_data_dir):
    """
    Parse the SGCE GFF3, check its record/gene/transcript counts, and compare to the stored JSON.

    SGCE is an example of a protein coding gene with multiple isoforms, that is parsed using the
    RefSeq parser.
    """
    records = list(parse_standard_gff3(test_data_dir / "SGCE.gff3"))
    assert len(records) == 1
    record = records[0]

    gene_models = record.annotation.genes
    assert len(gene_models) == 1

    transcripts = gene_models[0].transcripts
    assert len(transcripts) == 22

    with open(test_data_dir / "SGCE.json") as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))
    assert expected == record.annotation
def _produce_empty_records( seqrecords_dict: Dict[str, SeqRecord], seen_seqs: Set[str] ) -> Iterable[ParsedAnnotationRecord]: """ Convenience function shared by :meth:`parse_gff3_embedded_fasta()` and :meth:`parse_gff3_fasta()` that appends empty ``ParsedAnnotationRecord`` objects to the end. This ensures that every sequence in the FASTA is still represented in the final object set, even if it has zero annotations. Args: seqrecords_dict: Dictionary mapping sequence names to SeqRecord objects. seen_seqs: Set of sequences that were found when parsing the GFF3. Yields: Iterable of ``ParsedAnnotationRecord`` objects with empty annotations. """ for sequence_name in seqrecords_dict.keys() - seen_seqs: seqrecord = seqrecords_dict[sequence_name] annot = AnnotationCollectionModel.Schema().load(dict(sequence_name=seqrecord.id, start=0, end=len(seqrecord))) yield ParsedAnnotationRecord(annotation=annot, seqrecord=seqrecord)
def group_gene_records_by_locus_tag(
    record_iter: Iterator[SeqRecord],
    parse_func: Callable[[GeneFeature], Dict[str, Any]],
    feature_parse_func: Callable[[FeatureIntervalGenBankCollection], Dict[str, Any]],
    genbank_parser_type: GenBankParserType = GenBankParserType.LOCUS_TAG,
) -> Iterator[ParsedAnnotationRecord]:
    """Model 2: ``locus_tag`` defined GenBank.

    Features whose type is in ``GENBANK_GENE_FEATURES`` and that carry a ``locus_tag`` qualifier are
    grouped by locus tag and assembled into genes. The source feature, if present, supplies the
    qualifiers of the resulting annotation collection. Everything else becomes generic features
    (FeatureIntervals), unless hybrid mode is enabled.

    In hybrid mode, locus_tag grouping is applied first, and all remaining features are then handed
    to the position-sorted parser; any genes it finds are appended to the locus-tag genes.

    Args:
        record_iter: Iterator of SeqRecord objects.
        parse_func: Parse function applied to each finalized gene.
        feature_parse_func: Parse function applied to generic feature intervals.
        genbank_parser_type: Parser type. Changing this to GenBankParserType.HYBRID will enable
            hybrid parsing mode.

    Yields:
        :class:`ParsedAnnotationRecord`.

    Raises:
        GenBankParserError: If ``genbank_parser_type`` is neither LOCUS_TAG nor HYBRID.
        GenBankLocusTagError: If two gene features share one locus tag.
        EmptyGenBankError: If, after all records were processed, no genes and no features were found.
    """
    if genbank_parser_type not in (GenBankParserType.LOCUS_TAG, GenBankParserType.HYBRID):
        raise GenBankParserError("Must use either locus_tag or hybrid")

    locus_tag_key = KnownQualifiers.LOCUS_TAG.value
    gene_type = GeneFeatures.GENE.value

    tot_genes = 0
    tot_features = 0

    for seqrecord in record_iter:
        # Partition the record: locus-tagged gene-like features, the source feature, and the rest.
        tagged_features = []
        remaining_features = []
        source = None
        for candidate in seqrecord.features:
            if candidate.type in GENBANK_GENE_FEATURES and locus_tag_key in candidate.qualifiers:
                tagged_features.append(candidate)
            elif candidate.type == MetadataFeatures.SOURCE.value:
                source = candidate
            else:
                remaining_features.append(candidate)

        # groupby only merges adjacent items, so the features are sorted by locus tag first
        tagged_features = sorted(tagged_features, key=lambda feat: feat.qualifiers[locus_tag_key])

        genes = []
        for _, grouped in itertools.groupby(tagged_features, key=lambda feat: feat.qualifiers[locus_tag_key][0]):
            # sort the features for this locus tag to bubble the "gene" feature to the top, if it exists
            gene_features = sorted(grouped, key=lambda feat: feat.type != gene_type)

            # a second "gene" feature directly after the first means the locus tag is not unique
            if len(gene_features) > 1 and gene_features[1].type == gene_type:
                raise GenBankLocusTagError(
                    f"Grouping by locus tag found multiple gene features with the same locus tag:"
                    f"\n{gene_features[0]}\n{gene_features[1]}")

            lead = gene_features[0]
            builder = GeneFeature if lead.type == gene_type else GeneFeature.from_transcript_or_cds_feature
            gene = _construct_gene_from_feature(lead, seqrecord, builder)
            # gene is None if it was not parseable
            if not gene:
                continue

            for child in gene_features[1:]:
                if child.type in TranscriptFeature.types:
                    gene.add_child(child)
                elif child.type in IntervalFeature.types:
                    # interval features attach to the most recent transcript, or directly to the
                    # gene when no transcript has been added yet
                    if len(gene.children) > 0:
                        gene.children[-1].add_child(child)
                    else:
                        gene.add_child(child)

            if gene.has_children:
                gene.finalize()
                genes.append(parse_func(gene))

        source_qualifiers = None if source is None else source.qualifiers

        if genbank_parser_type == GenBankParserType.LOCUS_TAG:
            feature_collections = _extract_generic_features(seqrecord, remaining_features, feature_parse_func)
        else:
            # hybrid parsing mode: run the sorted parser over a copy holding only the leftovers
            scratch_record = deepcopy(seqrecord)
            scratch_record.features = remaining_features
            fallback = next(
                group_gene_records_from_sorted_genbank((scratch_record,), parse_func, feature_parse_func)
            ).annotation

            if fallback.feature_collections:
                feature_collections = [
                    FeatureIntervalCollectionModel.Schema().dump(item) for item in fallback.feature_collections
                ]
            else:
                feature_collections = None

            if fallback.genes:
                genes.extend([GeneIntervalModel.Schema().dump(item) for item in fallback.genes])

        if feature_collections:
            tot_features += len(feature_collections)
        if genes:
            tot_genes += len(genes)

        annotation = AnnotationCollectionModel.Schema().load(
            dict(
                genes=genes,
                feature_collections=feature_collections,
                name=seqrecord.id,
                sequence_name=seqrecord.id,
                start=0,
                end=len(seqrecord),
                qualifiers=source_qualifiers,
            )
        )
        yield ParsedAnnotationRecord(annotation=annotation, seqrecord=seqrecord)

    if tot_genes + tot_features == 0:
        raise EmptyGenBankError("GenBank parsing produced zero genes and zero features.")
def group_gene_records_from_sorted_genbank(
    record_iter: Iterator[SeqRecord],
    parse_func: Callable[[GeneFeature], Dict[str, Any]],
    feature_parse_func: Callable[[FeatureIntervalGenBankCollection], Dict[str, Any]],
) -> Iterator[ParsedAnnotationRecord]:
    """Model 1: position sorted GenBank.

    This function looks for canonical gene records:
        gene -> Optional(mRNA) -> CDS records
    It also looks for canonical non-coding records:
        gene -> {misc_RNA,tRNA,rRNA,etc}

    It also will infer non-canonical record types, including non-coding transcripts and coding genes
    from isolated CDS/non-coding features (those without a gene feature before them in the sort order).

    Any features that do not fit the above bins are interpreted as generic features.

    Some GenBank files are improperly ordered, and will have things like the CDS feature first, or the
    mRNA feature first. To try and capture this, the full set of records are sorted first by position,
    then in the order:

    gene
    mRNA
    CDS
    exon
    anything else

    Args:
        record_iter: Iterator of SeqRecord objects.
        parse_func: Parse function applied to each finalized gene.
        feature_parse_func: Parse function applied to generic feature intervals.

    Yields:
        :class:`ParsedAnnotationRecord`.

    Raises:
        EmptyGenBankError: If, after all records were processed, no genes and no features were found.
    """
    tot_genes = 0
    tot_features = 0
    for seqrecord in record_iter:
        gene = None
        source = None
        genes = []
        # capture non-gene intervals downstream
        feature_features = []

        # sort features to try to capture weirdly ordered genbank files
        sorted_features = sorted(
            seqrecord.features,
            key=lambda x: (
                x.location.nofuzzy_start,
                x.type != GeneFeatures.GENE.value,
                x.type != TranscriptFeatures.CODING_TRANSCRIPT.value,
                x.type != GeneIntervalFeatures.CDS.value,
                x.type != GeneIntervalFeatures.EXON.value,
            ),
        )
        for feature in sorted_features:
            # try to capture the Source field, if it exists
            if feature.type == MetadataFeatures.SOURCE.value:
                source = feature
            # base case for start; iterate until we find a gene
            elif gene is None:
                if feature.type in GeneFeature.types:
                    gene = _construct_gene_from_feature(feature, seqrecord, GeneFeature)
                    # gene is None if it was not parseable
                    if not gene:
                        continue
                # base case for starting with a isolated ncRNA or CDS feature; immediately add them
                # and reset the gene to None
                elif feature.type in TranscriptFeature.types or feature.type in IntervalFeature.types:
                    gene = _construct_gene_from_feature(
                        feature, seqrecord, GeneFeature.from_transcript_or_cds_feature
                    )
                    # gene is None if it was not parseable
                    if gene:
                        gene.finalize()
                        gene = parse_func(gene)
                        genes.append(gene)
                    gene = None
                # this must be a generic feature
                else:
                    feature_features.append(feature)
            # next gene; re-set the gene object and report out the collection
            elif feature.type in GeneFeature.types:
                if gene.has_children:
                    gene.finalize()
                    gene = parse_func(gene)
                    genes.append(gene)
                gene = _construct_gene_from_feature(feature, seqrecord, GeneFeature)
                if not gene:
                    continue
            elif feature.type in TranscriptFeature.types:
                # if the current gene is non-empty, and the feature is not a mRNA, then this is a isolated ncRNA
                # finish this gene and start a new one.
                # Compare against the member's ``.value``: ``feature.type`` is a string, and a string
                # never equals a plain Enum member, which made this test always true and split
                # additional mRNAs of the same gene into separate genes.
                if feature.type != TranscriptFeatures.CODING_TRANSCRIPT.value and gene.has_children:
                    gene.finalize()
                    gene = parse_func(gene)
                    genes.append(gene)
                    gene = _construct_gene_from_feature(
                        feature, seqrecord, GeneFeature.from_transcript_or_cds_feature
                    )
                    # gene is None if it was not parseable
                    if not gene:
                        continue
                else:
                    gene.add_child(feature)
            elif feature.type in IntervalFeature.types:
                # interval features attach to the most recent transcript, or directly to the gene
                # when it has no children yet
                if not gene.has_children:
                    gene.add_child(feature)
                else:
                    gene.children[-1].add_child(feature)
            else:
                feature_features.append(feature)

        # gene could be None if this record has no annotations
        if gene is not None and gene.has_children:
            gene.finalize()
            gene = parse_func(gene)
            genes.append(gene)

        if source is not None:
            source_qualifiers = source.qualifiers
        else:
            source_qualifiers = None

        feature_collections = _extract_generic_features(seqrecord, feature_features, feature_parse_func)

        tot_features += len(feature_collections) if feature_collections else 0
        tot_genes += len(genes) if genes else 0

        annotation = AnnotationCollectionModel.Schema().load(
            dict(
                genes=genes,
                feature_collections=feature_collections,
                sequence_name=seqrecord.id,
                start=0,
                end=len(seqrecord),
                qualifiers=source_qualifiers,
            )
        )
        yield ParsedAnnotationRecord(annotation=annotation, seqrecord=seqrecord)

    # raised only once the whole iterator is exhausted, so it reflects every record seen
    if tot_genes + tot_features == 0:
        raise EmptyGenBankError("GenBank parsing produced zero genes and zero features.")
class TestFeatures:
    """GFF3 export/parse tests built around one small annotation collection on ``chr1``."""

    # Single-interval feature carrying two feature types.
    feat1 = dict(
        interval_starts=[2],
        interval_ends=[5],
        strand=Strand.PLUS.name,
        feature_types=["a", "b"],
        feature_name="feat1",
        sequence_name="chr1",
    )
    # Three-interval feature carrying one feature type.
    feat2 = dict(
        interval_starts=[2, 7, 12],
        interval_ends=[6, 10, 15],
        strand=Strand.PLUS.name,
        feature_types=["b"],
        feature_name="feat2",
        sequence_name="chr1",
    )
    # Feature collection grouping feat1 and feat2.
    featcollection = dict(
        feature_intervals=[feat1, feat2],
        feature_collection_name="featgrp1",
        feature_collection_id="abc123",
        feature_collection_type="group",
        sequence_name="chr1",
    )
    # Single-exon transcript with one CDS interval.
    tx1 = dict(
        exon_starts=[2],
        exon_ends=[18],
        strand=Strand.PLUS.name,
        cds_starts=[5],
        cds_ends=[9],
        cds_frames=[CDSFrame.ZERO.name],
        sequence_name="chr1",
        transcript_symbol="tx1",
    )
    # Three-exon transcript with three CDS intervals.
    tx2 = dict(
        exon_starts=[2, 7, 12],
        exon_ends=[6, 10, 15],
        strand=Strand.PLUS.name,
        cds_starts=[4, 7, 12],
        cds_ends=[6, 10, 13],
        cds_frames=[CDSFrame.ZERO.name, CDSFrame.TWO.name, CDSFrame.TWO.name],
        sequence_name="chr1",
        transcript_symbol="tx2",
    )
    # Gene holding both transcripts.
    gene = dict(
        transcripts=[tx1, tx2],
        qualifiers={},
        gene_type=Biotype.protein_coding.name,
        gene_id="gene1",
        sequence_name="chr1",
    )
    # Model combining the feature collection and the gene; shared by the tests below.
    annot = AnnotationCollectionModel.Schema().load(
        dict(feature_collections=[featcollection], genes=[gene], sequence_name="chr1")
    )

    def test_gff3_export(self, test_data_dir):
        """Test that the GFF3 export did not change over time"""
        gff = test_data_dir / "feature_test_2.gff"
        # Read inside a context manager so the file handle is closed deterministically.
        with open(gff) as fh:
            lines = [x.rstrip() for x in fh if not x.startswith("#")]
        assert lines == [str(x) for x in self.annot.to_annotation_collection().to_gff()]

    def test_gff3_locus_tag_error(self, test_data_dir):
        """Parsing ``feature_test_locus_tag_error.gff`` must raise ``GFF3LocusTagError``."""
        with pytest.raises(GFF3LocusTagError):
            _ = list(parse_standard_gff3(test_data_dir / "feature_test_locus_tag_error.gff"))

    def test_gff3_strand_error(self, tmp_path):
        """Set the first feature interval's strand to MINUS, export to GFF3, and expect parsing to fail."""
        tmp_gff3 = tmp_path / "tmp.gff3"
        annot = self.annot.to_annotation_collection().to_dict()
        annot["feature_collections"][0]["feature_intervals"][0]["strand"] = "MINUS"
        annot = AnnotationCollectionModel.Schema().load(annot).to_annotation_collection()
        with open(tmp_gff3, "w") as fh:
            collection_to_gff3([annot], fh)
        with pytest.raises(GFF3ChildParentMismatchError):
            _ = list(parse_standard_gff3(tmp_gff3))

    def test_gff3_chromosome(self, tmp_path):
        """Move the first feature interval to ``chr2``, export to GFF3, and expect parsing to fail."""
        tmp_gff3 = tmp_path / "tmp.gff3"
        annot = self.annot.to_annotation_collection().to_dict()
        annot["feature_collections"][0]["feature_intervals"][0]["sequence_name"] = "chr2"
        annot = AnnotationCollectionModel.Schema().load(annot).to_annotation_collection()
        with open(tmp_gff3, "w") as fh:
            collection_to_gff3([annot], fh)
        with pytest.raises(GFF3ChildParentMismatchError):
            _ = list(parse_standard_gff3(tmp_gff3))
def test_direct_cds_exon(self, test_data_dir):
    """Parse ``gene_cds_direct_child.gff3`` and compare it to the stored JSON model."""
    gff_path = test_data_dir / "gene_cds_direct_child.gff3"
    json_path = test_data_dir / "gene_cds_direct_child.json"

    records = list(parse_standard_gff3(gff_path))
    parsed = records[0].annotation

    with open(json_path) as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))
    assert expected == parsed
def test_parse_feature_tests(self, test_data_dir, gff3, json_file):
    """Parse the given GFF3 file and compare its first record to the given JSON model file."""
    records = list(parse_standard_gff3(test_data_dir / gff3))
    parsed = records[0].annotation

    with open(test_data_dir / json_file) as handle:
        expected = AnnotationCollectionModel.Schema().load(json.load(handle))
    assert expected == parsed