def test_sequences(test_data_dir, tmp_path, gbk, flavor):
    """Round-trip a GenBank file through the exporter and compare both parses.

    The input is parsed, written back out with ``collection_to_genbank`` using
    ``flavor``, and parsed a second time. For the first record of each parse,
    the gene, first-transcript and CDS locations as well as the protein and
    transcript sequences must be identical.
    """

    def _load(path):
        # Materialize the records while the handle is still open.
        with open(path, "r") as handle:
            return list(
                ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(handle))
            )

    original = _load(test_data_dir / gbk)

    roundtrip_path = tmp_path / "tmp.gbk"
    with open(roundtrip_path, "w") as handle:
        collection_to_genbank(original, handle, flavor)

    reparsed = _load(roundtrip_path)

    before = original[0]
    after = reparsed[0]
    assert len(before.genes) == len(after.genes)

    for gene_a, gene_b in zip(before, after):
        assert gene_a._location == gene_b._location

        tx_a = gene_a.transcripts[0]
        tx_b = gene_b.transcripts[0]
        assert tx_a._location == tx_b._location

        # CDS and protein comparisons only apply to coding transcripts.
        if tx_a.is_coding:
            assert tx_a.cds._location == tx_b.cds._location
            assert tx_a.get_protein_sequence() == tx_b.get_protein_sequence()

        assert tx_a.get_transcript_sequence() == tx_b.get_transcript_sequence()
def test_missing_translation(test_data_dir, tmp_path):
    """Export a GenBank file with and without recalculating translations.

    With ``update_translations=False`` the first gene's transcript must have no
    ``translation`` qualifier and the second gene's must match a fixed expected
    value. With ``update_translations=True`` both transcripts must carry the
    expected translations listed below.
    """
    second_gene_without_update = (
        "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGK"
        "QIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEE"
        "NDIPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPDTAQRVADWLGKNYLQNQEGF"
        "VHICRLDTAGARVLEN"
    )
    first_gene_with_update = (
        "MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERIFAELLTGLAAA"
        "QPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLE"
        "STVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDAR"
        "LLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMK"
        "GMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRT"
        "LRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSWL"
        "KNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHV"
        "VTPNKKANTSSMDYYHLLRHAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSLSYIFGKLDEGMSFSEATT"
        "LAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVL"
        "RYVGNIDEDGACRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV*"
    )
    second_gene_with_update = (
        "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGK"
        "QIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEE"
        "NDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGFIHACYSRQPELAAKLMKDVIAEPYRE"
        "RLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPDTAQRVADWLGKNYLQNQEGFVHICRLDTAGARVLEN*"
    )

    source_path = test_data_dir / "INSC1003_wrong_missing_translation.gbk"
    roundtrip_path = tmp_path / "tmp.gbk"

    def _parse_sorted(path):
        # Materialize the records while the handle is still open.
        with open(path, "r") as handle:
            return list(
                ParsedAnnotationRecord.parsed_annotation_records_to_model(
                    parse_genbank(handle, gbk_type=GenBankParserType.SORTED)
                )
            )

    parsed_input = _parse_sorted(source_path)

    def _export_and_reload_genes(update_translations):
        # Write the originally parsed records and return the genes of the first re-parsed record.
        with open(roundtrip_path, "w") as handle:
            collection_to_genbank(parsed_input, handle, update_translations=update_translations)
        return _parse_sorted(roundtrip_path)[0].genes

    genes = _export_and_reload_genes(False)
    first_qualifiers = genes[0].transcripts[0].qualifiers
    second_qualifiers = genes[1].transcripts[0].qualifiers
    assert "translation" not in first_qualifiers
    assert "translation" in second_qualifiers
    assert second_qualifiers["translation"] == {second_gene_without_update}

    # now export to file and force the translations to be recalculated
    genes = _export_and_reload_genes(True)
    assert genes[0].transcripts[0].qualifiers["translation"] == {first_gene_with_update}
    assert genes[1].transcripts[0].qualifiers["translation"] == {second_gene_with_update}
def test_parse_peg10(self, test_data_dir):
    """PEG10 is a human gene with a -1 frameshift.

    The first transcript of the first gene parsed from the GFF3 must not
    report an in-frame stop codon.
    """
    gff3_path = test_data_dir / "PEG10_offset_gff3_fasta.gff3"
    records = list(
        ParsedAnnotationRecord.parsed_annotation_records_to_model(
            parse_gff3_embedded_fasta(gff3_path)
        )
    )
    first_transcript = records[0].genes[0].transcripts[0]
    assert not first_transcript.has_in_frame_stop
def test_broken_frameshift(self, test_data_dir):
    """Merging the CDS location must make translation fail.

    After the overlapping blocks of the CDS location are merged, the frames
    list no longer matches the location, so ``translate()`` is expected to
    raise ``MismatchedFrameException``.
    """
    gbk_path = test_data_dir / "insO_frameshift.gbk"
    with open(gbk_path, "r") as handle:
        records = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(handle))
        )

    primary_transcript = records[0].genes[0].get_primary_transcript()
    cds = primary_transcript.cds
    merged_location = cds._location.merge_overlapping()
    cds._location = merged_location

    with pytest.raises(MismatchedFrameException):
        cds.translate()
def test_parse_inso(self, test_data_dir):
    """This proves we handle frame and phase.

    The primary protein of the first gene must be the same whether it is
    parsed from the GenBank file or from the GFF3 file, and must equal the
    expected sequence below.
    """
    expected_protein = (
        "MKKRNFSAEFKRESAQLVVDQKYTVADAAKAMDVGLSTMTRWVKQLRDERQGKTPKASPITPEQIEIRKLRKKLQRIEMENEILKKNRP"
        "EKPDGRRAVLRSQVLELHGISHGSAGARSIATMATRRGYQMGRWLAGRLMKELGLVSCQQPTHRYKRGGHEHVAIPNYLERQFAVTEPNQV"
        "WCGDVTYIWTGKRWAYLAVVLDLFARKPVGWAMSFSPDSRLTMKALEMAWETRGKPVGVMFQSDQGSHYTSRQFRQLLWRYRIRQSMSRR"
        "GNCWDNSPMERFFRSLKNEWVPATGYVSFSDAAHAITDYIVGYYSALRPHEYNGGLPPNESENRYWKNSNAEASFS*"
    )

    gbk_path = test_data_dir / "insO_frameshift.gbk"
    gff3_path = test_data_dir / "insO_frameshift.gff3"

    with open(gbk_path, "r") as handle:
        gbk_records = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(handle))
        )
    gff3_records = list(
        ParsedAnnotationRecord.parsed_annotation_records_to_model(
            parse_gff3_embedded_fasta(gff3_path)
        )
    )

    protein_from_gbk = str(gbk_records[0].genes[0].get_primary_protein())
    protein_from_gff3 = str(gff3_records[0].genes[0].get_primary_protein())

    assert protein_from_gbk == protein_from_gff3
    assert protein_from_gff3 == expected_protein
def test_tbl_export_from_gff3(test_data_dir, tmp_path, gff3, expected_tbl):
    """Export a GFF3 (with embedded FASTA) to TBL and compare with the expected file.

    The export is called with a fixed ``random_seed`` and the written text must
    equal the content of ``expected_tbl`` exactly.
    """
    records = list(
        ParsedAnnotationRecord.parsed_annotation_records_to_model(
            parse_gff3_embedded_fasta(test_data_dir / gff3)
        )
    )

    produced_path = tmp_path / "tmp.tbl"
    with open(produced_path, "w") as handle:
        collection_to_tbl(
            records,
            handle,
            locus_tag_prefix="test",
            submitter_lab_name="inscripta",
            random_seed=123,
        )

    with open(produced_path) as produced, open(test_data_dir / expected_tbl) as expected:
        produced_text = produced.read()
        expected_text = expected.read()
    assert produced_text == expected_text
def test_genbank_to_gff(self, test_data_dir, tmp_path, gbk, gff3, add_sequences):
    """
    INSC1006_chrI.gff3 and INSC1003.gff3 were created from INSC1006_chrI.gbff and INSC1003.gbk respectively,
    so we can compare to the source file.

    The GenBank file is parsed, exported to GFF3 (optionally with sequences), and the
    exported file is compared line by line with the expected GFF3.
    """
    gbk = test_data_dir / gbk
    with open(gbk, "r") as fh:
        parsed = list(ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(fh)))

    tmp_gff = tmp_path / "tmp.gff"
    with open(tmp_gff, "w") as fh:
        collection_to_gff3(parsed, fh, add_sequences=add_sequences)

    # Open both files in a ``with`` block so the handles are closed deterministically
    # instead of being left for the garbage collector.
    # NOTE: ``zip`` stops at the end of the shorter file, so trailing lines present
    # in only one of the files are not compared.
    with open(tmp_gff) as produced, open(test_data_dir / gff3) as expected:
        for l1, l2 in zip(produced, expected):
            assert l1 == l2
def test_collection_to_fasta_from_genbank(test_data_dir, tmp_path):
    """This FASTA export matches exactly because there are no FASTA comments."""
    source_path = test_data_dir / "INSC1006_chrI.gbff"
    with open(source_path, "r") as handle:
        records = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(handle))
        )

    produced_path = tmp_path / "tmp.fasta"
    with open(produced_path, "w") as handle:
        collection_to_fasta(records, handle)

    expected_path = test_data_dir / "INSC1006_chrI.fa"
    with open(produced_path, "r") as produced, open(expected_path, "r") as expected:
        produced_text = produced.read()
        expected_text = expected.read()
    assert produced_text == expected_text
def test_tbl_export_from_genbank_prokaryotic(test_data_dir, tmp_path, genbank, expected_tbl):
    """Export a GenBank file to TBL with the prokaryotic flavor and compare with the expected file.

    The GenBank path is handed directly to ``parse_genbank``; the export is called
    with a fixed ``random_seed`` and the written text must equal ``expected_tbl`` exactly.
    """
    source_path = test_data_dir / genbank
    records = list(
        ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(source_path))
    )

    export_options = dict(
        locus_tag_prefix="test",
        submitter_lab_name="inscripta",
        genbank_flavor=GenbankFlavor.PROKARYOTIC,
        random_seed=123,
    )
    produced_path = tmp_path / "tmp.tbl"
    with open(produced_path, "w") as handle:
        collection_to_tbl(records, handle, **export_options)

    with open(produced_path) as produced, open(test_data_dir / expected_tbl) as expected:
        produced_text = produced.read()
        expected_text = expected.read()
    assert produced_text == expected_text
def _produce_empty_records(
    seqrecords_dict: Dict[str, SeqRecord], seen_seqs: Set[str]
) -> Iterable[ParsedAnnotationRecord]:
    """
    Convenience function shared by :meth:`parse_gff3_embedded_fasta()` and :meth:`parse_gff3_fasta()` that appends
    empty ``ParsedAnnotationRecord`` objects to the end. This ensures that every sequence in the FASTA is still
    represented in the final object set, even if it has zero annotations.

    Records are produced in the iteration order of ``seqrecords_dict`` so that the output order is
    reproducible between runs.

    Args:
        seqrecords_dict: Dictionary mapping sequence names to SeqRecord objects.
        seen_seqs: Set of sequences that were found when parsing the GFF3.

    Yields:
        Iterable of ``ParsedAnnotationRecord`` objects with empty annotations.
    """
    # Walk the dictionary rather than the set difference ``keys() - seen_seqs``: a set of strings has
    # no stable iteration order, which made the order of the emitted records arbitrary.
    for sequence_name, seqrecord in seqrecords_dict.items():
        if sequence_name in seen_seqs:
            continue
        annot = AnnotationCollectionModel.Schema().load(
            dict(sequence_name=seqrecord.id, start=0, end=len(seqrecord))
        )
        yield ParsedAnnotationRecord(annotation=annot, seqrecord=seqrecord)
def test_collection_to_fasta_from_genbank_fasta_header(test_data_dir, tmp_path):
    """INSC1003.fa has FASTA comments, and so the sequence will match but the comments will be lost.

    Every line after the header must be identical; the header lines must differ
    while still sharing the same first whitespace-delimited token.
    """
    source_path = test_data_dir / "INSC1003.gbk"
    with open(source_path, "r") as handle:
        records = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(handle))
        )

    produced_path = tmp_path / "tmp.fasta"
    with open(produced_path, "w") as handle:
        collection_to_fasta(records, handle)

    expected_path = test_data_dir / "INSC1003.fa"
    with open(produced_path, "r") as produced, open(expected_path, "r") as expected:
        produced_lines = produced.readlines()
        expected_lines = expected.readlines()

    produced_header, expected_header = produced_lines[0], expected_lines[0]
    assert produced_lines[1:] == expected_lines[1:]
    assert produced_header != expected_header
    assert produced_header.split()[0] == expected_header.split()[0]
def parse_standard_gff3(
    gff: Path,
    gffutil_parse_args: Optional[GffutilsParseArgs] = GffutilsParseArgs(),
    parse_func: Optional[Callable[[FeatureDB, List[str]], Iterable[AnnotationCollectionModel]]] = default_parse_func,
    gffutil_transform_func: Optional[Callable[[Feature], Feature]] = None,
    db_fn: Optional[str] = ":memory:",
) -> Iterable[ParsedAnnotationRecord]:
    """Parses a GFF3 file using gffutils.

    The parameters parse_func, gffutil_parse_args are implemented separately for each data source. A default
    implementation exists in this module.

    Because this is a generator, nothing is parsed (and no exception is raised) until the first
    record is requested.

    Args:
        gff: Path to a GFF. Must be local or HTTPS.
        gffutil_parse_args: Parsing arguments to pass to gffutils. ``None`` is treated the same as the
            default, a fresh ``GffutilsParseArgs()``.
        parse_func: Function that actually converts gffutils to BioCantor representation. ``None`` is treated
            the same as the default, ``default_parse_func``.
        gffutil_transform_func: Function that transforms feature keys. Can be necessary
            in cases where IDs are not unique.
        db_fn: Location to write a gffutils database. Defaults to `:memory:`, which means the database will be built
            transiently. This value can be set to a file location if memory is a concern, or if you want to retain
            the gffutils database. It will not be cleaned up. ``None`` is treated the same as the default.

    Yields:
        Iterable of ``ParsedAnnotationRecord`` objects.

    Raises:
        EmptyGFF3Exception: If the parsed database contains zero features.
    """
    # These parameters are annotated Optional, so make an explicit None behave like the default
    # instead of failing later with an AttributeError/TypeError.
    if gffutil_parse_args is None:
        gffutil_parse_args = GffutilsParseArgs()
    if parse_func is None:
        parse_func = default_parse_func
    if db_fn is None:
        db_fn = ":memory:"

    db = gffutils.create_db(str(gff), db_fn, transform=gffutil_transform_func, **gffutil_parse_args.__dict__)

    # Count each feature type once; the counts are used both for the emptiness check and for logging.
    feature_counts = {feature_type: db.count_features_of_type(feature_type) for feature_type in db.featuretypes()}
    if sum(feature_counts.values()) == 0:
        raise EmptyGFF3Exception("Parsing this GFF3 led to zero features. Is it empty or corrupted?")

    logger.info(f"Parsed {gff}")
    for feature_type, count in feature_counts.items():
        logger.info(f"Found feature type {feature_type} with {count} features")

    # get the sequences
    chrom_query = db.execute("SELECT DISTINCT seqid FROM features")
    chroms = [x["seqid"] for x in chrom_query]
    logger.info(f"Found {len(chroms)} sequences")

    for annot in parse_func(db, chroms):
        yield ParsedAnnotationRecord(annot)
def group_gene_records_by_locus_tag(
    record_iter: Iterator[SeqRecord],
    parse_func: Callable[[GeneFeature], Dict[str, Any]],
    feature_parse_func: Callable[[FeatureIntervalGenBankCollection], Dict[str, Any]],
    genbank_parser_type: GenBankParserType = GenBankParserType.LOCUS_TAG,
) -> Iterator[ParsedAnnotationRecord]:
    """Model 2: ``locus_tag`` defined GenBank.

    Features whose type is in ``GENBANK_GENE_FEATURES`` and that carry a ``locus_tag`` qualifier are grouped by
    the first value of that qualifier and assembled into genes. The source feature, if present, supplies the
    qualifiers of the resulting annotation. Every other feature is treated as a generic feature
    (FeatureInterval), unless hybrid mode is active.

    In hybrid mode, ``locus_tag`` grouping is applied first, and the remaining features are then handed to
    :func:`group_gene_records_from_sorted_genbank`; both its genes and its feature collections are merged into
    the result.

    Args:
        record_iter: Iterator of SeqRecord objects.
        parse_func: Parse function applied to every finalized gene.
        feature_parse_func: Parse function applied to generic feature intervals.
        genbank_parser_type: Parser type. Must be ``GenBankParserType.LOCUS_TAG`` or
            ``GenBankParserType.HYBRID``; the latter enables hybrid parsing mode.

    Yields:
        :class:`ParsedAnnotationRecord`.

    Raises:
        GenBankParserError: If ``genbank_parser_type`` is neither locus tag nor hybrid.
        GenBankLocusTagError: If more than one gene feature shares a locus tag.
        EmptyGenBankError: If, after all records were processed, no gene and no feature was produced.
    """
    allowed_parser_types = (GenBankParserType.LOCUS_TAG, GenBankParserType.HYBRID)
    if genbank_parser_type not in allowed_parser_types:
        raise GenBankParserError("Must use either locus_tag or hybrid")

    locus_tag_key = KnownQualifiers.LOCUS_TAG.value
    gene_type = GeneFeatures.GENE.value
    source_type = MetadataFeatures.SOURCE.value

    tot_genes = 0
    tot_features = 0

    for seqrecord in record_iter:
        # Split the record's features into locus-tagged gene features, the source feature, and the rest.
        tagged_features = []
        leftover_features = []
        source = None
        for feat in seqrecord.features:
            if feat.type in GENBANK_GENE_FEATURES and locus_tag_key in feat.qualifiers:
                tagged_features.append(feat)
            elif feat.type == source_type:
                source = feat
            else:
                leftover_features.append(feat)

        # groupby only merges adjacent items, so order the features by locus tag first
        tagged_features.sort(key=lambda f: f.qualifiers[locus_tag_key])

        genes = []
        for locus_tag, group in itertools.groupby(tagged_features, key=lambda f: f.qualifiers[locus_tag_key][0]):
            # sort the features for this locus tag to bubble the "gene" feature to the top, if it exists
            members = sorted(group, key=lambda f: f.type != gene_type)

            # do we have more than one gene with this locus_tag?
            if len(members) > 1 and members[1].type == gene_type:
                raise GenBankLocusTagError(
                    f"Grouping by locus tag found multiple gene features with the same locus tag:"
                    f"\n{members[0]}\n{members[1]}"
                )

            head = members[0]
            constructor = GeneFeature if head.type == gene_type else GeneFeature.from_transcript_or_cds_feature
            gene = _construct_gene_from_feature(head, seqrecord, constructor)
            # gene is None if it was not parseable
            if not gene:
                continue

            for child in members[1:]:
                if child.type in TranscriptFeature.types:
                    gene.add_child(child)
                elif child.type in IntervalFeature.types:
                    # an interval attaches to the most recent child, or to the gene itself if there is none
                    parent = gene if len(gene.children) == 0 else gene.children[-1]
                    parent.add_child(child)

            if gene.has_children:
                gene.finalize()
                genes.append(parse_func(gene))

        source_qualifiers = source.qualifiers if source is not None else None

        if genbank_parser_type == GenBankParserType.LOCUS_TAG:
            feature_collections = _extract_generic_features(seqrecord, leftover_features, feature_parse_func)
        else:
            # hybrid parsing mode: run the sorted parser over a copy holding only the leftover features
            tmp_seqrecord = deepcopy(seqrecord)
            tmp_seqrecord.features = leftover_features
            sorted_parse = group_gene_records_from_sorted_genbank((tmp_seqrecord,), parse_func, feature_parse_func)
            tmp_annotation = next(sorted_parse)

            if tmp_annotation.annotation.feature_collections:
                feature_collections = [
                    FeatureIntervalCollectionModel.Schema().dump(x)
                    for x in tmp_annotation.annotation.feature_collections
                ]
            else:
                feature_collections = None

            if tmp_annotation.annotation.genes:
                genes.extend(GeneIntervalModel.Schema().dump(x) for x in tmp_annotation.annotation.genes)

        tot_features += len(feature_collections) if feature_collections else 0
        tot_genes += len(genes)

        annotation = AnnotationCollectionModel.Schema().load(
            {
                "genes": genes,
                "feature_collections": feature_collections,
                "name": seqrecord.id,
                "sequence_name": seqrecord.id,
                "start": 0,
                "end": len(seqrecord),
                "qualifiers": source_qualifiers,
            }
        )
        yield ParsedAnnotationRecord(annotation=annotation, seqrecord=seqrecord)

    if tot_genes + tot_features == 0:
        raise EmptyGenBankError("GenBank parsing produced zero genes and zero features.")
def group_gene_records_from_sorted_genbank(
    record_iter: Iterator[SeqRecord],
    parse_func: Callable[[GeneFeature], Dict[str, Any]],
    feature_parse_func: Callable[[FeatureIntervalGenBankCollection], Dict[str, Any]],
) -> Iterator[ParsedAnnotationRecord]:
    """Model 1: position sorted GenBank.

    This function looks for canonical gene records:
        gene -> Optional(mRNA) -> CDS records
    It also looks for canonical non-coding records:
        gene -> {misc_RNA,tRNA,rRNA,etc}

    It also will infer non-canonical record types, including non-coding transcripts and coding genes
    from isolated CDS/non-coding features (those without a gene feature before them in the sort order).

    Any features that do not fit the above bins are interpreted as generic features.

    Some GenBank files are improperly ordered, and will have things like the CDS feature first, or the mRNA feature
    first. To try and capture this, the full set of records are sorted first by position, then in the order:

    gene
    mRNA
    CDS
    exon
    anything else

    Args:
        record_iter: Iterator of SeqRecord objects.
        parse_func: Parse function applied to every finalized gene.
        feature_parse_func: Parse function applied to generic feature intervals.

    Yields:
        :class:`ParsedAnnotationRecord`.

    Raises:
        EmptyGenBankError: If, after all records were processed, no gene and no feature was produced.
    """
    tot_genes = 0
    tot_features = 0
    for seqrecord in record_iter:
        gene = None
        source = None
        genes = []
        # capture non-gene intervals downstream
        feature_features = []

        # sort features to try to capture weirdly ordered genbank files
        sorted_features = sorted(
            seqrecord.features,
            key=lambda x: (
                x.location.nofuzzy_start,
                x.type != GeneFeatures.GENE.value,
                x.type != TranscriptFeatures.CODING_TRANSCRIPT.value,
                x.type != GeneIntervalFeatures.CDS.value,
                x.type != GeneIntervalFeatures.EXON.value,
            ),
        )
        for feature in sorted_features:
            # try to capture the Source field, if it exists
            if feature.type == MetadataFeatures.SOURCE.value:
                source = feature
            # base case for start; iterate until we find a gene
            elif gene is None:
                if feature.type in GeneFeature.types:
                    gene = _construct_gene_from_feature(feature, seqrecord, GeneFeature)
                    # gene is None if it was not parseable
                    if not gene:
                        continue
                # base case for starting with a isolated ncRNA or CDS feature; immediately add them
                # and reset the gene to None
                elif feature.type in TranscriptFeature.types or feature.type in IntervalFeature.types:
                    gene = _construct_gene_from_feature(
                        feature, seqrecord, GeneFeature.from_transcript_or_cds_feature
                    )
                    # gene is None if it was not parseable
                    if gene:
                        gene.finalize()
                        gene = parse_func(gene)
                        genes.append(gene)
                    gene = None
                # this must be a generic feature
                else:
                    feature_features.append(feature)
            # next gene; re-set the gene object and report out the collection
            elif feature.type in GeneFeature.types:
                if gene.has_children:
                    gene.finalize()
                    gene = parse_func(gene)
                    genes.append(gene)
                gene = _construct_gene_from_feature(feature, seqrecord, GeneFeature)
                # gene is None if it was not parseable
                if not gene:
                    continue
            elif feature.type in TranscriptFeature.types:
                # if the current gene is non-empty, and the feature is not a mRNA, then this is a isolated ncRNA
                # finish this gene and start a new one
                # NOTE: compare against the enum's ``.value`` (a string), as every other comparison with
                # ``feature.type`` in this function does. Comparing the string to the enum member itself
                # is always unequal for a plain Enum, which would send every mRNA down this branch.
                if feature.type != TranscriptFeatures.CODING_TRANSCRIPT.value and gene.has_children:
                    gene.finalize()
                    gene = parse_func(gene)
                    genes.append(gene)
                    gene = _construct_gene_from_feature(
                        feature, seqrecord, GeneFeature.from_transcript_or_cds_feature
                    )
                    # gene is None if it was not parseable
                    if not gene:
                        continue
                else:
                    gene.add_child(feature)
            elif feature.type in IntervalFeature.types:
                # an interval attaches to the most recent child, or to the gene itself if there is none
                if not gene.has_children:
                    gene.add_child(feature)
                else:
                    gene.children[-1].add_child(feature)
            else:
                feature_features.append(feature)

        # gene could be None if this record has no annotations
        if gene is not None and gene.has_children:
            gene.finalize()
            gene = parse_func(gene)
            genes.append(gene)

        if source is not None:
            source_qualifiers = source.qualifiers
        else:
            source_qualifiers = None

        feature_collections = _extract_generic_features(seqrecord, feature_features, feature_parse_func)

        tot_features += len(feature_collections) if feature_collections else 0
        tot_genes += len(genes) if genes else 0

        annotation = AnnotationCollectionModel.Schema().load(
            dict(
                genes=genes,
                feature_collections=feature_collections,
                sequence_name=seqrecord.id,
                start=0,
                end=len(seqrecord),
                qualifiers=source_qualifiers,
            )
        )
        yield ParsedAnnotationRecord(annotation=annotation, seqrecord=seqrecord)

    if tot_genes + tot_features == 0:
        raise EmptyGenBankError("GenBank parsing produced zero genes and zero features.")