Пример #1
0
 def test_parse_insc1006(self, test_data_dir):
     """
     Parse the hand-built 4-gene ``INSC1006_chrI`` GFF3 file and compare the first record's annotation
     against the model stored in the matching JSON file. Per the original note, this file goes through
     the default naive parser.
     """
     gff3_path = test_data_dir / "INSC1006_chrI.gff3"
     parsed_records = list(parse_standard_gff3(gff3_path))
     observed = parsed_records[0].annotation
     with open(test_data_dir / "INSC1006_chrI.json") as handle:
         expected = AnnotationCollectionModel.Schema().load(json.load(handle))
     assert expected == observed
Пример #2
0
def default_parse_func(db: FeatureDB, chroms: List[str]) -> Iterable[AnnotationCollectionModel]:
    """
    Default parser function: build one annotation collection model per requested sequence.

    Attribute mappings, as described by the original documentation of this parser (the mapping itself
    is carried out by the helper functions called here):

    gene_id -> gene_id
    gene_name or if missing gene_symbol -> gene_symbol
    gene_biotype or if missing gene_type -> gene_biotype
    transcript_id -> transcript_id
    transcript_name -> transcript_symbol
        (NOTE(review): the original text listed ``transcript_name`` as its own fallback; confirm the
        intended fallback key against the helper implementation.)
    transcript_biotype or if missing transcript_type -> transcript_biotype
    if no transcript_biotype or transcript_type, then gene result is used

    The caller supplies the list of chromosomes so that results are produced in a defined order; the
    database would otherwise return them unordered.

    Args:
        db: Database from :mod:`gffutils`.
        chroms: List of sequence names to iterate over, in the order results should be yielded.

    Yields:
        :class:`~biocantor.io.models.AnnotationCollectionModel`, one per entry of ``chroms``.
    """
    feature_types = _find_non_gene_feature_types(db)

    for sequence_name in chroms:
        genes = _parse_genes(sequence_name, db)
        # Generic features are only parsed when the database contains non-gene feature types at all.
        feature_collections = _parse_features(sequence_name, db, feature_types) if feature_types else None

        payload = {
            "genes": genes,
            "feature_collections": feature_collections,
            "sequence_name": sequence_name,
        }
        yield AnnotationCollectionModel.Schema().load(payload)
Пример #3
0
 def test_parse_peg10(self, test_data_dir):
     """
     PEG10 has one isoform with a -1 frameshift; per the original note it is parsed with the RefSeq
     parser. The first parsed record must equal the model stored in the matching JSON file.
     """
     gff3_path = test_data_dir / "PEG10_minus1frameshift.gff3"
     first_record = list(parse_standard_gff3(gff3_path))[0]
     with open(test_data_dir / "PEG10_minus1frameshift.json") as handle:
         expected = AnnotationCollectionModel.Schema().load(json.load(handle))
         assert expected == first_record.annotation
Пример #4
0
 def test_transitive(self, test_data_dir):
     """Round-trip each fixture (collection -> dict -> model -> collection -> dict) and expect no change."""
     fixture_names = ("SGCE.gff3", "FRG2B.gff3", "PEG10_minus1frameshift.gff3", "INSC1006_chrI.gff3")
     for fixture_name in fixture_names:
         parsed_records = list(parse_standard_gff3(test_data_dir / fixture_name))
         collection = parsed_records[0].to_annotation_collection()
         reloaded = AnnotationCollectionModel.Schema().load(collection.to_dict()).to_annotation_collection()
         assert reloaded.to_dict() == collection.to_dict()
Пример #5
0
    def test_gff3_strand_error(self, tmp_path):
        """
        Flip the strand of the first feature interval, export the collection to GFF3 and check that
        parsing the exported file raises ``GFF3ChildParentMismatchError``.
        """
        as_dict = self.annot.to_annotation_collection().to_dict()
        first_interval = as_dict["feature_collections"][0]["feature_intervals"][0]
        first_interval["strand"] = "MINUS"
        mutated = AnnotationCollectionModel.Schema().load(as_dict).to_annotation_collection()

        out_path = tmp_path / "tmp.gff3"
        with open(out_path, "w") as handle:
            collection_to_gff3([mutated], handle)

        with pytest.raises(GFF3ChildParentMismatchError):
            list(parse_standard_gff3(out_path))
Пример #6
0
 def test_parse_sgce(self, test_data_dir):
     """
     SGCE is a protein coding gene with multiple isoforms; per the original note it is parsed with the
     RefSeq parser. Expect exactly one record holding one gene with 22 transcripts, and an annotation
     equal to the model stored in the matching JSON file.
     """
     parsed_records = list(parse_standard_gff3(test_data_dir / "SGCE.gff3"))
     assert len(parsed_records) == 1
     record = parsed_records[0]

     gene_models = record.annotation.genes
     assert len(gene_models) == 1
     assert len(gene_models[0].transcripts) == 22

     with open(test_data_dir / "SGCE.json") as handle:
         expected = AnnotationCollectionModel.Schema().load(json.load(handle))
         assert expected == record.annotation
Пример #7
0
def _produce_empty_records(
    seqrecords_dict: Dict[str, SeqRecord], seen_seqs: Set[str]
) -> Iterable[ParsedAnnotationRecord]:
    """
    Convenience function shared by :meth:`parse_gff3_embedded_fasta()` and :meth:`parse_gff3_fasta()` that appends
    empty ``ParsedAnnotationRecord`` objects to the end. This ensures that every sequence in the FASTA is still
    represented in the final object set, even if it has zero annotations.

    Records are yielded in the iteration order of ``seqrecords_dict`` so that the output is deterministic
    from run to run (iterating a set difference of the keys would depend on string hashing).

    Args:
        seqrecords_dict: Dictionary mapping sequence names to SeqRecord objects.
        seen_seqs: Set of sequences that were found when parsing the GFF3.

    Yields:
        Iterable of ``ParsedAnnotationRecord`` objects with empty annotations, one for each sequence in
        ``seqrecords_dict`` whose name is not in ``seen_seqs``.
    """
    for sequence_name, seqrecord in seqrecords_dict.items():
        if sequence_name in seen_seqs:
            continue
        # The empty annotation spans the whole sequence: [0, len(seqrecord)).
        annot = AnnotationCollectionModel.Schema().load(dict(sequence_name=seqrecord.id, start=0, end=len(seqrecord)))
        yield ParsedAnnotationRecord(annotation=annot, seqrecord=seqrecord)
Пример #8
0
def group_gene_records_by_locus_tag(
    record_iter: Iterator[SeqRecord],
    parse_func: Callable[[GeneFeature], Dict[str, Any]],
    feature_parse_func: Callable[[FeatureIntervalGenBankCollection],
                                 Dict[str, Any]],
    genbank_parser_type: GenBankParserType = GenBankParserType.LOCUS_TAG,
) -> Iterator[ParsedAnnotationRecord]:
    """Model 2: ``locus_tag`` defined GenBank.

    All feature types that qualify within the hierarchical structure, possess a locus_tag, and whose feature type
    are valid for a known transcribed interval type, will be included in the gene parsing.

    All other feature types will become generic features (FeatureIntervals), unless we are in hybrid mode.

    In hybrid mode, locus_tag is used first, then all of the remaining features are sent to the
    sorted parser.

    Args:
        record_iter: Iterator of SeqRecord objects.
        parse_func: Function applied to every finalized gene; its result is what gets stored in the
            ``genes`` list of the annotation.
        feature_parse_func: Function forwarded to the generic feature extraction (and to the sorted
            parser in hybrid mode).
        genbank_parser_type: Optional parser type. Changing this to GenBankParserType.HYBRID
            will enable hybrid parsing mode.

    Yields:
        :class:`ParsedAnnotationRecord`, one per record in ``record_iter``.

    Raises:
        GenBankParserError: If ``genbank_parser_type`` is neither ``LOCUS_TAG`` nor ``HYBRID``.
        GenBankLocusTagError: If more than one gene feature shares a locus tag.
        EmptyGenBankError: After all records were yielded, if no genes and no features were produced
            across the whole input.
    """
    if genbank_parser_type not in [
            GenBankParserType.LOCUS_TAG, GenBankParserType.HYBRID
    ]:
        raise GenBankParserError("Must use either locus_tag or hybrid")

    # running totals across all records; used for the emptiness check at the very end
    tot_genes = 0
    tot_features = 0
    for seqrecord in record_iter:
        # split the features of this record into three bins: locus-tagged gene-like features,
        # the (last seen) source feature, and everything else
        gene_filtered_features = []
        remaining_features = []
        source = None
        for f in seqrecord.features:
            if f.type in GENBANK_GENE_FEATURES and KnownQualifiers.LOCUS_TAG.value in f.qualifiers:
                gene_filtered_features.append(f)
            elif f.type == MetadataFeatures.SOURCE.value:
                source = f
            else:
                remaining_features.append(f)

        # itertools.groupby only groups adjacent items, so the features are sorted by locus tag first.
        # NOTE(review): the sort key is the whole qualifier value while the grouping key below is its
        # first element; presumably the qualifier value is a list -- confirm.
        sorted_gene_filtered_features = sorted(
            gene_filtered_features,
            key=lambda f: f.qualifiers[KnownQualifiers.LOCUS_TAG.value])

        genes = []
        for locus_tag, gene_features in itertools.groupby(
                sorted_gene_filtered_features,
                key=lambda f: f.qualifiers[KnownQualifiers.LOCUS_TAG.value][0
                                                                            ]):
            # sort the features for this locus tag to bubble the "gene" feature to the top, if it exists
            gene_features = sorted(
                gene_features, key=lambda f: f.type != GeneFeatures.GENE.value)

            # do we have more than one gene with this locus_tag?
            # (after the sort above, a second gene feature would sit at index 1)
            if len(gene_features
                   ) > 1 and gene_features[1].type == GeneFeatures.GENE.value:
                raise GenBankLocusTagError(
                    f"Grouping by locus tag found multiple gene features with the same locus tag:"
                    f"\n{gene_features[0]}\n{gene_features[1]}")

            # the first feature seeds the gene: either a real gene feature, or a gene inferred from
            # a transcript/CDS feature when no gene feature exists for this locus tag
            gene_feature = gene_features[0]
            if gene_feature.type == GeneFeatures.GENE.value:
                gene = _construct_gene_from_feature(gene_feature, seqrecord,
                                                    GeneFeature)
            else:
                gene = _construct_gene_from_feature(
                    gene_feature, seqrecord,
                    GeneFeature.from_transcript_or_cds_feature)
            # gene is None if it was not parseable
            if not gene:
                continue

            # attach the remaining features: transcript-level features go directly under the gene,
            # interval-level features go under the most recently added child (or under the gene itself
            # when it has no children yet); any other feature type is ignored here
            for feature in gene_features[1:]:
                if feature.type in TranscriptFeature.types:
                    gene.add_child(feature)
                elif feature.type in IntervalFeature.types:
                    if len(gene.children) == 0:
                        gene.add_child(feature)
                    else:
                        gene.children[-1].add_child(feature)

            # genes that ended up without any children are dropped
            if gene.has_children:
                gene.finalize()
                gene = parse_func(gene)
                genes.append(gene)

        if source is not None:
            source_qualifiers = source.qualifiers
        else:
            source_qualifiers = None

        if genbank_parser_type == GenBankParserType.LOCUS_TAG:
            feature_collections = _extract_generic_features(
                seqrecord, remaining_features, feature_parse_func)
        else:
            # hybrid parsing mode
            # run the position-sorted parser on a copy of the record that only carries the features
            # that were not consumed by the locus_tag grouping above
            tmp_seqrecord = deepcopy(seqrecord)
            tmp_seqrecord.features = remaining_features
            # a single-record tuple is passed in, so only the first yielded result is taken
            tmp_annotation = next(
                group_gene_records_from_sorted_genbank(
                    (tmp_seqrecord, ), parse_func, feature_parse_func))
            # the sorted parser returns loaded models; they are dumped through their schemas here so
            # they can be merged with ``genes`` and passed to AnnotationCollectionModel.Schema().load below
            if tmp_annotation.annotation.feature_collections:
                feature_collections = [
                    FeatureIntervalCollectionModel.Schema().dump(x)
                    for x in tmp_annotation.annotation.feature_collections
                ]
            else:
                feature_collections = None
            if tmp_annotation.annotation.genes:
                genes.extend([
                    GeneIntervalModel.Schema().dump(x)
                    for x in tmp_annotation.annotation.genes
                ])

        tot_features += len(feature_collections) if feature_collections else 0
        tot_genes += len(genes) if genes else 0

        annotation = AnnotationCollectionModel.Schema().load(
            dict(
                genes=genes,
                feature_collections=feature_collections,
                name=seqrecord.id,
                sequence_name=seqrecord.id,
                start=0,
                end=len(seqrecord),
                qualifiers=source_qualifiers,
            ))
        yield ParsedAnnotationRecord(annotation=annotation,
                                     seqrecord=seqrecord)

    # this check only runs once the caller has exhausted the iterator
    if tot_genes + tot_features == 0:
        raise EmptyGenBankError(
            "GenBank parsing produced zero genes and zero features.")
Пример #9
0
def group_gene_records_from_sorted_genbank(
    record_iter: Iterator[SeqRecord],
    parse_func: Callable[[GeneFeature], Dict[str, Any]],
    feature_parse_func: Callable[[FeatureIntervalGenBankCollection],
                                 Dict[str, Any]],
) -> Iterator[ParsedAnnotationRecord]:
    """Model 1: position sorted GenBank.

    This function looks for canonical gene records:
        gene -> Optional(mRNA) -> CDS records
    It also looks for canonical non-coding records:
        gene -> {misc_RNA,tRNA,rRNA,etc}

    It also will infer non-canonical record types, including non-coding transcripts and coding genes
    from isolated CDS/non-coding features (those without a gene feature before them in the sort order).

    Any features that do not fit the above bins are interpreted as generic features.

    Some GenBank files are improperly ordered, and will have things like the CDS feature first, or the mRNA feature
    first. To try and capture this, the full set of records are sorted first by position, then in the order:

    gene
    mRNA
    CDS
    exon
    anything else

    Args:
        record_iter: Iterator of SeqRecord objects.
        parse_func: Function applied to every finalized gene; its result is what gets stored in the
            ``genes`` list of the annotation.
        feature_parse_func: Function forwarded to the generic feature extraction.

    Yields:
        :class:`ParsedAnnotationRecord`, one per record in ``record_iter``.

    Raises:
        EmptyGenBankError: After all records were yielded, if no genes and no features were produced
            across the whole input.
    """
    # running totals across all records; used for the emptiness check at the very end
    tot_genes = 0
    tot_features = 0
    for seqrecord in record_iter:
        # ``gene`` is the gene currently being assembled, or None when no gene is open
        gene = None
        source = None
        genes = []
        # capture non-gene intervals downstream
        feature_features = []

        # sort features to try to capture weirdly ordered genbank files
        # (False sorts before True, so each ``!=`` term pushes the named type to the front on ties)
        sorted_features = sorted(
            seqrecord.features,
            key=lambda x: (
                x.location.nofuzzy_start,
                x.type != GeneFeatures.GENE.value,
                x.type != TranscriptFeatures.CODING_TRANSCRIPT.value,
                x.type != GeneIntervalFeatures.CDS.value,
                x.type != GeneIntervalFeatures.EXON.value,
            ),
        )
        for feature in sorted_features:
            # try to capture the Source field, if it exists
            if feature.type == MetadataFeatures.SOURCE.value:
                source = feature
            # base case for start; iterate until we find a gene
            elif gene is None:
                if feature.type in GeneFeature.types:
                    gene = _construct_gene_from_feature(
                        feature, seqrecord, GeneFeature)
                    # gene is None if it was not parseable
                    if not gene:
                        continue
                # base case for starting with a isolated ncRNA or CDS feature; immediately add them
                # and reset the gene to None
                elif feature.type in TranscriptFeature.types or feature.type in IntervalFeature.types:
                    gene = _construct_gene_from_feature(
                        feature, seqrecord,
                        GeneFeature.from_transcript_or_cds_feature)
                    # gene is None if it was not parseable
                    if gene:
                        gene.finalize()
                        gene = parse_func(gene)
                        genes.append(gene)
                        gene = None
                # this must be a generic feature
                else:
                    feature_features.append(feature)
            # next gene; re-set the gene object and report out the collection
            elif feature.type in GeneFeature.types:
                if gene.has_children:
                    gene.finalize()
                    gene = parse_func(gene)
                    genes.append(gene)
                gene = _construct_gene_from_feature(feature, seqrecord,
                                                    GeneFeature)
                if not gene:
                    continue
            elif feature.type in TranscriptFeature.types:
                # if the current gene is non-empty, and the feature is not a mRNA, then this is a isolated ncRNA
                # finish this gene and start a new one.
                # ``feature.type`` is a plain string, so it must be compared against the enum member's
                # ``.value`` (as done everywhere else in this function); comparing against the member
                # itself would make every transcript, including mRNAs, look like "not a mRNA".
                if feature.type != TranscriptFeatures.CODING_TRANSCRIPT.value and gene.has_children:
                    gene.finalize()
                    gene = parse_func(gene)
                    genes.append(gene)
                    gene = _construct_gene_from_feature(
                        feature, seqrecord,
                        GeneFeature.from_transcript_or_cds_feature)
                    # gene is None if it was not parseable
                    if not gene:
                        continue
                else:
                    gene.add_child(feature)
            elif feature.type in IntervalFeature.types:
                # interval-level features go under the most recent child, or directly under the gene
                # when it has no children yet
                if not gene.has_children:
                    gene.add_child(feature)
                else:
                    gene.children[-1].add_child(feature)
            else:
                feature_features.append(feature)

        # gene could be None if this record has no annotations
        if gene is not None and gene.has_children:
            gene.finalize()
            gene = parse_func(gene)
            genes.append(gene)

        if source is not None:
            source_qualifiers = source.qualifiers
        else:
            source_qualifiers = None

        feature_collections = _extract_generic_features(
            seqrecord, feature_features, feature_parse_func)

        tot_features += len(feature_collections) if feature_collections else 0
        tot_genes += len(genes) if genes else 0

        annotation = AnnotationCollectionModel.Schema().load(
            dict(
                genes=genes,
                feature_collections=feature_collections,
                sequence_name=seqrecord.id,
                start=0,
                end=len(seqrecord),
                qualifiers=source_qualifiers,
            ))
        yield ParsedAnnotationRecord(annotation=annotation,
                                     seqrecord=seqrecord)

    # this check only runs once the caller has exhausted the iterator
    if tot_genes + tot_features == 0:
        raise EmptyGenBankError(
            "GenBank parsing produced zero genes and zero features.")
Пример #10
0
class TestFeatures:
    """
    GFF3 export and re-parse tests built around a small annotation on ``chr1`` that holds one feature
    collection (two feature intervals) and one gene (two transcripts).
    """

    feat1 = dict(
        interval_starts=[2],
        interval_ends=[5],
        strand=Strand.PLUS.name,
        feature_types=["a", "b"],
        feature_name="feat1",
        sequence_name="chr1",
    )
    feat2 = dict(
        interval_starts=[2, 7, 12],
        interval_ends=[6, 10, 15],
        strand=Strand.PLUS.name,
        feature_types=["b"],
        feature_name="feat2",
        sequence_name="chr1",
    )
    featcollection = dict(
        feature_intervals=[feat1, feat2],
        feature_collection_name="featgrp1",
        feature_collection_id="abc123",
        feature_collection_type="group",
        sequence_name="chr1",
    )

    tx1 = dict(
        exon_starts=[2],
        exon_ends=[18],
        strand=Strand.PLUS.name,
        cds_starts=[5],
        cds_ends=[9],
        cds_frames=[CDSFrame.ZERO.name],
        sequence_name="chr1",
        transcript_symbol="tx1",
    )
    tx2 = dict(
        exon_starts=[2, 7, 12],
        exon_ends=[6, 10, 15],
        strand=Strand.PLUS.name,
        cds_starts=[4, 7, 12],
        cds_ends=[6, 10, 13],
        cds_frames=[CDSFrame.ZERO.name, CDSFrame.TWO.name, CDSFrame.TWO.name],
        sequence_name="chr1",
        transcript_symbol="tx2",
    )
    gene = dict(
        transcripts=[tx1, tx2],
        qualifiers={},
        gene_type=Biotype.protein_coding.name,
        gene_id="gene1",
        sequence_name="chr1",
    )

    # shared model instance; tests derive their own copies from it via to_annotation_collection()
    annot = AnnotationCollectionModel.Schema().load(
        dict(feature_collections=[featcollection],
             genes=[gene],
             sequence_name="chr1"))

    def _assert_mutation_breaks_parsing(self, tmp_path, field, value):
        """
        Overwrite ``field`` on the first feature interval of the first feature collection with ``value``,
        export the result to a temporary GFF3 file and check that parsing that file raises
        ``GFF3ChildParentMismatchError``.
        """
        tmp_gff3 = tmp_path / "tmp.gff3"

        annot = self.annot.to_annotation_collection().to_dict()
        annot["feature_collections"][0]["feature_intervals"][0][field] = value
        annot = AnnotationCollectionModel.Schema().load(
            annot).to_annotation_collection()

        with open(tmp_gff3, "w") as fh:
            collection_to_gff3([annot], fh)
        with pytest.raises(GFF3ChildParentMismatchError):
            _ = list(parse_standard_gff3(tmp_gff3))

    def test_gff3_export(self, test_data_dir):
        """Test that the GFF3 export did not change over time"""
        gff = test_data_dir / "feature_test_2.gff"
        # read through a context manager so the file handle is closed deterministically
        with open(gff) as fh:
            lines = [x.rstrip() for x in fh if not x.startswith("#")]
        assert lines == [
            str(x) for x in self.annot.to_annotation_collection().to_gff()
        ]

    def test_gff3_locus_tag_error(self, test_data_dir):
        """Parsing the locus tag error fixture must raise ``GFF3LocusTagError``."""
        with pytest.raises(GFF3LocusTagError):
            _ = list(
                parse_standard_gff3(test_data_dir /
                                    "feature_test_locus_tag_error.gff"))

    def test_gff3_strand_error(self, tmp_path):
        """A feature interval whose strand was flipped to MINUS must be rejected on re-parse."""
        self._assert_mutation_breaks_parsing(tmp_path, "strand", "MINUS")

    def test_gff3_chromosome(self, tmp_path):
        """A feature interval moved to sequence ``chr2`` must be rejected on re-parse."""
        self._assert_mutation_breaks_parsing(tmp_path, "sequence_name", "chr2")
Пример #11
0
 def test_direct_cds_exon(self, test_data_dir):
     """Parse ``gene_cds_direct_child.gff3`` and compare the first record with the stored JSON model."""
     parsed_records = list(parse_standard_gff3(test_data_dir / "gene_cds_direct_child.gff3"))
     observed = parsed_records[0].annotation
     with open(test_data_dir / "gene_cds_direct_child.json") as handle:
         expected = AnnotationCollectionModel.Schema().load(json.load(handle))
     assert expected == observed
Пример #12
0
 def test_parse_feature_tests(self, test_data_dir, gff3, json_file):
     """Parse the GFF3 fixture named by ``gff3`` and compare its first record with the model in ``json_file``."""
     parsed_records = list(parse_standard_gff3(test_data_dir / gff3))
     observed = parsed_records[0].annotation
     with open(test_data_dir / json_file) as handle:
         expected = AnnotationCollectionModel.Schema().load(json.load(handle))
     assert expected == observed