Exemplo n.º 1
0
    def _load_gtf_as_dataframe(self, usecols=None, features=None):
        """
        Read the GTF file at ``self.gtf_path`` into a DataFrame.

        ``usecols`` and ``features`` are passed through unchanged to
        ``read_gtf``. When ``features`` is None or contains "gene" /
        "transcript", and the parsed data has no rows of that feature
        type, those rows are rebuilt with ``create_missing_features``
        before the DataFrame is returned.
        """
        logger.info("Reading GTF from %s", self.gtf_path)
        converters = {
            "seqname": normalize_chromosome,
            "strand": normalize_strand,
        }
        df = read_gtf(
            self.gtf_path,
            column_converters=converters,
            infer_biotype_column=True,
            usecols=usecols,
            features=features)

        available_columns = set(df.keys())
        wants_gene = features is None or "gene" in features
        wants_transcript = features is None or "transcript" in features
        present_features = set(df["feature"])

        # Older Ensembl releases ship without "gene" / "transcript" rows.
        # Both decisions are based on the features seen in the parsed file,
        # not on rows added by an earlier reconstruction step.
        gene_rows_missing = wants_gene and "gene" not in present_features
        transcript_rows_missing = (
            wants_transcript and "transcript" not in present_features)

        if gene_rows_missing:
            logger.info("Creating missing gene features...")
            # Only carry over optional columns the GTF actually provides.
            gene_extras = {"gene_name", "gene_biotype"} & available_columns
            df = create_missing_features(
                dataframe=df,
                unique_keys={"gene": "gene_id"},
                extra_columns={"gene": gene_extras},
                missing_value="")
            logger.info("Done.")

        if transcript_rows_missing:
            logger.info("Creating missing transcript features...")
            transcript_extras = {
                "gene_id",
                "gene_name",
                "gene_biotype",
                "transcript_name",
                "transcript_biotype",
                "protein_id",
            } & available_columns
            df = create_missing_features(
                dataframe=df,
                unique_keys={"transcript": "transcript_id"},
                extra_columns={"transcript": transcript_extras},
                missing_value="")
            logger.info("Done.")

        return df
Exemplo n.º 2
0
    def _load_gtf_as_dataframe(self, usecols=None, features=None):
        """
        Parse the GTF file at ``self.gtf_path`` and return it as a DataFrame.

        ``usecols`` and ``features`` are handed to ``read_gtf`` as-is. If
        "gene" or "transcript" rows are expected (``features`` is None or
        names them) but absent from the parsed data, they are filled in
        via ``create_missing_features``.
        """
        logger.info("Reading GTF from %s", self.gtf_path)
        df = read_gtf(
            self.gtf_path,
            column_converters={
                "seqname": normalize_chromosome,
                "strand": normalize_strand,
            },
            infer_biotype_column=True,
            usecols=usecols,
            features=features)

        gtf_columns = set(df.keys())
        gene_requested = features is None or "gene" in features
        transcript_requested = features is None or "transcript" in features
        features_in_file = set(df["feature"])

        # Older Ensembl releases lack "gene" and "transcript" rows. Each
        # entry describes how to rebuild one of them:
        # (requested?, feature name, id column, log message, optional columns)
        rebuild_steps = (
            (
                gene_requested,
                "gene",
                "gene_id",
                "Creating missing gene features...",
                {"gene_name", "gene_biotype"},
            ),
            (
                transcript_requested,
                "transcript",
                "transcript_id",
                "Creating missing transcript features...",
                {
                    "gene_id",
                    "gene_name",
                    "gene_biotype",
                    "transcript_name",
                    "transcript_biotype",
                    "protein_id",
                },
            ),
        )

        for (requested, feature_name, id_column,
             start_message, optional_columns) in rebuild_steps:
            if not requested or feature_name in features_in_file:
                continue
            logger.info(start_message)
            df = create_missing_features(
                dataframe=df,
                unique_keys={feature_name: id_column},
                # keep only the optional columns present in the GTF
                extra_columns={
                    feature_name: optional_columns.intersection(gtf_columns),
                },
                missing_value="")
            logger.info("Done.")

        return df
Exemplo n.º 3
0
    def _load_full_dataframe_from_gtf(self):
        """
        Read the entire GTF file at ``self.gtf_path`` into a DataFrame.

        If the parsed data contains no "gene" or no "transcript" rows,
        they are added with ``create_missing_features`` before returning.
        """
        print("Reading GTF from %s" % self.gtf_path)
        converters = {
            "seqname": normalize_chromosome,
            "strand": normalize_strand,
        }
        df = read_gtf_as_dataframe(
            self.gtf_path,
            column_converters=converters,
            infer_biotype_column=True)

        present_features = set(df["feature"])
        available_columns = set(df.keys())

        # Older Ensembl releases have neither "gene" nor "transcript" rows.
        # Both checks use the feature set of the file as parsed.
        lacks_gene_rows = "gene" not in present_features
        lacks_transcript_rows = "transcript" not in present_features

        if lacks_gene_rows:
            # Copy 'gene_name' / 'gene_biotype' onto the rebuilt rows only
            # when the GTF actually has those columns.
            gene_extras = {"gene_name", "gene_biotype"} & available_columns
            df = create_missing_features(
                dataframe=df,
                unique_keys={"gene": "gene_id"},
                extra_columns={"gene": gene_extras},
                missing_value="")

        if lacks_transcript_rows:
            transcript_extras = {
                "gene_id",
                "gene_name",
                "gene_biotype",
                "transcript_name",
                "transcript_biotype",
                "protein_id",
            } & available_columns
            df = create_missing_features(
                dataframe=df,
                unique_keys={"transcript": "transcript_id"},
                extra_columns={"transcript": transcript_extras},
                missing_value="")

        return df
Exemplo n.º 4
0
    def _load_full_dataframe_from_gtf(self):
        """
        Load the GTF file at ``self.gtf_path`` as a DataFrame, adding
        "gene" and "transcript" rows when the file does not contain them.
        """
        df = read_gtf_as_dataframe(
            self.gtf_path,
            column_converters={
                "seqname": normalize_chromosome,
                "strand": normalize_strand,
            },
            infer_biotype_column=True)

        features_in_file = set(df["feature"])
        gtf_columns = set(df.keys())

        def rebuild(frame, feature_name, id_column, optional_columns):
            # Add rows for one feature type, copying over only those
            # optional columns that exist in the parsed GTF.
            return create_missing_features(
                dataframe=frame,
                unique_keys={feature_name: id_column},
                extra_columns={
                    feature_name: optional_columns.intersection(gtf_columns),
                },
                missing_value="")

        # Older Ensembl releases come without "gene" / "transcript" rows.
        if "gene" not in features_in_file:
            df = rebuild(df, "gene", "gene_id", {"gene_name", "gene_biotype"})

        if "transcript" not in features_in_file:
            df = rebuild(
                df,
                "transcript",
                "transcript_id",
                {
                    "gene_id",
                    "gene_name",
                    "gene_biotype",
                    "transcript_name",
                    "transcript_biotype",
                    "protein_id",
                })

        return df
def test_create_missing_features():
    """
    Check that create_missing_features adds "gene" and "transcript" rows
    to a GTF DataFrame that starts out without them.
    """
    original_features = set(GTF_DATAFRAME["feature"])
    assert "gene" not in original_features, \
        "Original GTF should not contain gene feature"
    assert "transcript" not in original_features, \
        "Original GTF should not contain transcript feature"

    key_columns = {
        "gene": "gene_id",
        "transcript": "transcript_id"
    }
    columns_to_copy = {
        "gene": {"gene_name"},
        "transcript": {"gene_id", "gene_name", "transcript_name"},
    }
    expanded = create_missing_features(
        GTF_DATAFRAME,
        unique_keys=key_columns,
        extra_columns=columns_to_copy)
    _check_expanded_dataframe(expanded)
def test_create_missing_features():
    """
    The fixture GTF has no "gene" or "transcript" rows; after calling
    create_missing_features the result is validated by
    _check_expanded_dataframe.
    """
    features_before = set(GTF_DATAFRAME["feature"])
    assert "gene" not in features_before, \
        "Original GTF should not contain gene feature"
    assert "transcript" not in features_before, \
        "Original GTF should not contain transcript feature"

    # feature type -> column whose unique values identify one row each
    id_column_by_feature = dict(gene="gene_id", transcript="transcript_id")
    # feature type -> additional columns to carry onto the new rows
    extras_by_feature = dict(
        gene={"gene_name"},
        transcript={"gene_id", "gene_name", "transcript_name"},
    )
    df_with_new_rows = create_missing_features(
        GTF_DATAFRAME,
        unique_keys=id_column_by_feature,
        extra_columns=extras_by_feature)
    _check_expanded_dataframe(df_with_new_rows)
def test_create_missing_features_identity():
    """
    With an empty mapping of unique keys, the result must have as many
    rows as the input DataFrame.
    """
    unchanged = create_missing_features(GTF_DATAFRAME, {})
    expected_rows = len(GTF_DATAFRAME)
    actual_rows = len(unchanged)
    assert expected_rows == actual_rows, \
        "GTF DataFrames should be same size"
def test_create_missing_features_identity():
    """
    Calling create_missing_features with no features to create should
    leave the number of rows unchanged.
    """
    no_features_requested = {}
    result = create_missing_features(GTF_DATAFRAME, no_features_requested)
    input_size, output_size = len(GTF_DATAFRAME), len(result)
    assert input_size == output_size, \
        "GTF DataFrames should be same size"