示例#1
0
def get_gene_name_of_transcript_id(transcript_id: str,
                                   ensembl: pyensembl.Genome,
                                   raise_on_error: bool = False):
    """ Look up the gene name (symbol) belonging to a transcript id.

    This is a lenient wrapper around ensembl.gene_name_of_transcript_id:
    when that lookup raises a ValueError, this function can log a warning
    and return None rather than propagate the exception.

    Parameters
    ----------
    transcript_id: string
        The transcript identifier (e.g., "ENSMUST00000035194")

    ensembl: pyensembl.Genome
        The annotation database used for the lookup

    raise_on_error: bool
        If True, a failed lookup raises a ValueError (chained to the
        original one). If False (the default), a warning is logged instead.

    Returns
    -------
    gene_name: string
        The gene name (also called gene symbol, e.g., "Mapkapk3")

    --- OR ---

    None, if the lookup failed and raise_on_error is False

    Raises
    ------
    ValueError
        If the lookup failed and raise_on_error is True
    """
    try:
        return ensembl.gene_name_of_transcript_id(transcript_id)
    except ValueError as lookup_error:
        warning_text = (
            "[pyensembl_utils.get_gene_name_of_transcript_id]: could not "
            "find match for transcript id: {}".format(transcript_id)
        )

        # Lenient mode: report the problem and signal "not found" with None.
        if not raise_on_error:
            logger.warning(warning_text)
            return None

        # Strict mode: re-raise with our message, keeping the original cause.
        raise ValueError(warning_text) from lookup_error
示例#2
0
def get_gene_ids_of_transcript_id(transcript_id: str,
                                  ensembl: pyensembl.Genome,
                                  raise_on_error: bool = False):
    """ Extract all gene ids associated with the given transcript.

    The transcript id is first resolved to its gene name with
    ensembl.gene_name_of_transcript_id; the gene ids are then collected with
    ensembl.gene_ids_of_gene_name. Consequently, one record is produced for
    every gene id returned for that gene name.

    Parameters
    ----------
    transcript_id: string
        The transcript identifier

    ensembl: pyensembl.Genome
        The annotations

    raise_on_error: bool
        Whether to raise an exception (True) or log a warning and return
        None (False) if either lookup raises a ValueError

    Returns
    -------
    transcript_gene_ids: list of dicts
        One record per gene id, suitable for building a data frame which maps
        between transcripts and genes. The keys of each record are:

            transcript_id
            gene_id

    --- OR ---

    None, if a lookup failed and raise_on_error is False

    Raises
    ------
    ValueError
        If a lookup failed and raise_on_error is True. The original exception
        is attached as the cause.
    """
    try:
        gene_name = ensembl.gene_name_of_transcript_id(transcript_id)
        gene_ids = ensembl.gene_ids_of_gene_name(gene_name)
    except ValueError as ve:
        # A failure in either lookup is reported against the transcript id.
        msg = ("[pyensembl_utils.get_gene_ids_of_transcript_id]: could not "
               "find transcript id in database: {}".format(transcript_id))
        if raise_on_error:
            raise ValueError(msg) from ve

        logger.warning(msg)
        return None

    return [{'transcript_id': transcript_id, 'gene_id': g} for g in gene_ids]