예제 #1
0
def test_converter_convert_list():
    """Convert several Ensembl gene IDs at once and compare with the expected symbols."""
    # raw Ensembl ID -> gene symbol expected after conversion (order matters)
    expected = {
        'ENSG00000000003.14': 'TSPAN6',
        'ENSG00000000005.5': 'TNMD',
        'ENSG00000000419.12': 'DPM1',
    }

    # the raw IDs are passed through clean_ensembl_ids before conversion
    prepared = convert.clean_ensembl_ids(list(expected))
    id_converter = convert.IDConverter('ensembl_gene_id', 'symbol')
    assert id_converter.convert_list(prepared) == list(expected.values())
예제 #2
0
def test_converter_convert():
    """Convert a single Ensembl gene ID and compare with the expected symbol."""
    raw_id, expected_symbol = 'ENSG00000000003.14', 'TSPAN6'

    # the raw ID is passed through clean_ensembl_id before conversion
    prepared = convert.clean_ensembl_id(raw_id)
    id_converter = convert.IDConverter('ensembl_gene_id', 'symbol')
    result = id_converter.convert(prepared)
    assert result == expected_symbol
예제 #3
0
def test_converter_all_targets():
    """Run one Ensembl gene ID through a converter for every id type in potential_ids.

    Nothing is asserted about the converted values; the test only checks that
    building each converter and calling convert completes without raising.
    """
    source_type = 'ensembl_gene_id'
    every_target = convert.IDConverter.potential_ids
    prepared = convert.clean_ensembl_id('ENSG00000000003.14')

    for target_type in every_target:
        convert.IDConverter(source_type, target_type).convert(prepared)
예제 #4
0
def make_godict(gofile, force=False):
    """
    Parses the Gene Ontology file and creates a dictionary that is easier
    to work with. Saves the dictionary as a json file.

    The ontology file is split into groups of lines, each beginning at a
    line that starts with "id:". Groups matching the obsolete pattern, and
    groups with no id match, are discarded; the rest are passed to
    parse_group to fill the dictionary. Children are then derived from each
    term's parents, and genes from ANNOTATIONFILE are attached to their GO
    term under the annotation's evidence code. Nothing is done if OUTPUTFILE
    already exists and force is false.

    Notes:

        Tab-separated fields read from each annotation line (zero-based):

            database id (converted from 'uniprot_ids'): field 1
            qualifier: field 3
            GO term id: field 4
            GO evidence code: field 6

        Experiment:
            Inferred from Experiment (EXP)
            Inferred from Direct Assay (IDA)
            Inferred from Physical Interaction (IPI)
            Inferred from Mutant Phenotype (IMP)
            Inferred from Genetic Interaction (IGI)
            Inferred from Expression Pattern (IEP)

        Computational:
            Inferred from Sequence or structural Similarity (ISS)
            Inferred from Sequence Orthology (ISO)
            Inferred from Sequence Alignment (ISA)
            Inferred from Sequence Model (ISM)
            Inferred from Genomic Context (IGC)
            Inferred from Biological aspect of Ancestor (IBA)
            Inferred from Biological aspect of Descendant (IBD)
            Inferred from Key Residues (IKR)
            Inferred from Rapid Divergence(IRD)
            Inferred from Reviewed Computational Analysis (RCA)

        Literature:
            Traceable Author Statement (TAS)
            Non-traceable Author Statement (NAS)

        Other:
            Inferred by Curator (IC)
            No biological Data available (ND) evidence code
            Inferred from Electronic Annotation (IEA)

    Args:
        gofile (str): path to the gene ontology file
        force (optional; bool): overwrite the json file if true

    Returns:
        None

    """
    # check if the outputfile already exists; do this before building the
    # converter so that no conversion setup is done when there is no work
    if not force and os.path.exists(OUTPUTFILE):
        return

    from genemunge import convert
    converter = convert.IDConverter('uniprot_ids', 'ensembl_gene_id')

    # id: {name, namespace, def, parents, children, genes}
    # connections (parent/child): 'is_a' or 'part_of'
    # ignore if 'is_obsolete: true'

    # read in the ontology file
    with open(gofile, "r") as go:
        unparsed = [line.rstrip() for line in go]

    # find the indices marking the beginning of each term
    indices = [i for i, x in enumerate(unparsed) if begins_with_pattern(x, "id:")]

    # group the terms: each group runs from one "id:" line up to the next,
    # and the final group runs to the end of the file so it is not dropped
    group_ends = indices[1:] + [len(unparsed)]
    grouped = [unparsed[start:end] for start, end in zip(indices, group_ends)]

    # get rid of obsolete terms
    not_obsolete = [g for g in grouped if first_match(g, obsolete) is None]

    # get rid of any term that doesn't have ids
    has_id = [g for g in not_obsolete if first_match(g, go_id) is not None]

    # create the go dictionary
    godict = {}
    for group in has_id:
        parse_group(group, godict)

    # add the children terms
    for term in godict:
        parents = godict[term]['parents']
        for p in parents:
            if term not in godict[p]['children']:
                godict[p]['children'] += [term]

    # add the annotations
    with gzip.open(ANNOTATIONFILE, 'rb') as annotfile:
        for raw_line in annotfile:
            line = raw_line.decode('utf-8')

            # skip blank lines and comments (comment lines start with '!')
            if not line.strip() or line[0] == '!':
                continue

            parsed = line.strip().split('\t')
            database_id = parsed[1]
            qualifier = parsed[3]
            go_term = parsed[4]
            evidence = parsed[6]

            # what to do about colocalizes_with and contributes_to?
            if 'NOT' in qualifier:
                continue

            try:
                ensembl = converter.convert(database_id)
                # add the identifier if it is not NaN
                # (NaN is the only value that does not compare equal to itself)
                if ensembl == ensembl:
                    godict[go_term]['genes'][evidence] += [ensembl]
            except KeyError:
                # we have filtered out obsolete go terms
                # therefore, we have to catch this exception
                pass

    # write to the file
    with open(OUTPUTFILE, "w") as outfile:
        json.dump(godict, outfile)
예제 #5
0
def test_converter_construct():
    """Check that an IDConverter from 'symbol' to 'name' can be built without raising."""
    convert.IDConverter('symbol', 'name')