예제 #1
0
def run_phenolog(ont, aset, args):
    """
    Like run_enrichment_test, but runs one enrichment test per annotated class.

    Every node of ``aset.ontology`` that has more than two associated subjects
    is used as a sample gene set; that set is tested for enrichment against a
    second association set built from ``args.resource2`` (ontology handle)
    and ``args.file2`` (association file). Each result row is printed.

    Args:
        ont: ontology used to look up labels for the nodes of ``aset``.
        aset: primary association set whose classes define the gene sets.
        args: parsed command-line namespace; ``resource2`` and ``file2`` are read.

    Returns:
        None. Results are written to stdout; the function returns early
        (after logging an error) when the two sets share fewer than 2 subjects.
    """
    ofactory = OntologyFactory()
    ont2 = ofactory.create(args.resource2)

    afactory = AssociationSetFactory()
    aset2 = afactory.create(ontology=ont2, file=args.file2)

    # only test for genes (or other subjects of statements) in common
    # NOTE(review): `common` is only used for this sanity check; the samples
    # and the background below are not restricted to it.
    common = set(aset.subjects).intersection(aset2.subjects)
    num_common = len(common)
    # The original format string contained a stray "\{" (an invalid escape
    # sequence that also printed a literal backslash).
    logging.info("Genes in common between two KBs: {}/{} = {}".format(
        len(aset.subjects), len(aset2.subjects), num_common))
    if num_common < 2:
        logging.error("TOO FEW")
        return None
    for n in aset.ontology.nodes():
        nl = ont.label(n, id_if_null=True)
        genes = aset.query([n])
        num_genes = len(genes)
        # Classes with very few genes are skipped.
        if num_genes > 2:
            logging.info("BASE: {} {} num={}".format(n, nl, num_genes))
            enr = aset2.enrichment_test(subjects=genes,
                                        background=aset2.subjects,
                                        labels=True)
            for r in enr:
                print("{:8.3g} {} {:20s} <-> {} {:20s}".format(
                    r['p'], n, nl, r['c'], str(r['n'])))
    def get(self):
        """
        Summary statistics for objects associated

        Runs an enrichment test for the requested subjects over gene
        associations of the requested object category and taxon.

        Request arguments read: ``object_category``, ``ontology``, ``taxon``,
        ``max_p_value``, ``subject`` and ``background``.

        Returns:
            dict with a single ``'results'`` key holding the enrichment rows.
        """
        args = parser.parse_args()

        ocat = args.get('object_category')
        ontid = args.get('ontology')
        if ontid is None:
            # No ontology given: pick a default from the object category.
            if ocat == 'function':
                ontid = 'go'
            elif ocat == 'phenotype':
                # TODO: other phenotype ontologies
                ontid = 'hp'

        print("Loading: {}".format(ontid))
        ont = get_ontology(ontid)
        taxid = args.get('taxon')
        max_p_value = float(args.max_p_value)

        subjects = args.get('subject')
        background = args.get('background')
        afactory = AssociationSetFactory()
        aset = afactory.create(ontology=ont,
                               subject_category='gene',
                               object_category=ocat,
                               taxon=taxid)
        # The background was previously parsed but never passed on, so a
        # caller-supplied background was silently ignored. An absent/empty
        # value is normalised to None so the test keeps its default background.
        enr = aset.enrichment_test(subjects=subjects,
                                   background=background or None,
                                   threshold=max_p_value,
                                   labels=True)
        return {'results': enr}
예제 #3
0
def test_create_from_file_no_fmt():
    """
    Load the POMBASE association file with ``fmt=None`` and check that
    more than 100 subjects are present.
    """
    go_ontology = OntologyFactory().create('go')
    factory = AssociationSetFactory()
    associations = factory.create(ontology=go_ontology, fmt=None, file=POMBASE)
    subject_ids = associations.subjects
    print("SUBJS: {}".format(subject_ids))
    assert len(subject_ids) > 100
예제 #4
0
def test_remote_disease():
    """
    Factory test: build disease-to-phenotype associations for HUMAN over the
    'doid' ontology and print the associations found for PD.
    """
    ontology_factory = OntologyFactory()
    assoc_factory = AssociationSetFactory()
    doid = ontology_factory.create('doid')
    create_kwargs = {
        'ontology': doid,
        'subject_category': 'disease',
        'object_category': 'phenotype',
        'taxon': HUMAN,
    }
    associations = assoc_factory.create(**create_kwargs)

    pd_assocs = associations.query_associations([PD])
    print("Gene Assocs to PD: {} {}".format(pd_assocs, len(pd_assocs)))
    def load_associations(self, ontology_name:str=None, subject_category:str=None, object_category:str=None, evidence=None, taxon:str=None, relation=None, file:Union[str, TextIO]=None, fmt:str=None, skim:bool=False) -> None:
        """
        Create an ontology and an association set and store the latter on
        ``self.associations``.

        All parameters except ``ontology_name`` are forwarded unchanged as
        keyword arguments to ``AssociationSetFactory.create``.
        """
        ontology_factory = OntologyFactory()
        assoc_factory = AssociationSetFactory()

        # NOTE(review): subject_category is passed as the second positional
        # argument of OntologyFactory.create -- confirm this is intended.
        loaded_ontology = ontology_factory.create(ontology_name, subject_category)

        create_kwargs = {
            'ontology': loaded_ontology,
            'subject_category': subject_category,
            'object_category': object_category,
            'evidence': evidence,
            'taxon': taxon,
            'relation': relation,
            'file': file,
            'fmt': fmt,
            'skim': skim,
        }
        self.associations = assoc_factory.create(**create_kwargs)
예제 #6
0
def test_remote_go():
    """
    Factory test: query mouse gene-function associations over the GO
    sub-ontology (subClassOf + PART_OF) and check that the TF / nucleus
    query results and the TF enrichment for NUCLEUS are consistent.
    """
    ontology_factory = OntologyFactory()
    assoc_factory = AssociationSetFactory()
    go_subset = ontology_factory.create('go').subontology(
        relations=['subClassOf', PART_OF])
    aset = assoc_factory.create(ontology=go_subset,
                                subject_category='gene',
                                object_category='function',
                                taxon=MOUSE)

    # Genes annotated to the transcription factor class.
    tf_genes = aset.query([TRANSCRIPTION_FACTOR], [])
    print("Mouse genes annotated to TF: {} {}".format(tf_genes, len(tf_genes)))
    for gene in tf_genes:
        print("  Gene: {} {}".format(gene, aset.label(gene)))

    # Genes annotated to nucleus.
    nucleus_genes = aset.query([NUCLEUS], [])
    print("Mouse genes annotated to nucleus: {} {}".format(
        nucleus_genes, len(nucleus_genes)))
    assert len(nucleus_genes) > 100

    # Genes annotated to both classes.
    nucleus_tf_genes = aset.query([TRANSCRIPTION_FACTOR, NUCLEUS], [])
    print("Mouse TF genes annotated to nucleus: {} {}".format(
        nucleus_tf_genes, len(nucleus_tf_genes)))
    assert len(nucleus_tf_genes) > 100
    assert len(nucleus_tf_genes) < len(nucleus_genes)

    # Genes annotated to nucleus with TF as a negative term.
    nucleus_non_tf_genes = aset.query([NUCLEUS], [TRANSCRIPTION_FACTOR])
    print("Mouse non-TF genes annotated to nucleus: {} {}".format(
        nucleus_non_tf_genes, len(nucleus_non_tf_genes)))
    assert len(nucleus_non_tf_genes) > 100
    assert len(nucleus_non_tf_genes) < len(nucleus_genes)
    # The positive and negative partitions must add up to the whole.
    assert (len(nucleus_tf_genes) + len(nucleus_non_tf_genes)
            == len(nucleus_genes))

    enrichment = aset.enrichment_test(subjects=tf_genes, labels=True)
    print("ENRICHMENT (tf): {}".format(enrichment))
    # Exactly one result row is expected for NUCLEUS (unpacking fails otherwise).
    [nucleus_hit] = [row for row in enrichment if row['c'] == NUCLEUS]
    print("ENRICHMENT (tf) for NUCLEUS: {}".format(nucleus_hit))
    assert nucleus_hit['p'] < 0.00001
예제 #7
0
def _parse_bool(value):
    """Translate a command-line token ('true', 'no', '1', ...) into a bool.

    ``type=bool`` does not work with argparse: ``bool('False')`` is ``True``
    because every non-empty string is truthy. This converter keeps the
    "option takes a value" interface while honouring the value given.
    """
    token = str(value).strip().lower()
    if token in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if token in ('', '0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


def main():
    """
    Wrapper for OGR Assocs

    Parses the command line, builds the ontology (``--resource``) and the
    association set (from ``--assocfile``, or remotely from ``--taxon`` plus
    ``--category``), then dispatches to the handler of the chosen
    sub-command as ``handler(ont, aset, args)``.

    Raises:
        ValueError: if neither an association file nor both taxon and
            category are supplied.
        SystemExit: via argparse, on invalid arguments or a missing
            sub-command.
    """
    parser = argparse.ArgumentParser(
        description='Wrapper for obographs assocmodel library'
        """
                                                 By default, ontologies and assocs are cached locally and synced from a remote sparql endpoint
                                                 """,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-r', '--resource', type=str, required=False,
                        help='Name of ontology')
    parser.add_argument('-f', '--assocfile', type=str, required=False,
                        help='Name of input file for associations')
    parser.add_argument(
        '--assocformat', type=str, default='gaf', required=False,
        help='Format of association file, if passed (default: gaf)')
    parser.add_argument('-o', '--outfile', type=str, required=False,
                        help='Path to output file')
    parser.add_argument('-t', '--to', type=str, required=False,
                        help='Output to (tree, dot, ...)')
    parser.add_argument('-d', '--direction', type=str, default='u',
                        required=False,
                        help='u = up, d = down, ud = up and down')
    parser.add_argument('-e', '--evidence', type=str, required=False,
                        help='ECO class')
    parser.add_argument('-p', '--properties', nargs='*', type=str,
                        required=False, help='Properties')
    parser.add_argument('-P', '--plot', type=_parse_bool, default=False,
                        help='if set, plot output (requires plotly)')
    parser.add_argument('-y', '--yamlconfig', type=str, required=False,
                        help='Path to setup/configuration yaml file')
    parser.add_argument('-S', '--slim', type=str, default='', required=False,
                        help='Slim type. m=minimal')
    parser.add_argument('-c', '--container_properties', nargs='*', type=str,
                        required=False, help='Properties to nest in graph')
    parser.add_argument('-C', '--category', nargs=2, type=str, required=False,
                        help='category tuple (SUBJECT OBJECT)')
    parser.add_argument('-T', '--taxon', type=str, required=False,
                        help='Taxon of associations')
    parser.add_argument('-v', '--verbosity', default=0, action='count',
                        help='Increase output verbosity')

    subparsers = parser.add_subparsers(dest='subcommand',
                                       help='sub-command help')

    # EXTRACT ONTOLOGY
    parser_n = subparsers.add_parser(
        'subontology',
        help=
        'Extract sub-ontology, include only annotated nodes or their descendants'
    )
    parser_n.add_argument('-M', '--minimal', dest='minimal',
                          action='store_true', default=False,
                          help='If set, remove non-MRCA nodes')
    parser_n.set_defaults(function=extract_ontology)

    # ENRICHMENT
    parser_n = subparsers.add_parser(
        'enrichment',
        help=
        'Perform an enrichment test over a sample set of annotated entities')
    parser_n.add_argument(
        '-q', '--query', type=str,
        help='query all genes for this class an use as subject')
    parser_n.add_argument('-H', '--hypotheses', nargs='*',
                          help='list of classes to test against')
    parser_n.add_argument(
        '-s', '--sample_file', type=str,
        help='file containing list of gene IDs in sample set')
    parser_n.add_argument(
        '-b', '--background_file', type=str,
        help='file containing list of gene IDs in background set')
    parser_n.add_argument('-t', '--threshold', type=float,
                          help='p-value threshold')
    parser_n.add_argument('sample_ids', nargs='*',
                          help='list of gene IDs in sample set')
    parser_n.set_defaults(function=run_enrichment_test)

    # PHENOLOG
    parser_n = subparsers.add_parser(
        'phenolog',
        help=
        'Perform multiple enrichment tests, using a second ontology and assoc set to build gene sets'
    )
    # The two help texts were swapped in the original: resource2 is the
    # ontology handle and file2 is the association file (see run_phenolog).
    parser_n.add_argument('-R', '--resource2', type=str, required=True,
                          help='handle for second ontology')
    parser_n.add_argument('-F', '--file2', type=str, required=True,
                          help='path to second GAF')
    parser_n.set_defaults(function=run_phenolog)

    # QUERY
    parser_n = subparsers.add_parser(
        'query',
        help=
        'Query for entities (e.g. genes) based on positive and negative terms')
    parser_n.add_argument('-q', '--query', nargs='*', help='positive classes')
    parser_n.add_argument('-N', '--negative', type=str,
                          help='negative classes')
    parser_n.set_defaults(function=run_query)

    # QUERY ASSOCIATIONS
    parser_n = subparsers.add_parser(
        'associations',
        help='Query for associations for a set of entities (e.g. genes)')
    parser_n.add_argument('subjects', nargs='*', help='subject ids')
    parser_n.add_argument('-D', '--dendrogram', type=_parse_bool,
                          default=False)
    parser_n.set_defaults(function=run_query_associations)

    # INTERSECTIONS
    parser_n = subparsers.add_parser('intersections',
                                     help='Query intersections')
    parser_n.add_argument('-X', '--xterms', nargs='*', help='x classes')
    parser_n.add_argument('-Y', '--yterms', nargs='*', help='y classes')
    parser_n.add_argument('--useids', type=_parse_bool, default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('terms', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_intersections)

    # INTERSECTION DENDROGRAM (TODO: merge into previous?)
    parser_n = subparsers.add_parser('intersection-dendrogram',
                                     help='Plot dendrogram from intersections')
    parser_n.add_argument('-X', '--xterms', nargs='*', help='x classes')
    parser_n.add_argument('-Y', '--yterms', nargs='*', help='y classes')
    parser_n.add_argument('--useids', type=_parse_bool, default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('terms', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_term_intersection_dendrogram)

    # SIMILARITY MATRIX (may move to another module)
    parser_n = subparsers.add_parser(
        'simmatrix', help='Plot dendrogram for similarities between subjects')
    parser_n.add_argument('-X', '--xsubjects', nargs='*', help='x subjects')
    parser_n.add_argument('-Y', '--ysubjects', nargs='*', help='y subjects')
    parser_n.add_argument('--useids', type=_parse_bool, default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('subjects', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_simmatrix)

    args = parser.parse_args()

    # Sub-commands are optional for argparse, so `function` may be missing.
    # Fail with a usage message now instead of an AttributeError after the
    # ontology and associations have already been loaded.
    func = getattr(args, 'function', None)
    if func is None:
        parser.error("a sub-command is required")

    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARNING)

    if not args.assocfile:
        if not args.taxon or not args.category:
            raise ValueError(
                "Must specify EITHER assocfile OR both taxon and category")

    logging.info("Welcome!")

    if args.yamlconfig is not None:
        logging.info("Setting config from: {}".format(args.yamlconfig))
        # note this sets a global:
        # we would not do this outside the context of a standalone script
        from ontobio.config import set_config
        set_config(args.yamlconfig)

    handle = args.resource

    # Ontology Factory
    ofactory = OntologyFactory()
    logging.info("Creating ont object from: {} {}".format(handle, ofactory))
    ont = ofactory.create(handle)
    logging.info("ont: {}".format(ont))

    # 'noiea' is a shorthand that is rewritten to a negated ECO class id.
    # NOTE(review): `evidence` is computed but not passed to either factory
    # call below -- confirm whether it should be.
    evidence = args.evidence
    if evidence is not None and evidence.lower() == 'noiea':
        evidence = "-ECO:0000501"

    # Association Factory
    afactory = AssociationSetFactory()
    aset = None
    if args.assocfile is not None:
        aset = afactory.create_from_file(file=args.assocfile,
                                         fmt=args.assocformat,
                                         ontology=ont)
    else:
        [subject_category, object_category] = args.category
        # create using GO/Monarch services
        aset = afactory.create(ontology=ont,
                               subject_category=subject_category,
                               object_category=object_category,
                               taxon=args.taxon)

    func(ont, aset, args)
예제 #8
0
from ontobio.assoc_factory import AssociationSetFactory



HUMAN = 'NCBITaxon:9606'

# Ontology handle: 'hp' is used below; creating 'mondo' instead was an
# alternative that is currently disabled.

ofactory = OntologyFactory()
afactory = AssociationSetFactory()
print("creating...")
ont = ofactory.create('hp')
aset = afactory.create(
    ontology=ont,
    subject_category='disease',
    object_category='phenotype',
    taxon=HUMAN,
)

# Disabled alternative: build the association set from a local GAF file,
#   aset = afactory.create_from_gaf('my.gaf', ontology=ont)

disease_ids = [
    "DECIPHER:1",
    "DECIPHER:16",
    "OMIM:614696",
    "OMIM:614699",
    "Orphanet:99978",
]
phenotype_ids = [
    "HP:0000007",
    "Orphanet:93299",
    "Orphanet:90794",
]

# Print the annotations of the second id in phenotype_ids.
print("annotations\t{}".format(phenotype_ids[1]))

print(aset.annotations(phenotype_ids[1]))
예제 #9
0
def test_factory():
    """
    Create an association set from ANNFILE in 'hpoa' format over the ONT
    ontology and check that more than 40 subjects are present.
    """
    ontology = OntologyFactory().create(ONT)
    factory = AssociationSetFactory()
    associations = factory.create(ontology=ontology, fmt='hpoa', file=ANNFILE)
    subject_ids = associations.subjects
    print("SUBJS: {}".format(subject_ids))
    assert len(subject_ids) > 40
예제 #10
0
def main():
    """
    Phenologs

    For every class resolved from the positional ``ids`` in ontology 1, the
    genes annotated to that class (gene-phenotype associations) are used as
    the sample of an enrichment test over gene-function associations on
    ontology 2. An optional ``--background`` class restricts the background
    to the genes annotated to it. Results are printed to stdout.

    Exits with status 1 when the background class cannot be resolved to
    exactly one class.
    """
    parser = argparse.ArgumentParser(
        description='Phenologs'
        """
                                                 By default, ontologies are cached locally and synced from a remote sparql endpoint
                                                 """,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-r',
                        '--resource1',
                        type=str,
                        required=False,
                        help='Name of ontology1')
    parser.add_argument('-R',
                        '--resource2',
                        type=str,
                        required=False,
                        help='Name of ontology2')
    parser.add_argument('-T',
                        '--taxon',
                        type=str,
                        default='NCBITaxon:10090',
                        required=False,
                        help='NCBITaxon ID')
    parser.add_argument('-s',
                        '--search',
                        type=str,
                        default='',
                        required=False,
                        help='Search type. p=partial, r=regex')
    parser.add_argument('-b',
                        '--background',
                        type=str,
                        default=None,
                        required=False,
                        help='Class to use for background')
    parser.add_argument('-p',
                        '--pthreshold',
                        type=float,
                        default=0.05,
                        required=False,
                        help='P-value threshold')
    parser.add_argument('-v',
                        '--verbosity',
                        default=0,
                        action='count',
                        help='Increase output verbosity')

    parser.add_argument('ids', nargs='*')

    args = parser.parse_args()

    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    logging.info("Welcome!")

    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont1 = ofactory.create(args.resource1)
    ont2 = ofactory.create(args.resource2)
    logging.info("onts: {} {}".format(ont1, ont2))
    searchp = args.search

    category = 'gene'

    aset1 = afactory.create(ontology=ont1,
                            subject_category=category,
                            object_category='phenotype',
                            taxon=args.taxon)
    aset2 = afactory.create(ontology=ont2,
                            subject_category=category,
                            object_category='function',
                            taxon=args.taxon)

    bg_cls = None
    if args.background is not None:
        bg_ids = resolve(ont1, [args.background], searchp)
        if len(bg_ids) == 0:
            logging.error("Cannnot resolve: '{}' using {} in {}".format(
                args.background, searchp, ont1))
            sys.exit(1)
        elif len(bg_ids) > 1:
            logging.error("Multiple matches: '{}' using {} MATCHES={}".format(
                args.background, searchp, bg_ids))
            sys.exit(1)
        else:
            # Assign first, then log: the original logged before the
            # assignment and therefore always reported "Background: None".
            [bg_cls] = bg_ids
            logging.info("Background: {}".format(bg_cls))

    # `cls_id` rather than `id`, to avoid shadowing the builtin.
    for cls_id in resolve(ont1, args.ids, searchp):

        sample = aset1.query([cls_id], [])
        print("Gene set class:{} Gene set: {}".format(cls_id, sample))
        bg = None
        if bg_cls is not None:
            bg = aset1.query([bg_cls], [])
            print("BACKGROUND SUBJECTS: {}".format(bg))

        rs = aset2.enrichment_test(sample,
                                   bg,
                                   threshold=args.pthreshold,
                                   labels=True)
        print("RESULTS: {} < {}".format(len(rs), args.pthreshold))
        for r in rs:
            print(str(r))
# renderer.write(wd_ontology)
# >> AttributeError: 'EagerWikidataOntology' object has no attribute 'all_logical_definitions'

renderer.write_subgraph(wd_ontology, nodes, query_ids=qids)

# Get GO terms
# The output file is opened at the same point as before (so a missing
# ./output directory still fails early), but inside a `with` block so that
# it is flushed and closed even if one of the queries below raises.
with open('./output/go-terms.tsv', 'w') as outfile:

    # NOTE(review): the variable is named `ptsd` but the search term is
    # 'Sickle Cell Anemia'; the `proteins` result of this first query is
    # overwritten in the loop below before being used.
    [ptsd] = wd_ontology.search('Sickle Cell Anemia')
    proteins = wd.canned_query('disease2protein', ptsd)

    go = onto_factory.create('go')

    afactory = AssociationSetFactory()
    aset = afactory.create(ontology=go,
                           subject_category='gene',
                           object_category='function',
                           taxon='NCBITaxon:9606')

    # For every ontology node, collect the annotations of the proteins
    # returned for it and write one "id<TAB>label" row per annotation.
    for n in wd_ontology.nodes():
        proteins = wd.canned_query('disease2protein', n)
        anns = [a for p in proteins for a in aset.annotations(p)]
        if len(anns) > 0:
            print("{} {}".format(n, wd_ontology.label(n)))
            for a in anns:
                outfile.write("{}\t{}\n".format(a, go.label(a)))

# Endpoints
# Module-level service URLs. Going by the hostnames, the two SciGraph
# constants point at "-dev" instances and GOLR_URL at a Solr "select" handler.
SCIGRAPH_ONTOLOGY = 'https://scigraph-ontology-dev.monarchinitiative.org/scigraph/'
SCIGRAPH_DATA = 'https://scigraph-data-dev.monarchinitiative.org/scigraph/'
GOLR_URL = 'https://solr.monarchinitiative.org/solr/golr/select'
class GenericSimilarity(object):
    """
    Jaccard similarity search between input genes and the subjects of an
    association set.

    Typical use: set ``ont`` to an ontology handle, call
    ``load_associations(taxon)``, then ``compute_jaccard(...)`` and
    ``sort_results(...)``.
    """

    def __init__(self) -> None:
        self.associations = ''
        # Ontology handle read by load_associations(). It was never
        # initialised here; keep any value already provided (e.g. by a
        # subclass) and otherwise default to the empty string.
        self.ont = getattr(self, 'ont', '')
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def load_associations(self, taxon):
        """
        Load the ontology named by ``self.ont`` and build ``self.associations``.

        For ``self.ont == 'go'`` a GAF file is parsed and only associations
        whose object is a descendant of GO:0008150 or GO:0003674 are kept;
        otherwise gene-phenotype associations are created for the taxon.

        Args:
            taxon: ``'human'`` or ``'mouse'``.

        Raises:
            KeyError: in the non-GO branch, if ``taxon`` is not a known key.
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        p = GafParser()
        url = ''
        if self.ont == 'go':
            # Despite the name, this is the set of descendants of the two terms.
            go_roots = set(self.ontology.descendants('GO:0008150') + self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if taxon == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if taxon == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            assocs = p.parse(url)
            # The unfiltered parse result is kept on the instance.
            self.assocs = assocs
            assocs = [x for x in assocs if 'header' not in x.keys()]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(assocs, ontology=sub_ont)
        else:
            self.associations = \
                self.afactory.create(
                        ontology=self.ontology,
                        subject_category='gene',
                        object_category='phenotype',
                        taxon=taxon_map[taxon]
            )

    @staticmethod
    def jaccard_similarity(aset: "AssociationSet", s1: str, s2: str) -> "Tuple[float, set]":
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        Returns:
            ``(score, shared_terms)``: the jaccard index and the set of
            inferred types common to both subjects; ``(0.0, set())`` when
            neither subject has any inferred type.
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, set()

        shared_terms = a1.intersection(a2)
        return len(shared_terms) / num_union, shared_terms

    def compute_jaccard(self, input_genes: List[dict], lower_bound: float = 0.7) -> List[dict]:
        """
        Score every input gene against every subject of ``self.associations``.

        Args:
            input_genes: dicts with the keys ``'sim_input_curie'`` and
                ``'input_symbol'``.
            lower_bound: only hits with a score strictly above this are kept.

        Returns:
            A list of hit dicts (input id/symbol, hit id/symbol, score and
            shared terms). A gene is never reported as a hit for itself.
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map:
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie
                )
                # Compare by value: `is not` tests object identity, which is
                # True for equal-but-distinct strings (e.g. the slice made by
                # trim_mgi_prefix), so self-matches were never skipped.
                if input_gene != subject_curie:
                    score, shared_terms = \
                        GenericSimilarity.jaccard_similarity(self.associations, input_gene, subject_curie)
                    if float(score) > float(lower_bound):
                        subject_label = self.associations.label(subject_curie)
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene['input_symbol'],
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                            'shared_terms': shared_terms,
                        })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie):
        """
        Drop the leading ``'MGI:'`` of a doubled ``'MGI:MGI:'`` input curie
        when the subject curie does not use the doubled form; otherwise
        return ``input_gene`` unchanged.
        """
        if 'MGI:MGI:' in subject_curie and 'MGI:MGI:' in input_gene:
            return input_gene
        elif 'MGI:MGI:' not in subject_curie and 'MGI:MGI:' in input_gene:
            return input_gene[4:]

        else:
            return input_gene

    @staticmethod
    def sort_results(input_gene_set, results):
        """
        Return ``results`` as a DataFrame sorted by descending score, without
        the hits whose ``hit_id`` appears in ``input_gene_set['hit_id']``.

        An empty result list yields an empty DataFrame (previously a
        ``KeyError`` on the missing ``hit_id`` column).
        """
        results = pd.DataFrame(results)
        if results.empty:
            return results
        annotated_gene_set = input_gene_set['hit_id'].tolist()
        results = \
            results[~results['hit_id'].isin(annotated_gene_set)]. \
            sort_values('score', ascending=False)
        return results
class GenericSimilarity(object):
    """
    Jaccard similarity search between input genes and the subjects of an
    association set.

    Typical use: set ``ont`` to an ontology handle, call
    ``load_associations(taxon)``, then ``compute_jaccard(...)`` and
    ``sort_results(...)``.
    """

    def __init__(self) -> None:
        self.associations = None
        self.ont = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def load_associations(self, taxon) -> None:
        """
        Load the ontology named by ``self.ont`` and build ``self.associations``.

        Args:
            taxon: ``'human'`` or ``'mouse'``.

        Raises:
            KeyError: in the non-GO branch, if ``taxon`` is not a known key.
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        p = GafParser()
        url = ''
        if self.ont == 'go':
            # CX: GO:0008150 is biological_process, GO:0003674 is molecular_function. 
            # CX: These are 2 out of 3 top-level terms in GO ontology. 
            # CX: The excluded term is cellular_component (where gene carries out a molecular function)
            go_roots = set(self.ontology.descendants('GO:0008150') + self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if taxon == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if taxon == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            assocs = p.parse(url)
            # The unfiltered parse result is kept on the instance.
            self.assocs = assocs
            assocs = [x for x in assocs if 'header' not in x.keys()]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(assocs, ontology=sub_ont)
        else:
            self.associations = \
                self.afactory.create(
                        ontology=self.ontology,
                        subject_category='gene',
                        object_category='phenotype',
                        taxon=taxon_map[taxon]
            )

    @staticmethod
    def jaccard_similarity(aset: "AssociationSet", s1: str, s2: str) -> Tuple[float, list]:
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        Returns:
            ``(score, shared_terms)``: the jaccard index and the list of
            inferred types common to both subjects; ``(0.0, [])`` when
            neither subject has any inferred type.
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, list()

        shared_terms = a1.intersection(a2)

        # Note: we need to convert the shared_terms set to a list
        # to avoid later JSON serialization problems
        return len(shared_terms) / num_union, list(shared_terms)

    def compute_jaccard(self, input_genes: List[dict], lower_bound: float = 0.7) -> List[dict]:
        """
        Score every input gene against every subject of ``self.associations``.

        Args:
            input_genes: dicts with the keys ``'sim_input_curie'`` and
                ``'input_symbol'``.
            lower_bound: only hits with a score strictly above this are kept.

        Returns:
            A list of hit dicts (input id/symbol, hit id/symbol, score,
            shared terms and their labels). A gene is never reported as a
            hit for itself.
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map:
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie
                )
                # Compare by value: `is not` tests object identity, which is
                # True for equal-but-distinct strings (e.g. the slice made by
                # trim_mgi_prefix), so self-matches were never skipped here.
                if input_gene != subject_curie:
                    score, shared_terms = \
                        GenericSimilarity.jaccard_similarity(self.associations, input_gene, subject_curie)
                    if float(score) > float(lower_bound):
                        subject_label = self.associations.label(subject_curie)
                        # CX: addition of human-readable labels aka "shared_term_names" 
                        shared_term_names = [self.associations.label(x) for x in shared_terms]
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene['input_symbol'],
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                            'shared_terms': shared_terms,
                            'shared_term_names': shared_term_names
                        })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie) -> str:
        """
        Drop the leading ``'MGI:'`` of a doubled ``'MGI:MGI:'`` input curie
        when the subject curie does not use the doubled form; otherwise
        return ``input_gene`` unchanged.
        """
        if 'MGI:MGI:' in subject_curie and 'MGI:MGI:' in input_gene:
            return input_gene
        elif 'MGI:MGI:' not in subject_curie and 'MGI:MGI:' in input_gene:
            return input_gene[4:]

        else:
            return input_gene

    @staticmethod
    def sort_results(results) -> pd.DataFrame:
        """
        Return ``results`` as a DataFrame sorted by descending score, without
        the rows where a gene is its own hit. Empty input yields an empty
        DataFrame.
        """

        results = pd.DataFrame(results)

        if not results.empty:
            # CX: Some users need to know the scores that input genes have for each other.
            #     replacing code to remove GeneA input = GeneA output results
            results = \
                results[~(results.hit_id == results.input_id)]. \
                sort_values('score', ascending=False)

        return results
예제 #14
0
def test_remote_go_pombase():
    """
    Create an association set from the POMBASE file in 'gaf' format over
    the 'go' ontology and check that more than 100 subjects are present.
    """
    go_ontology = OntologyFactory().create('go')
    factory = AssociationSetFactory()
    associations = factory.create(ontology=go_ontology, fmt='gaf', file=POMBASE)
    subject_ids = associations.subjects
    print("SUBJS: {}".format(subject_ids))
    assert len(subject_ids) > 100
log = logging.getLogger(__name__)

from ontobio.ontol_factory import OntologyFactory
from ontobio.assoc_factory import AssociationSetFactory

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Sample script to open phenotypes')
    # Arguments must be registered before parse_args(); the original added
    # 'input' afterwards, so it was never recognised on the command line.
    # It is optional (nargs='?') so that invocations without it keep working.
    parser.add_argument('input', nargs='?', help='Input')
    args = parser.parse_args()

    ## Create an ontology factory in order to fetch HPO
    ofactory = OntologyFactory()
    ont = ofactory.create("hp")

    ## Create an association factory to get gene-phenotype associations
    afactory = AssociationSetFactory()
    ## Load Associations from Monarch. Note the first time this runs Jupyter will show '*' - be patient
    aset = afactory.create(ontology=ont,
                           subject_category='gene',
                           object_category='phenotype',
                           taxon='NCBITaxon:9606')

    ## Run enrichment tests using all classes in ontology
    # NOTE(review): `gene_ids` is not defined in the visible part of this
    # script and `args.input` is never read -- confirm where the gene ids
    # are meant to come from.
    enr = aset.enrichment_test(subjects=gene_ids,
                               threshold=0.00005,
                               labels=True)

    # Print the first 20 results: p-value, class id and class label.
    for r in enr[:20]:
        print("{:8.3g} {} {:40s}".format(r['p'], r['c'], str(r['n'])))
예제 #16
0
class GenericSimilarity(object):
    """
    Jaccard-similarity search over gene annotations.

    Call retrieve_associations() to load an ontology together with its
    associations, then compute_jaccard() to score input genes against every
    subject of the loaded association set.
    """

    def __init__(self) -> None:
        # Empty-string placeholders until retrieve_associations() runs.
        self.associations = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def retrieve_associations(self, ont, group):
        """
        Load the ontology ``ont`` and the associations for ``group``.

        For ``ont == 'go'`` the group's GAF file is parsed and only the
        annotations to descendants of GO:0008150 (biological_process) or
        GO:0003674 (molecular_function) are kept, over the matching
        sub-ontology. The unfiltered parse result is kept in ``self.assocs``.
        For any other ontology, gene-to-phenotype associations are requested
        from the association factory for the group's taxon.

        :param ont: ontology handle passed to ``OntologyFactory.create``
        :param group: 'human' or 'mouse'
        :raises KeyError: if ``group`` is neither 'human' nor 'mouse'
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        gaf_urls = {
            'mouse': "http://current.geneontology.org/annotations/mgi.gaf.gz",
            'human': "http://current.geneontology.org/annotations/goa_human.gaf.gz",
        }
        self.ontology = OntologyFactory().create(ont)

        if ont == 'go':
            # Resolve the source first so an unknown group fails before any
            # expensive ontology work, mirroring the KeyError raised by
            # taxon_map[group] in the phenotype branch.
            url = gaf_urls[group]
            go_terms = set(
                self.ontology.descendants('GO:0008150') +
                self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_terms)
            # Parse the GAF that belongs to the requested group. Previously a
            # hard-coded local 'goa_human.gaf.gz' was parsed regardless of
            # the group, so 'mouse' silently received human annotations.
            assocs = GafParser().parse(url)
            self.assocs = assocs
            # Skip header records and annotations outside the kept branches.
            assocs = [
                x for x in assocs
                if 'header' not in x and x['object']['id'] in go_terms
            ]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[group])

    def compute_jaccard(self,
                        input_genes: List[dict],
                        lower_bound: float = 0.7) -> List[dict]:
        """
        Score every input gene against every subject in the association set.

        :param input_genes: dicts providing 'sim_input_curie' and
            'input_symbol'
        :param lower_bound: only hits scoring strictly above this are kept
        :return: one dict per hit with keys 'input_id', 'input_symbol',
            'hit_symbol', 'hit_id' and 'score'
        """
        similarities = []
        threshold = float(lower_bound)
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map:
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie)
                # Compare by value: `is not` tests object identity, which is
                # almost always true for two separately created strings and
                # therefore never excluded a gene matched against itself.
                if input_gene == subject_curie:
                    continue
                score = jaccard_similarity(self.associations, input_gene,
                                           subject_curie)
                if float(score) > threshold:
                    similarities.append({
                        'input_id': input_gene,
                        'input_symbol': igene['input_symbol'],
                        'hit_symbol': self.associations.label(subject_curie),
                        'hit_id': subject_curie,
                        'score': score,
                    })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie):
        """
        Align the MGI prefix style of ``input_gene`` with ``subject_curie``.

        When the input contains 'MGI:MGI:' but the subject does not, the
        first four characters of the input are dropped; in every other case
        the input is returned unchanged.
        """
        subject_has_double = 'MGI:MGI:' in subject_curie
        input_has_double = 'MGI:MGI:' in input_gene
        if input_has_double and not subject_has_double:
            return input_gene[4:]
        return input_gene
예제 #17
0
class GenericSimilarity(object):
    # Class level singletons for similarity engines
    _ontology = {}

    # Class level cache for results of Jaccard similarity searches
    _jaccard_similarity_tasks = {}

    @classmethod
    def get_similarity_engine(cls, ontology, taxon):
        """
        Returns a singleton GenericSimilarity instance
        for use in Jaccard similarity computations

        :param ontology: should be 'go', 'hp' or 'mp'
        :param taxon: should be 'human' or 'mouse'
        :return: GenericSimilarity() singleton
        """
        if ontology not in ['go', 'hp', 'mp']:
            raise OntologyServerException(
                "compute_jaccard() ERROR: ontology '" + ontology +
                "' not recognized.")

        if taxon not in ['human', 'mouse']:
            raise OntologyServerException("compute_jaccard() ERROR: taxon '" +
                                          taxon + "' not recognized.")

        if ontology not in cls._ontology:
            cls._ontology[ontology] = {}

        if taxon not in cls._ontology[ontology]:
            cls._ontology[ontology][taxon] = GenericSimilarity(ontology, taxon)

        return cls._ontology[ontology][taxon]

    def __init__(self, ont: str, taxon: str) -> None:
        """
        Record the ontology/taxon pair and load its associations right away.

        :param ont: ontology handle handed to the ontology factory
        :param taxon: 'human' or 'mouse'
        """
        self.ont, self.taxon = ont, taxon
        self.afactory = AssociationSetFactory()
        # Placeholders: load_associations() replaces `ontology` and
        # `associations`, and `assocs` on the 'go' path.
        self.ontology = ''
        self.assocs = ''
        self.associations = None
        self.load_associations()

    def load_associations(self) -> None:
        """
        Build ``self.ontology`` and ``self.associations`` for this engine.

        For the 'go' ontology the taxon's GAF file is parsed, the raw parse
        result is kept in ``self.assocs``, and only annotations to
        descendants of the two GO branches below are turned into the
        association set. For any other ontology, gene-to-phenotype
        associations are requested from the association factory.
        """
        self.ontology = OntologyFactory().create(self.ont)
        gaf_parser = GafParser()

        if self.ont != 'go':
            ncbi_taxon_ids = {
                'human': 'NCBITaxon:9606',
                'mouse': 'NCBITaxon:10090',
            }
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=ncbi_taxon_ids[self.taxon])
            return

        # CX: GO:0008150 is biological_process, GO:0003674 is molecular_function.
        # CX: These are 2 out of 3 top-level terms in GO ontology.
        # CX: The excluded term is cellular_component (where gene carries out a molecular function)
        process_terms = self.ontology.descendants('GO:0008150')
        function_terms = self.ontology.descendants('GO:0003674')
        go_terms = set(process_terms + function_terms)
        go_subontology = self.ontology.subontology(go_terms)

        gaf_urls = {
            'mouse': "http://current.geneontology.org/annotations/mgi.gaf.gz",
            'human': "http://current.geneontology.org/annotations/goa_human.gaf.gz",
        }
        parsed = gaf_parser.parse(gaf_urls.get(self.taxon, ''))
        self.assocs = parsed

        # Skip header records, then keep annotations inside the two branches.
        kept = [
            record for record in parsed
            if 'header' not in record and record['object']['id'] in go_terms
        ]
        self.associations = self.afactory.create_from_assocs(
            kept, ontology=go_subontology)

    @staticmethod
    def jaccard_similarity(aset: AssociationSet, s1: str,
                           s2: str) -> Tuple[float, list]:
        """
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, list()

        shared_terms = a1.intersection(a2)

        # Note: we need to convert the shared_terms set to a list
        # to avoid later JSON serialization problems
        return len(shared_terms) / num_union, list(shared_terms)

    async def compute_jaccard(self,
                              input_genes: List[dict],
                              lower_bound: float = 0.7) -> List[dict]:
        similarities = []
        for index, igene in enumerate(input_genes):
            for subject_curie in self.associations.subject_label_map.keys():
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene.sim_input_curie,
                    subject_curie=subject_curie)
                if input_gene is not subject_curie:
                    score, shared_terms = \
                        GenericSimilarity.jaccard_similarity(self.associations, input_gene, subject_curie)
                    if score > lower_bound:
                        subject_label = self.associations.label(subject_curie)
                        # CX: addition of human-readable labels aka "shared_term_names"
                        shared_term_names = [
                            self.associations.label(x) for x in shared_terms
                        ]
                        similarities.append({
                            'input_id':
                            input_gene,
                            'input_symbol':
                            igene.input_symbol,
                            'hit_symbol':
                            subject_label,
                            'hit_id':
                            subject_curie,
                            'score':
                            score,
                            'shared_terms':
                            shared_terms,
                            'shared_term_names':
                            shared_term_names
                        })
        return similarities

    async def compute_jaccard_task(self, uuid: str, input_genes: List[dict],
                                   lower_bound: float):
        self._jaccard_similarity_tasks[uuid] = asyncio.create_task(
            self.compute_jaccard(input_genes, lower_bound))

    def compute_jaccard_async(self, input_genes: List[dict],
                              lower_bound: float):
        uuid = str(uuid4())
        asyncio.run(self.compute_jaccard_task(uuid, input_genes, lower_bound))
        return uuid

    @classmethod
    def get_jaccard_similarity_result(cls, computation_id: str):

        if computation_id in cls._jaccard_similarity_tasks:

            jaccard_similarity_task = cls._jaccard_similarity_tasks[
                computation_id]

            # Need to check if the result is ready to return, then return it
            if jaccard_similarity_task.done():

                try:
                    result = jaccard_similarity_task.result()

                except CancelledError:
                    raise JaccardSimilarityResultNotFound

                except InvalidStateError:
                    raise JaccardSimilarityComputationError

                return result

            else:
                raise JaccardSimilarityPending
        else:
            raise JaccardSimilarityResultNotFound

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie) -> str:
        if 'MGI:MGI:' in subject_curie and 'MGI:MGI:' in input_gene:
            return input_gene
        elif 'MGI:MGI:' not in subject_curie and 'MGI:MGI:' in input_gene:
            return input_gene[4:]

        else:
            return input_gene