Пример #1
0
    def _process_ontology_data(self):
        """Load the disease ontology and fill ``self.efos`` with EFO entries.

        A new ``OntologyClassReader`` is stored as ``self.disease_ontology``
        and loaded from ``self.efo_uri``. ``DiseaseUtils`` supplies the
        disease phenotypes. One ``EFO`` object is then built per entry of
        ``current_classes`` and stored in ``self.efos`` under the last id of
        the class's first path.
        """
        ontology = OntologyClassReader()
        self.disease_ontology = ontology
        opentargets_ontologyutils.efo.load_open_targets_disease_ontology(
            ontology, self.efo_uri)

        # opentargets_ontologyutils (legacy) iterates over (key, uri) pairs
        indexed_phenotype_uris = enumerate(self.disease_phenotype_uris)

        # get all phenotypes
        disease_phenotypes = DiseaseUtils().get_disease_phenotypes(
            ontology, self.hpo_uri, self.mp_uri, indexed_phenotype_uris)

        definition_key = 'http://purl.obolibrary.org/obo/IAO_0000115'
        synonym_key = 'http://www.ebi.ac.uk/efo/alternative_term'

        for uri, label in ontology.current_classes.items():
            properties = ontology.parse_properties(URIRef(uri))

            # the description is every definition annotation joined together
            definition = (". ".join(properties[definition_key])
                          if definition_key in properties else '')

            synonyms = (properties[synonym_key]
                        if synonym_key in properties else [])

            phenotypes = (disease_phenotypes[uri]['phenotypes']
                          if uri in disease_phenotypes else [])

            class_paths = ontology.classes_paths[uri]

            # first element of every label path, without repeats
            therapeutic_labels = self._remove_duplicates(
                [label_path[0] for label_path in class_paths['labels']])

            efo = EFO(
                code=uri,
                label=label,
                synonyms=synonyms,
                phenotypes=phenotypes,
                path=class_paths['all'],
                path_codes=class_paths['ids'],
                path_labels=class_paths['labels'],
                therapeutic_labels=therapeutic_labels,
                definition=definition)

            # key the entry by the last id of the first path
            efo_id = class_paths['ids'][0][-1]
            if uri in ontology.children:
                efo.children = ontology.children[uri]
            self.efos[efo_id] = efo
Пример #2
0
    def _get_mp_classes(self, mp_uri):
        """Load the Mammalian Phenotype ontology and index its classes.

        A new ``OntologyClassReader`` is stored as ``self.mp_ontology`` and
        loaded from ``mp_uri``. For every entry of ``current_classes`` that
        also has an entry in ``classes_paths``, these lookups are filled:

        * ``self.mps``: short key (last ``/`` segment of the class key with
          ``:`` replaced by ``_``) -> dict with ``label``, ``path`` and
          ``path_codes``
        * ``self.mp_labels``: label -> class key
        * ``self.mp_to_label``: class key -> label
        * ``self.top_levels``: class key -> first element of each path

        Classes without a ``classes_paths`` entry are logged with a warning
        and skipped.

        Args:
            mp_uri: passed through to ``load_mammalian_phenotype_ontology``.
        """
        #load the ontology
        self.mp_ontology = OntologyClassReader()
        opentargets_ontologyutils.mp.load_mammalian_phenotype_ontology(self.mp_ontology, mp_uri)

        #TODO this is a moderately hideous bit of pointless munging, but I don't have time fix it now!

        classes_paths = self.mp_ontology.classes_paths
        # current_classes is not modified inside the loop, so it can be
        # iterated directly without taking a copy first
        for mp_id, label in self.mp_ontology.current_classes.items():
            # check first so nothing is built for classes that get skipped
            if mp_id not in classes_paths:
                self._logger.warning("cannot find paths for %s", mp_id)
                continue

            class_paths = classes_paths[mp_id]
            mp_class = {
                "label": label,
                "path": class_paths['all'],
                "path_codes": class_paths['ids'],
            }

            mp_id_key = mp_id.split("/")[-1].replace(":", "_")
            self.mps[mp_id_key] = mp_class
            self.mp_labels[label] = mp_id
            self.mp_to_label[mp_id] = label

            self.top_levels[mp_id] = [path[0] for path in mp_class["path"]]
Пример #3
0
    def __init__(self, es_hosts, es_index, es_mappings, es_settings, eco_uri,
                 so_uri, workers_write, queue_write):
        """Keep the configuration and create the empty working state.

        Every argument is stored unchanged on the attribute of the same name.
        ``self.ecos`` starts as an empty ``OrderedDict`` and
        ``self.evidence_ontology`` as a new ``OntologyClassReader``.
        """
        # elasticsearch connection, index and its configuration
        self.es_hosts, self.es_index = es_hosts, es_index
        self.es_mappings, self.es_settings = es_mappings, es_settings

        # ontology locations
        self.eco_uri, self.so_uri = eco_uri, so_uri

        # write tuning
        self.workers_write, self.queue_write = workers_write, queue_write

        # working state, filled in later
        self.ecos = OrderedDict()
        self.evidence_ontology = OntologyClassReader()
Пример #4
0
    def _process_ontology_data(self):
        """Load the disease ontology and fill ``self.efos`` with EFO entries.

        A new ``OntologyClassReader`` is stored as ``self.disease_ontology``
        and loaded from ``self.efo_uri``. ``DiseaseUtils`` supplies the
        disease phenotypes. One ``EFO`` object is built for every key of
        ``classes_paths`` and stored in ``self.efos`` under the last id of the
        class's first path. If that id is already present, a warning is logged
        and the first entry is kept.

        Raises:
            RuntimeError: if the last element of a class's first path does
                not carry the same uri as the class itself.
        """
        self.disease_ontology = OntologyClassReader()
        opentargets_ontologyutils.efo.load_open_targets_disease_ontology(self.disease_ontology, self.efo_uri)

        # opentargets_ontologyutils (legacy) iterates over (key, uri) pairs
        disease_phenotype_uris_counter = enumerate(self.disease_phenotype_uris)

        # get all phenotypes
        utils = DiseaseUtils()
        disease_phenotypes = utils.get_disease_phenotypes(self.disease_ontology, self.hpo_uri, self.mp_uri, disease_phenotype_uris_counter)

        definition_uri = 'http://purl.obolibrary.org/obo/IAO_0000115'
        # exact, related (partially overlapping), generic and narrow synonyms;
        # could have http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym,
        # but that is better captured by parent term
        synonym_uris = (
            'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym',
            'http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym',
            'http://www.geneontology.org/formats/oboInOwl#hasSynonym',
            'http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym',
        )

        all_classes_paths = self.disease_ontology.classes_paths
        for uri in all_classes_paths:
            classes_path = all_classes_paths[uri]

            #get the short code form of the uri
            efo_id = classes_path['ids'][0][-1]
            label = classes_path['labels'][0][-1]

            # sanity check: the first path must end at this very class
            path_end_uri = classes_path["all"][0][-1]["uri"]
            if uri != path_end_uri:
                raise RuntimeError('mismatch between uri and classes_path["all"][0][-1]["uri"] %s %s' % (uri, path_end_uri))

            properties = self.disease_ontology.parse_properties(URIRef(uri))

            #create a text block definition/description by joining others together
            definition = ''
            if definition_uri in properties:
                definition = ". ".join(properties[definition_uri])

            #build a set of all the relevant synonyms
            synonyms = set()
            for synonym_uri in synonym_uris:
                if synonym_uri in properties:
                    synonyms.update(properties[synonym_uri])

            phenotypes = []
            if uri in disease_phenotypes:
                phenotypes = disease_phenotypes[uri]['phenotypes']

            # uri comes from classes_paths itself, so no membership check is
            # needed before using it here
            therapeutic_labels = self.disease_ontology.therapeutic_labels[uri]
            therapeutic_uris = self.disease_ontology.therapeutic_uris[uri]
            therapeutic_codes = [all_classes_paths[ta_uri]['ids'][0][-1] for ta_uri in therapeutic_uris]

            efo = EFO(code=uri,
                      label=label,
                      synonyms=synonyms,
                      phenotypes=phenotypes,
                      path=classes_path['all'],
                      path_codes=classes_path['ids'],
                      path_labels=classes_path['labels'],
                      therapeutic_labels=therapeutic_labels,
                      therapeutic_codes=therapeutic_codes,
                      definition=definition
                      )

            if uri in self.disease_ontology.children:
                efo.children = self.disease_ontology.children[uri]

            self.logger.debug("done %s %s %s", efo_id, uri, label)

            # first entry wins when two classes share the same short id
            if efo_id in self.efos:
                self.logger.warning("duplicate %s", efo_id)
                continue
            self.efos[efo_id] = efo
Пример #5
0
class EfoProcess():
    """Build EFO documents from the disease ontology and index them.

    ``process_all`` fills ``self.efos`` from the ontology and then writes the
    collected entries to Elasticsearch. ``qc`` computes summary metrics over
    an existing index.
    """

    # annotation property holding the textual definition(s) of a class
    _DEFINITION_URI = 'http://purl.obolibrary.org/obo/IAO_0000115'

    # exact, related (partially overlapping), generic and narrow synonyms;
    # could have http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym,
    # but that is better captured by parent term
    _SYNONYM_URIS = (
        'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym',
        'http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym',
        'http://www.geneontology.org/formats/oboInOwl#hasSynonym',
        'http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym',
    )

    def __init__(self, es_hosts, es_index, es_doc, es_mappings, es_settings,
                 efo_uri, hpo_uri, mp_uri,
                 disease_phenotype_uris,
                 workers_write, queue_write
                 ):
        """Store the configuration; every argument is kept as an attribute
        of the same name. ``self.efos`` starts as an empty ``OrderedDict``.
        """
        self.es_hosts = es_hosts
        self.es_index = es_index
        self.es_doc = es_doc
        self.es_mappings = es_mappings
        self.es_settings = es_settings
        self.efo_uri = efo_uri
        self.hpo_uri = hpo_uri
        self.mp_uri = mp_uri
        self.disease_phenotype_uris = disease_phenotype_uris
        self.workers_write = workers_write
        self.queue_write = queue_write

        self.efos = OrderedDict()
        self.logger = logging.getLogger(__name__+".EfoProcess")

    def process_all(self, dry_run):
        """Build all EFO entries, then hand them to ``_store_efo``."""
        self._process_ontology_data()
        self._store_efo(dry_run)

    @classmethod
    def _collect_synonyms(cls, properties):
        """Return the set of all values found under the synonym properties.

        Args:
            properties: mapping of property uri -> iterable of values.
        """
        synonyms = set()
        for synonym_uri in cls._SYNONYM_URIS:
            if synonym_uri in properties:
                synonyms.update(properties[synonym_uri])
        return synonyms

    def _process_ontology_data(self):
        """Load the disease ontology and fill ``self.efos`` with EFO entries.

        One ``EFO`` object is built for every key of ``classes_paths`` and
        stored under the last id of the class's first path. If that id is
        already present, a warning is logged and the first entry is kept.

        Raises:
            RuntimeError: if the last element of a class's first path does
                not carry the same uri as the class itself.
        """
        self.disease_ontology = OntologyClassReader()
        opentargets_ontologyutils.efo.load_open_targets_disease_ontology(self.disease_ontology, self.efo_uri)

        # opentargets_ontologyutils (legacy) iterates over (key, uri) pairs
        disease_phenotype_uris_counter = enumerate(self.disease_phenotype_uris)

        # get all phenotypes
        utils = DiseaseUtils()
        disease_phenotypes = utils.get_disease_phenotypes(self.disease_ontology, self.hpo_uri, self.mp_uri, disease_phenotype_uris_counter)

        all_classes_paths = self.disease_ontology.classes_paths
        for uri in all_classes_paths:
            classes_path = all_classes_paths[uri]

            #get the short code form of the uri
            efo_id = classes_path['ids'][0][-1]
            label = classes_path['labels'][0][-1]

            # sanity check: the first path must end at this very class
            path_end_uri = classes_path["all"][0][-1]["uri"]
            if uri != path_end_uri:
                raise RuntimeError('mismatch between uri and classes_path["all"][0][-1]["uri"] %s %s' % (uri, path_end_uri))

            properties = self.disease_ontology.parse_properties(URIRef(uri))

            #create a text block definition/description by joining others together
            definition = ''
            if self._DEFINITION_URI in properties:
                definition = ". ".join(properties[self._DEFINITION_URI])

            synonyms = self._collect_synonyms(properties)

            phenotypes = []
            if uri in disease_phenotypes:
                phenotypes = disease_phenotypes[uri]['phenotypes']

            # uri comes from classes_paths itself, so no membership check is
            # needed before using it here
            therapeutic_labels = self.disease_ontology.therapeutic_labels[uri]
            therapeutic_uris = self.disease_ontology.therapeutic_uris[uri]
            therapeutic_codes = [all_classes_paths[ta_uri]['ids'][0][-1] for ta_uri in therapeutic_uris]

            efo = EFO(code=uri,
                      label=label,
                      synonyms=synonyms,
                      phenotypes=phenotypes,
                      path=classes_path['all'],
                      path_codes=classes_path['ids'],
                      path_labels=classes_path['labels'],
                      therapeutic_labels=therapeutic_labels,
                      therapeutic_codes=therapeutic_codes,
                      definition=definition
                      )

            if uri in self.disease_ontology.children:
                efo.children = self.disease_ontology.children[uri]

            self.logger.debug("done %s %s %s", efo_id, uri, label)

            # first entry wins when two classes share the same short id
            if efo_id in self.efos:
                self.logger.warning("duplicate %s", efo_id)
                continue
            self.efos[efo_id] = efo

    def _store_efo(self, dry_run):
        """Write ``self.efos`` to the Elasticsearch index ``self.es_index``.

        Mappings and settings are read as JSON from ``self.es_mappings`` and
        ``self.es_settings``. On a dry run the index manager is still entered
        but no bulk request is issued.

        Raises:
            RuntimeError: if any document is reported as not indexed.
        """
        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

            #write into elasticsearch
            chunk_size = 1000 #TODO make configurable
            actions = elasticsearch_actions(self.efos.items(), self.es_index, self.es_doc)

            if not dry_run:
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write,
                            chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                            chunk_size=chunk_size)

                # consuming the results is what drives the bulk requests
                failcount = sum(1 for success, _details in results if not success)

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)

    def qc(self, es, index):
        """Run a series of QC tests on EFO elasticsearch index.

        Returns a dictionary of string test names and result objects.
        """
        self.logger.info("Starting QC")
        #number of EFO terms
        efo_term_count = 0

        #top level terms (i.e. categories)
        efo_top_levels = []

        #terms without a description
        efo_missing_description_count = 0

        #loop over all efo terms and calculate the metrics
        #Note: try to avoid doing this more than once!
        for efo_term in Search().using(es).index(index).query(MatchAll()).scan():
            efo_term_count += 1

            #path_labels is a list of lists of all paths to the root
            #top level terms will be those with one list of one item that is itself
            path_labels = efo_term["path_labels"]
            if len(path_labels) == 1 and len(path_labels[0]) == 1:
                efo_top_levels.append(efo_term["label"])

            definition = efo_term["definition"]
            if definition is None or not definition.strip():
                efo_missing_description_count += 1

        #put the metrics into a single dict
        metrics = {
            "efo.count": efo_term_count,
            "efo.top": sorted(efo_top_levels),
            "efo.top.count": len(efo_top_levels),
            "efo.missing_description.count": efo_missing_description_count,
        }

        #return the metrics to the caller so they can write to file or further compare
        self.logger.info("Finished QC")
        return metrics
Пример #6
0
class EfoProcess():
    """Build EFO documents from the disease ontology and store them.

    ``process_all`` fills ``self.efos`` from the ontology and then writes the
    collected entries through ``self.loader``. ``qc`` computes summary metrics
    from the stored diseases.
    """

    def __init__(self, loader, efo_uri, hpo_uri, mp_uri,
                 disease_phenotype_uris):
        """Store the arguments on attributes of the same name.

        ``self.efos`` starts as an empty ``OrderedDict``.
        """
        self.loader = loader
        self.efos = OrderedDict()
        self.logger = logging.getLogger(__name__ + ".EfoProcess")
        self.efo_uri = efo_uri
        self.hpo_uri = hpo_uri
        self.mp_uri = mp_uri
        self.disease_phenotype_uris = disease_phenotype_uris

    def process_all(self, dry_run):
        """Build all EFO entries, then hand them to ``_store_efo``."""
        self._process_ontology_data()
        self._store_efo(dry_run)

    def _process_ontology_data(self):
        """Load the disease ontology and fill ``self.efos`` with EFO entries.

        One ``EFO`` object is built per entry of ``current_classes`` and
        stored under the last id of the class's first path.
        """
        self.disease_ontology = OntologyClassReader()
        opentargets_ontologyutils.efo.load_open_targets_disease_ontology(
            self.disease_ontology, self.efo_uri)

        # opentargets_ontologyutils (legacy) iterates over (key, uri) pairs
        disease_phenotype_uris_counter = enumerate(self.disease_phenotype_uris)

        # get all phenotypes
        utils = DiseaseUtils()
        disease_phenotypes = utils.get_disease_phenotypes(
            self.disease_ontology, self.hpo_uri, self.mp_uri,
            disease_phenotype_uris_counter)

        definition_uri = 'http://purl.obolibrary.org/obo/IAO_0000115'
        synonym_uri = 'http://www.ebi.ac.uk/efo/alternative_term'

        for uri, label in self.disease_ontology.current_classes.items():
            properties = self.disease_ontology.parse_properties(URIRef(uri))

            #create a text block definition/description by joining others together
            definition = ''
            if definition_uri in properties:
                definition = ". ".join(properties[definition_uri])

            synonyms = []
            if synonym_uri in properties:
                synonyms = properties[synonym_uri]

            phenotypes = []
            if uri in disease_phenotypes:
                phenotypes = disease_phenotypes[uri]['phenotypes']

            # look the paths up once instead of on every use below
            classes_path = self.disease_ontology.classes_paths[uri]

            # first element of every label path, without repeats
            therapeutic_labels = self._remove_duplicates(
                [label_path[0] for label_path in classes_path['labels']])

            efo = EFO(
                code=uri,
                label=label,
                synonyms=synonyms,
                phenotypes=phenotypes,
                path=classes_path['all'],
                path_codes=classes_path['ids'],
                path_labels=classes_path['labels'],
                therapeutic_labels=therapeutic_labels,
                definition=definition)

            # key the entry by the last id of the first path
            efo_id = classes_path['ids'][0][-1]
            if uri in self.disease_ontology.children:
                efo.children = self.disease_ontology.children[uri]
            self.efos[efo_id] = efo

    def _remove_duplicates(self, xs):
        """Return the items of ``xs`` as a list with later repeats dropped.

        The first occurrence of each item is kept, in its original position.
        Items are compared by equality, so unhashable items are accepted.
        """
        newlist = []

        for item in xs:
            if item not in newlist:
                newlist.append(item)
        return newlist

    def _store_efo(self, dry_run):
        """Write every entry of ``self.efos`` through ``self.loader``.

        Nothing at all is done on a dry run.
        """
        if dry_run:
            return

        index_name = Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME

        #setup elasticsearch
        self.loader.create_new_index(index_name)
        #need to directly get the versioned index name for this function
        self.loader.prepare_for_bulk_indexing(
            self.loader.get_versioned_index(index_name))

        for efo_id, efo_obj in self.efos.items():
            self.loader.put(
                index_name=index_name,
                doc_type=Const.ELASTICSEARCH_EFO_LABEL_DOC_NAME,
                ID=efo_id,
                body=efo_obj)

        #cleanup elasticsearch
        self.loader.flush_all_and_wait(index_name)
        #restore old pre-load settings
        #note this automatically does all prepared indexes
        self.loader.restore_after_bulk_indexing()

    def qc(self, esquery):
        """Run a series of QC tests on EFO elasticsearch index.

        Returns a dictionary of string test names and result objects.
        """
        self.logger.info("Starting QC")
        #number of EFO terms
        efo_term_count = 0

        #top level terms (i.e. categories)
        efo_top_levels = []

        #terms without a description
        efo_missing_description_count = 0

        #loop over all efo terms and calculate the metrics
        #Note: try to avoid doing this more than once!
        for efo_term in esquery.get_all_diseases():
            efo_term_count += 1

            #path_labels is a list of lists of all paths to the root
            #top level terms will be those with one list of one item that is itself
            path_labels = efo_term["path_labels"]
            if len(path_labels) == 1 and len(path_labels[0]) == 1:
                efo_top_levels.append(efo_term["label"])

            definition = efo_term["definition"]
            if definition is None or not definition.strip():
                efo_missing_description_count += 1

        #put the metrics into a single dict
        metrics = {
            "efo.count": efo_term_count,
            "efo.top": sorted(efo_top_levels),
            "efo.top.count": len(efo_top_levels),
            "efo.missing_description.count": efo_missing_description_count,
        }

        #return the metrics to the caller so they can write to file or further compare
        self.logger.info("Finished QC")
        return metrics
Пример #7
0
 def __init__(self, loader, eco_uri, so_uri):
     """Keep the loader and ontology locations; create empty working state.

     ``self.ecos`` starts as an empty ``OrderedDict`` and
     ``self.evidence_ontology`` as a new ``OntologyClassReader``.
     """
     # collaborators and ontology locations supplied by the caller
     self.loader = loader
     self.eco_uri, self.so_uri = eco_uri, so_uri

     # working state, filled in later
     self.ecos = OrderedDict()
     self.evidence_ontology = OntologyClassReader()