Example #1
class QCRunner(object):
    def __init__(self, es):

        self.es = es
        self.esquery = ESQuery(es)
        self._logger = logging.getLogger(__name__ + ".QCRunner")

    def run_associationQC(self):
        self.run_evidence2associationQC()

    def run_evidence2associationQC(self):
        computed_associations_ids = set(self.esquery.get_all_associations_ids())
        missing_associations_ids = set()
        total_evidence = self.esquery.count_elements_in_index(
            Const.ELASTICSEARCH_DATA_INDEX_NAME + '*')
        self._logger.info('Starting to analyse %i evidence' % total_evidence)
        for as_id in self.esquery.get_all_target_disease_pair_from_evidence():
            if as_id not in computed_associations_ids:
                self._logger.error(
                    'Association id %s was not computed or stored' % as_id)
                missing_associations_ids.add(as_id)

        if missing_associations_ids:
            self._logger.error('%i associations not found' %
                               len(missing_associations_ids))
            self._logger.error('\n'.join(list(missing_associations_ids)))
        else:
            self._logger.info('no missing associations found')
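A minimal usage sketch for this runner; the new_es_client helper and the ES endpoint are assumptions borrowed from the other examples, not part of this snippet.

# Hypothetical usage sketch, assuming new_es_client as in Examples #12 and #20.
import logging

logging.basicConfig(level=logging.INFO)
es = new_es_client(['http://localhost:9200'])  # assumed ES endpoint
runner = QCRunner(es)
# Logs any association id found in the evidence but never computed/stored.
runner.run_associationQC()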
Example #2
    def merge_data(self, genes, loader, r_server, data_config):

        esquery = ESQuery(loader.es)

        try:
            count = esquery.count_elements_in_index(
                Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
        except NotFoundError as ex:
            self._logger.error(
                'no Ensembl index in ES. Skipping. Has the --ensembl step been run? Are you pointing to the correct index? %s'
                % ex)
            raise ex

        for row in esquery.get_all_ensembl_genes():
            if row['id'] in genes:
                gene = genes.get_gene(row['id'])
            else:
                gene = Gene()
            gene.load_ensembl_data(row)
            genes.add_gene(gene)

        self._clean_non_reference_genes(genes)

        self._logger.info("STATS AFTER ENSEMBL PARSING:\n" + genes.get_stats())
Example #3
    def __init__(self, loader, r_server):
        self.loader = loader
        self.esquery = ESQuery(loader.es)
        self.r_server = r_server
        self.logger = logging.getLogger(__name__)
        '''define data processing handlers'''
        self.data_handlers = defaultdict(lambda: SearchObject)
        self.data_handlers[SearchObjectTypes.TARGET] = SearchObjectTarget
        self.data_handlers[SearchObjectTypes.DISEASE] = SearchObjectDisease
Example #4
    def __init__(self, redis_host, redis_port, es_hosts):

        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)
Example #5
    def __init__(self,
                 es=None,
                 namespace=None,
                 r_server=None,
                 ttl=60 * 60 * 24 + 7):
        self._es = es
        self.r_server = r_server
        self._es_query = ESQuery(self._es)
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=self.r_server,
                                             ttl=ttl)
        self._logger = logging.getLogger(__name__)
        if self.r_server is not None:
            self._load_efo_data(r_server)
Example #6
    def __init__(self,
                 es=None,
                 namespace=None,
                 r_server=None,
                 ttl=60 * 60 * 24 + 7,
                 targets=[],
                 autoload=True):
        self._logger = logging.getLogger(__name__)
        self._es = es
        self.r_server = r_server
        self._es_query = ESQuery(self._es)
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=self.r_server,
                                             ttl=ttl)
        self.uniprot2ensembl = {}
        if self.r_server and autoload:
            self.load_gene_data(self.r_server, targets)
Example #7
class HPALookUpTable(object):
    """
    A redis-based pickable hpa look up table using gene id as table
    id
    """

    def __init__(self,
                 es=None,
                 namespace=None,
                 r_server=None,
                 ttl=(60 * 60 * 24 + 7)):
        self._es = es
        self.r_server = r_server
        self._es_query = ESQuery(self._es)
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=self.r_server,
                                             ttl=ttl)
        self._logger = logging.getLogger(__name__)

        if self.r_server:
            self._load_hpa_data(self.r_server)

    def _load_hpa_data(self, r_server=None):
        for el in self._es_query.get_all_hpa():
            self.set_hpa(el, r_server=self._get_r_server(r_server))

    def get_hpa(self, idx, r_server=None):
        return self._table.get(idx, r_server=self._get_r_server(r_server))

    def set_hpa(self, hpa, r_server=None):
        self._table.set(hpa['gene'], hpa,
                        r_server=self._get_r_server(r_server))

    def get_available_hpa_ids(self, r_server=None):
        return self._table.keys(self._get_r_server(r_server))

    def __contains__(self, key, r_server=None):
        return self._table.__contains__(key,
                                        r_server=self._get_r_server(r_server))

    def __getitem__(self, key, r_server=None):
        return self.get_hpa(key, r_server=self._get_r_server(r_server))

    def __setitem__(self, key, value, r_server=None):
        self._table.set(key, value, r_server=self._get_r_server(r_server))

    def keys(self, r_server=None):
        return self._table.keys(self._get_r_server(r_server))

    def _get_r_server(self, r_server=None):
        return r_server if r_server else self.r_server
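A brief usage sketch, assuming the HPA index is already populated and the client helpers (new_es_client, new_redis_client) from the other examples are available; the gene id is illustrative.

# Hypothetical usage sketch; helpers, hosts, and ids are assumptions.
es = new_es_client(['http://localhost:9200'])
r_server = new_redis_client('localhost', 6379)
hpa_table = HPALookUpTable(es=es, namespace='hpa', r_server=r_server)
# The constructor preloads every HPA document, so later reads hit Redis only.
if 'ENSG00000157764' in hpa_table:
    hpa_doc = hpa_table['ENSG00000157764']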
Example #8
class ReactomeRetriever(object):
    """
    Will retrieve a Reactome object from the processed json stored in elasticsearch
    """
    def __init__(self, es):
        self.es_query = ESQuery(es)
        self._cache = {}
        self.logger = logging.getLogger(__name__)

    def get_reaction(self, reaction_id):
        if reaction_id not in self._cache:
            reaction = ReactomeNode()
            reaction.load_json(self.es_query.get_reaction(reaction_id))
            self._cache[reaction_id] = reaction
        return self._cache[reaction_id]
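A short sketch of the caching behaviour, reusing the es client from the sketch above: the first lookup queries elasticsearch, later ones are served from the in-memory dict. The reaction id is illustrative.

# Hypothetical usage sketch; assumes a populated Reactome index.
retriever = ReactomeRetriever(es)
reaction = retriever.get_reaction('R-HSA-199420')  # first call hits elasticsearch
same = retriever.get_reaction('R-HSA-199420')      # second call is served from _cache
assert reaction is same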
Example #9
    def __init__(self, es):

        self.es = es
        self.esquery = ESQuery(es)
        self._logger = logging.getLogger(__name__ + ".QCRunner")
Example #10
class ECOLookUpTable(object):
    """
    A redis-based pickable gene look up table
    """


    def __init__(self,
                 es,
                 namespace=None,
                 r_server=None,
                 ttl=60 * 60 * 24 + 7):
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=r_server,
                                             ttl=ttl)
        self._es = es
        self._es_query = ESQuery(es)
        self.r_server = r_server
        self._logger = logging.getLogger(__name__)
        if r_server is not None:
            self._load_eco_data(r_server)

    @staticmethod
    def get_ontology_code_from_url(url):
        #note, this is not a guaranteed solution
        #to do it properly, it has to be from the actual
        #ontology file or from OLS API
        if '/' in url:
            return url.split('/')[-1]
        else:
            #assume already a short code
            return url

    def _load_eco_data(self, r_server=None):
        # TODO can be improved by sending elements in batches
        for eco in self._es_query.get_all_eco():
            self._table.set(self.get_ontology_code_from_url(eco['code']), eco,
                            r_server=self._get_r_server(r_server))

    def get_eco(self, eco_id, r_server=None):
        return self._table.get(eco_id, r_server=self._get_r_server(r_server))

    def set_eco(self, eco, r_server=None):
        self._table.set(self.get_ontology_code_from_url(eco['code']), eco,
                        r_server=self._get_r_server(r_server))

    def get_available_eco_ids(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))

    def __contains__(self, key, r_server=None):
        return self._table.__contains__(key, r_server=self._get_r_server(r_server))

    def __getitem__(self, key, r_server=None):
        return self.get_eco(key, r_server=self._get_r_server(r_server))

    def __setitem__(self, key, value, r_server=None):
        self._table.set(key, value, r_server=self._get_r_server(r_server))

    def _get_r_server(self, r_server=None):
        return r_server if r_server else self.r_server

    def keys(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))
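The URL heuristic in get_ontology_code_from_url simply keeps the last path segment; two illustrative calls (the URLs are examples, not guaranteed to be in any index):

# Term URLs resolve to their short codes; bare codes pass through unchanged.
ECOLookUpTable.get_ontology_code_from_url('http://purl.obolibrary.org/obo/ECO_0000205')  # 'ECO_0000205'
ECOLookUpTable.get_ontology_code_from_url('ECO_0000205')                                 # 'ECO_0000205'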
Example #11
    def __init__(self, es, filename, datasources_to_datatypes):
        self.logger = logging.getLogger(__name__)
        self.esquery = ESQuery(es)
        self.filename = filename
        self.datasources_to_datatypes = datasources_to_datatypes
Example #12
def produce_evidence_local_init(es_hosts, scoring_weights,
                                is_direct_do_not_propagate,
                                datasources_to_datatypes):
    es = new_es_client(es_hosts)
    es_query = ESQuery(es)
    return es_query, scoring_weights, is_direct_do_not_propagate, datasources_to_datatypes
Example #13
class ScoringProcess(object):
    def __init__(self, redis_host, redis_port, es_hosts):

        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)

    def process_all(self, scoring_weights, is_direct_do_not_propagate,
                    datasources_to_datatypes, dry_run, num_workers_produce,
                    num_workers_score, max_queued_produce_to_score):

        lookup_data = LookUpDataRetriever(
            self.es,
            self.r_server,
            targets=[],
            data_types=(LookUpDataType.DISEASE, LookUpDataType.TARGET,
                        LookUpDataType.ECO, LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        #setup elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init, self.es_hosts, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init, self.es_hosts, self.redis_host,
            self.redis_port, lookup_data, datasources_to_datatypes, dry_run)

        #this doesn't need to be in the external config, since it's so
        #content-light as to be meaningless
        max_queued_score_out = 10000

        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage = pr.flat_map(
            produce_evidence,
            targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage = pr.each(score_producer,
                                 pipeline_stage,
                                 workers=num_workers_score,
                                 maxsize=max_queued_score_out,
                                 on_start=score_producer_local_init_baked,
                                 on_done=score_producer_local_shutdown)

        #loop over the end of the pipeline to make sure everything is finished
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('stages created, ran scoring and writing')

        #cleanup elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')

        self.logger.info("DONE")

    """
    Run a series of QC tests on EFO elasticsearch index. Returns a dictionary
    of string test names and result objects
    """

    def qc(self, esquery):

        #number of eco entries
        association_count = 0
        #Note: try to avoid doing this more than once!
        for association in esquery.get_all_associations():
            association_count += 1
            if association_count % 1000 == 0:
                self.logger.debug("checking %d", association_count)

        #put the metrics into a single dict
        metrics = dict()
        metrics["association.count"] = association_count

        return metrics
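A sketch of how the qc hook is driven (main() in Example #20 does the equivalent via qc_metrics.update); the hosts shown are illustrative assumptions.

# Hypothetical usage sketch mirroring the wiring in Example #20.
es = new_es_client(['http://localhost:9200'])
esquery = ESQuery(es)
process = ScoringProcess('localhost', 6379, ['http://localhost:9200'])
metrics = process.qc(esquery)
print(metrics["association.count"])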
Example #14
class Metrics:
    def __init__(self, es, filename, datasources_to_datatypes):
        self.logger = logging.getLogger(__name__)
        self.esquery = ESQuery(es)
        self.filename = filename
        self.datasources_to_datatypes = datasources_to_datatypes

    def generate_metrics(self):
        self.logger.info("Producing data release metrics")

        count_drug_w_evidence = self.esquery.count_drug_w_evidence()
        count_entity_w_association = self.esquery.count_entity_w_association()
        count_target_w_symbol = self.esquery.count_target_w_symbol()
        count_target_w_mp = self.esquery.count_target_w_mp()
        count_target_w_hallmark = self.esquery.count_target_w_hallmark()
        count_target_w_biomarker = self.esquery.count_target_w_biomarker()
        count_BRAF_evidence = self.esquery.count_BRAF_evidence()
        count_withdrawn_drug_evidence = self.esquery.count_withdrawn_drug_evidence()
        count_trinucleotide_evidence = self.esquery.count_trinucleotide_evidence()

        count_datatype_evidence = self.esquery.count_datatype_evidence()
        count_datatype_association = self.esquery.count_datatype_association()

        with open(self.filename, 'w') as metrics_output:
            metrics_output.write(
                "drugs(unique) with evidence:\t" +
                str(count_drug_w_evidence['aggregations']['general_drug']
                    ['value']) + "\n" +
                "diseases(unique) with association:\t" +
                str(count_entity_w_association['aggregations']
                    ['general_disease']['value']) + "\n" +
                "targets(unique) with association:\t" +
                str(count_entity_w_association['aggregations']
                    ['general_target']['value']) + "\n" +
                "targets with approved symbol:\t" +
                str(count_target_w_symbol['hits']['total']) + "\n" +
                "targets with mouse phenotype:\t" +
                str(count_target_w_mp['hits']['total']) + "\n" +
                "targets with cancer hallmark:\t" +
                str(count_target_w_hallmark['hits']['total']) + "\n" +
                "targets with cancer biomarker:\t" +
                str(count_target_w_biomarker['hits']['total']) + "\n" +
                "evidence link to BRAF:\t" +
                str(count_BRAF_evidence['hits']['total']) + "\n" +
                "evidence link to withdrawn drug:\t" +
                str(count_withdrawn_drug_evidence['hits']['total']) + "\n" +
                "evidence link to trinucleotide expansion:\t" +
                str(count_trinucleotide_evidence['hits']['total']) + "\n")

            for ds in self.datasources_to_datatypes.iterkeys():
                count_datasource_evidence = self.esquery.count_datasource_evidence(
                    ds)
                metrics_output.write(
                    "evidence from datasource " + ds + ":\t" +
                    str(count_datasource_evidence['hits']['total']) + "\n")

            for item in count_datatype_evidence['aggregations']['datatypes'][
                    'buckets']:
                datatype = item['key']
                evidence_count = item['doc_count']
                metrics_output.write("evidence from datatype " + datatype +
                                     ":\t" + str(evidence_count) + "\n")

            for item in count_datatype_association['aggregations'][
                    'datatypes']['buckets']:
                datatype = item['key']
                association_count = item['doc_count']
                metrics_output.write("association from datatype " + datatype +
                                     ":\t" + str(association_count) + "\n")

        self.logger.info("Producing data release metrics - Completed")
class DataDrivenRelationProcess(object):

    def __init__(self, es):
        self.es = es
        self.es_query = ESQuery(self.es)
        self.logger = logging.getLogger(__name__)

    def process_all(self, dry_run, 
            ddr_workers_production,
            ddr_workers_score,
            ddr_queue_production_score,
            ddr_queue_score_result):
        start_time = time.time()

        target_data, disease_data = self.es_query.get_disease_to_targets_vectors()

        self.logger.info('Retrieved all the associations data in %i s' % (time.time() - start_time))
        self.logger.info('target data length: %s size in memory: %f Kb' % (len(target_data), sys.getsizeof(target_data) / 1024.))
        self.logger.info('disease data length: %s size in memory: %f Kb' % (len(disease_data), sys.getsizeof(disease_data) / 1024.))

        '''sort the lists and keep using always the same order in all the steps'''
        disease_keys = sorted(disease_data.keys())
        target_keys = sorted(target_data.keys())

        self.logger.info('getting disease labels')
        disease_id_to_label = self.es_query.get_disease_labels(disease_keys)
        disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]
        self.logger.info('getting target labels')
        target_id_to_label = self.es_query.get_target_labels(target_keys)
        target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

        #setup elasticsearch
        self.loader = Loader(self.es, dry_run=dry_run)
        if not dry_run:
            #need to directly get the versioned index name for this function
            self.loader.create_new_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            self.loader.prepare_for_bulk_indexing(self.loader.get_versioned_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME))


        #calculate and store disease-to-disease in multiple processes
        self.logger.info('handling disease-to-disease')
        handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data, disease_keys, 
            target_keys, 0.19, 1024, self.loader, dry_run, 
            ddr_workers_production, ddr_workers_score, 
            ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled disease-to-disease')

        #calculate and store target-to-target in multiple processes
        self.logger.info('handling target-to-target')
        handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data, target_keys, 
            disease_keys, 0.19, 1024, self.loader, dry_run, 
            ddr_workers_production, ddr_workers_score, 
            ddr_queue_production_score, ddr_queue_score_result)
        self.logger.info('handled target-to-target')

        #cleanup elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()
Example #16
class GeneLookUpTable(object):
    """
    A redis-based picklable gene look up table
    """

    def __init__(self,
                 es=None,
                 namespace=None,
                 r_server=None,
                 ttl=60 * 60 * 24 + 7,
                 targets=[],
                 autoload=True):
        self._logger = logging.getLogger(__name__)
        self._es = es
        self.r_server = r_server
        self._es_query = ESQuery(self._es)
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=self.r_server,
                                             ttl=ttl)
        self.uniprot2ensembl = {}
        if self.r_server and autoload:
            self.load_gene_data(self.r_server, targets)

    def load_gene_data(self, r_server=None, targets=[]):
        data = None
        if targets:
            data = self._es_query.get_targets_by_id(targets)
            total = len(targets)
        if data is None:
            data = self._es_query.get_all_targets()
            total = self._es_query.count_all_targets()
        for target in data:
            # TODO can be improved by sending elements in batches
            self._table.set(target['id'], target,
                            r_server=self._get_r_server(r_server))
            if target['uniprot_id']:
                self.uniprot2ensembl[target['uniprot_id']] = target['id']
            for accession in target['uniprot_accessions']:
                self.uniprot2ensembl[accession] = target['id']


    def get_gene(self, target_id, r_server=None):
        try:
            return self._table.get(target_id, r_server=self._get_r_server(r_server))
        except KeyError:
            try:
                target = self._es_query.get_objects_by_id(target_id,
                                                          Const.ELASTICSEARCH_GENE_NAME_INDEX_NAME,
                                                          Const.ELASTICSEARCH_GENE_NAME_DOC_NAME,
                                                          source_exclude='ortholog.*'
                                                          ).next()
            except Exception:
                self._logger.exception('Cannot retrieve target from elasticsearch')
                raise KeyError(target_id)
            self.set_gene(target, r_server)
            return target

    def set_gene(self, target, r_server=None):
        self._table.set(target['id'], target, r_server=self._get_r_server(r_server))

    def get_available_gene_ids(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))

    def __contains__(self, key, r_server=None):
        if self._table.__contains__(key, r_server=self._get_r_server(r_server)):
            return True
        return self._es_query.exists(index=Const.ELASTICSEARCH_GENE_NAME_INDEX_NAME,
                                     doc_type=Const.ELASTICSEARCH_GENE_NAME_DOC_NAME,
                                     id=key)

    def __getitem__(self, key, r_server=None):
        return self.get_gene(key, self._get_r_server(r_server))

    def __setitem__(self, key, value, r_server=None):
        self._table.set(key, value, self._get_r_server(r_server))

    def __missing__(self, key):
        print(key)

    def keys(self, r_server=None):
        return self._table.keys(self._get_r_server(r_server))

    def _get_r_server(self, r_server=None):
        return r_server if r_server else self.r_server
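A brief sketch of the uniprot2ensembl side table built during load_gene_data; the accession is illustrative and the es/r_server clients are assumed as in the earlier sketches.

# Hypothetical usage sketch; ids are illustrative.
gene_table = GeneLookUpTable(es=es, namespace='genes', r_server=r_server)
ensembl_id = gene_table.uniprot2ensembl.get('P15056')  # e.g. a UniProt accession
if ensembl_id:
    gene = gene_table.get_gene(ensembl_id)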
Example #17
    def __init__(self, es):
        self.es = es
        self.es_query = ESQuery(self.es)
        self.logger = logging.getLogger(__name__)
Example #18
class EFOLookUpTable(object):
    """
    A redis-based picklable EFO look up table.
    Loads the EFO data stored in ES into memory/redis so that it can be
    accessed quickly from multiple processes, reducing memory usage by sharing.
    """

    def __init__(self,
                 es=None,
                 namespace=None,
                 r_server=None,
                 ttl=60 * 60 * 24 + 7):
        self._es = es
        self.r_server = r_server
        self._es_query = ESQuery(self._es)
        self._table = RedisLookupTablePickle(namespace=namespace,
                                             r_server=self.r_server,
                                             ttl=ttl)
        self._logger = logging.getLogger(__name__)
        if self.r_server is not None:
            self._load_efo_data(r_server)

    @staticmethod
    def get_ontology_code_from_url(url):
        #note, this is not a guaranteed solution
        #to do it properly, it has to be from the actual
        #ontology file or from OLS API
        if '/' in url:
            return url.split('/')[-1]
        else:
            #assume already a short code
            return url

    def _load_efo_data(self, r_server=None):
        for i, efo in enumerate(self._es_query.get_all_diseases()):
            #TODO can be improved by sending elements in batches
            self.set_efo(efo, r_server=self._get_r_server(r_server))
            if i % 1000 == 0:
                self._logger.debug("Loaded %s efo", i)

    def get_efo(self, efo_id, r_server=None):
        return self._table.get(efo_id, r_server=self._get_r_server(r_server))

    def set_efo(self, efo, r_server=None):
        efo_key = efo['path_codes'][0][-1]
        self._table.set(efo_key, efo, r_server=self._get_r_server(r_server))

    def get_available_efo_ids(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))

    def __contains__(self, key, r_server=None):
        return self._table.__contains__(key, r_server=self._get_r_server(r_server))

    def __getitem__(self, key, r_server=None):
        return self.get_efo(key, r_server=self._get_r_server(r_server))

    def __setitem__(self, key, value, r_server=None):
        self._table.set(key, value, r_server=self._get_r_server(r_server))

    def keys(self, r_server=None):
        return self._table.keys(r_server=self._get_r_server(r_server))

    def _get_r_server(self, r_server=None):
        return r_server if r_server else self.r_server
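Note the key convention in set_efo above: documents are keyed by path_codes[0][-1], i.e. the short disease code. A small sketch, reusing the es/r_server clients from the earlier sketches (the document shape is an assumption inferred from set_efo):

# Hypothetical usage sketch; the efo document shape is inferred, not guaranteed.
efo_table = EFOLookUpTable(es=es, namespace='efo', r_server=r_server)
efo_doc = {'path_codes': [['EFO_0000311']], 'label': 'cancer'}
efo_table.set_efo(efo_doc)
assert efo_table.get_efo('EFO_0000311')['label'] == 'cancer'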
Example #19
    def __init__(self, es):
        self.es_query = ESQuery(es)
        self._cache = {}
        self.logger = logging.getLogger(__name__)
Example #20
def main():
    #parse config file, environment, and command line arguments
    mrtarget.cfg.setup_ops_parser()
    args = mrtarget.cfg.get_ops_args()

    #set up logging
    logger = None
    if args.log_config:
        if os.path.isfile(args.log_config) and os.access(
                args.log_config, os.R_OK):
            #read a log configuration file
            logging.config.fileConfig(args.log_config,
                                      disable_existing_loggers=False)
            logger = logging.getLogger(__name__ + ".main()")
        else:
            #unable to read the logging config file, abort
            logging.basicConfig()
            logger = logging.getLogger(__name__ + ".main()")
            logger.error("unable to read file {}".format(args.log_config))
            return 1
    else:
        #no logging config specified, fall back to default
        logging.basicConfig()
        logger = logging.getLogger(__name__ + ".main()")

    if not args.release_tag:
        logger.error('A [release-tag] has to be specified.')
        print('A [release-tag] has to be specified.', file=sys.stderr)
        return 1
    else:
        Config.RELEASE_VERSION = args.release_tag
        logger.info('setting release version %s' % Config.RELEASE_VERSION)

    with RedisManager(args.redis_remote, args.redis_host, args.redis_port):

        es = new_es_client(args.elasticseach_nodes)
        redis = new_redis_client(args.redis_host, args.redis_port)

        #create a single query object for future use
        esquery = ESQuery(es)

        #read the data configuration
        data_config = mrtarget.cfg.get_data_config(args.data_config)

        #create something to accumulate qc metrics into over various steps
        qc_metrics = QCMetrics()

        with Loader(es,
                    chunk_size=ElasticSearchConfiguration.bulk_load_chunk,
                    dry_run=args.dry_run) as loader:

            if args.rea:
                process = ReactomeProcess(
                    loader, data_config.reactome_pathway_data,
                    data_config.reactome_pathway_relation)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.ens:
                process = EnsemblProcess(loader)
                if not args.qc_only:
                    process.process(data_config.ensembl_filename, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.unic:
                process = UniprotDownloader(loader)
                if not args.qc_only:
                    process.process(data_config.uniprot_uri, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.hpa:
                process = HPAProcess(loader, redis, args.elasticseach_nodes,
                                     data_config.tissue_translation_map,
                                     data_config.tissue_curation_map,
                                     data_config.hpa_normal_tissue,
                                     data_config.hpa_rna_level,
                                     data_config.hpa_rna_value,
                                     data_config.hpa_rna_zscore)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.gen:
                process = GeneManager(
                    loader,
                    redis,
                    args.gen_plugin_places,
                    data_config.gene_data_plugin_names,
                )
                if not args.qc_only:
                    process.merge_all(data_config, dry_run=args.dry_run)

                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.efo:
                process = EfoProcess(loader, data_config.ontology_efo,
                                     data_config.ontology_hpo,
                                     data_config.ontology_mp,
                                     data_config.disease_phenotype)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))
            if args.eco:
                process = EcoProcess(loader, data_config.ontology_eco,
                                     data_config.ontology_so)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.val:
                es_output_folder = None
                if "elasticsearch_folder" in vars(
                        args) and args.elasticsearch_folder is not None:
                    es_output_folder = args.elasticsearch_folder

                process_evidences_pipeline(
                    filenames=data_config.input_file,
                    first_n=args.val_first_n,
                    es_client=es,
                    redis_client=redis,
                    dry_run=args.dry_run,
                    output_folder=es_output_folder,
                    num_workers=args.val_workers_validator,
                    num_writers=args.val_workers_writer,
                    max_queued_events=args.val_queue_validator_writer,
                    eco_scores_uri=data_config.eco_scores,
                    schema_uri=data_config.schema,
                    es_hosts=args.elasticseach_nodes,
                    excluded_biotypes=data_config.excluded_biotypes,
                    datasources_to_datatypes=data_config.datasources_to_datatypes)

                #TODO qc

            if args.assoc:
                process = ScoringProcess(args.redis_host, args.redis_port,
                                         args.elasticseach_nodes)
                if not args.qc_only:
                    process.process_all(data_config.scoring_weights,
                                        data_config.is_direct_do_not_propagate,
                                        data_config.datasources_to_datatypes,
                                        args.dry_run,
                                        args.as_workers_production,
                                        args.as_workers_score,
                                        args.as_queue_production_score)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.ddr:
                process = DataDrivenRelationProcess(es)
                if not args.qc_only:
                    process.process_all(args.dry_run,
                                        args.ddr_workers_production,
                                        args.ddr_workers_score,
                                        args.ddr_queue_production_score,
                                        args.ddr_queue_score_result)
                #TODO qc

            if args.sea:
                process = SearchObjectProcess(loader, redis)
                if not args.qc_only:
                    process.process_all(
                        data_config.chembl_target,
                        data_config.chembl_mechanism,
                        data_config.chembl_component,
                        data_config.chembl_protein,
                        data_config.chembl_molecule_set_uri_pattern,
                        args.dry_run)
                #TODO qc

            if args.metric:
                process = Metrics(
                    es, args.metric_file,
                    data_config.datasources_to_datatypes).generate_metrics()

    if args.qc_in:
        #handle reading in previous qc from filename provided, and adding comparative metrics
        qc_metrics.compare_with(args.qc_in)

    if args.qc_out:
        #handle writing out to a tsv file
        qc_metrics.write_out(args.qc_out)

    logger.info('`' + " ".join(sys.argv) + '` - finished')
    return 0
Example #21
class SearchObjectProcess(object):
    def __init__(self, loader, r_server):
        self.loader = loader
        self.esquery = ESQuery(loader.es)
        self.r_server = r_server
        self.logger = logging.getLogger(__name__)
        '''define data processing handlers'''
        self.data_handlers = defaultdict(lambda: SearchObject)
        self.data_handlers[SearchObjectTypes.TARGET] = SearchObjectTarget
        self.data_handlers[SearchObjectTypes.DISEASE] = SearchObjectDisease

    def process_all(self, chembl_target_uri, chembl_mechanism_uri,
                    chembl_component_uri, chembl_protein_uri,
                    chembl_molecule_set_uri_pattern, dry_run):
        ''' process all the objects that need to be returned by the search method
        :return:
        '''

        #setup chembl handler
        self.chembl_handler = ChEMBLLookup(chembl_target_uri,
                                           chembl_mechanism_uri,
                                           chembl_component_uri,
                                           chembl_protein_uri,
                                           chembl_molecule_set_uri_pattern)
        self.chembl_handler.get_molecules_from_evidence(self.esquery)
        all_molecules = set()
        for target, molecules in self.chembl_handler.target2molecule.items():
            all_molecules = all_molecules | molecules
        all_molecules = sorted(all_molecules)
        query_batch_size = 100
        for i in range(0, len(all_molecules), query_batch_size):
            self.chembl_handler.populate_synonyms_for_molecule(
                all_molecules[i:i + query_batch_size],
                self.chembl_handler.molecule2synonyms)

        #setup elasticsearch
        if not dry_run:
            self.loader.create_new_index(
                Const.ELASTICSEARCH_DATA_SEARCH_INDEX_NAME)
            #need to directly get the versioned index name for this function
            self.loader.prepare_for_bulk_indexing(
                self.loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_SEARCH_INDEX_NAME))

        #process targets
        '''get gene simplified objects and push them to the processing queue'''
        for i, target in enumerate(self.esquery.get_all_targets()):
            target[SearchObjectTypes.__ROOT__] = SearchObjectTypes.TARGET
            self.handle_search_object(target, dry_run)

        #process diseases
        '''get disease objects  and push them to the processing queue'''
        self.logger.info(
            'get disease objects and push them to the processing queue')
        for i, disease in enumerate(self.esquery.get_all_diseases()):
            disease[SearchObjectTypes.__ROOT__] = SearchObjectTypes.DISEASE
            self.handle_search_object(disease, dry_run)

        #cleanup elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_SEARCH_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()

        self.logger.info("DONE")

    def summarise_association(self, data):
        def cap_score(value):
            if value > 1:
                return 1.0
            elif value < -1:
                return -1.0
            return value

        return dict(
            total=[dict(id=data_point['id'],
                        score=cap_score(data_point['harmonic-sum']['overall']))
                   for data_point in data['total']],
            direct=[dict(id=data_point['id'],
                         score=cap_score(data_point['harmonic-sum']['overall']))
                    for data_point in data['direct']])

    def handle_search_object(self, data, dry_run):
        '''process objects to simple search object'''
        so = self.data_handlers[data[SearchObjectTypes.__ROOT__]]()
        so.digest(json_input=data)
        '''inject drug data'''
        if not hasattr(so, 'drugs'):
            so.drugs = {}
        so.drugs['evidence_data'] = []
        '''count associations '''
        if data[SearchObjectTypes.__ROOT__] == SearchObjectTypes.TARGET:
            ass_data = self.esquery.get_associations_for_target(
                data['id'], fields=['id', 'harmonic-sum.overall'], size=20)
            so.set_associations(
                self.summarise_association(ass_data.top_associations),
                ass_data.associations_count)
            if so.id in self.chembl_handler.target2molecule:
                drugs_synonyms = set()
                for molecule in self.chembl_handler.target2molecule[so.id]:
                    if molecule in self.chembl_handler.molecule2synonyms:
                        drugs_synonyms = drugs_synonyms | set(
                            self.chembl_handler.molecule2synonyms[molecule])
                so.drugs['evidence_data'] = list(drugs_synonyms)

        elif data[SearchObjectTypes.__ROOT__] == SearchObjectTypes.DISEASE:
            ass_data = self.esquery.get_associations_for_disease(
                data['path_codes'][0][-1],
                fields=['id', 'harmonic-sum.overall'],
                size=20)
            so.set_associations(
                self.summarise_association(ass_data.top_associations),
                ass_data.associations_count)
            if so.id in self.chembl_handler.disease2molecule:
                drugs_synonyms = set()
                for molecule in self.chembl_handler.disease2molecule[so.id]:
                    if molecule in self.chembl_handler.molecule2synonyms:
                        drugs_synonyms = drugs_synonyms | set(
                            self.chembl_handler.molecule2synonyms[molecule])
                so.drugs['evidence_data'] = list(drugs_synonyms)
        else:
            so.set_associations()
        '''store search objects'''
        if not dry_run:
            self.loader.put(
                Const.ELASTICSEARCH_DATA_SEARCH_INDEX_NAME,
                Const.ELASTICSEARCH_DATA_SEARCH_DOC_NAME + '-' + so.type,
                so.id, so.to_json())
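The data_handlers mapping in __init__ is a small dispatch-table pattern: defaultdict(lambda: SearchObject) hands back the base class for any object type without a specialised handler. A standalone sketch of the same idea, with illustrative names:

# Minimal sketch of the dispatch pattern behind data_handlers.
from collections import defaultdict


class BaseHandler(object):
    pass


class TargetHandler(BaseHandler):
    pass


handlers = defaultdict(lambda: BaseHandler)
handlers['target'] = TargetHandler

assert handlers['target'] is TargetHandler  # registered type wins
assert handlers['unknown'] is BaseHandler   # anything else falls back to the default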