Пример #1
0
def score_producer(data, scorer, loader, r_server, lookup_data,
                   datasources_to_datatypes, dry_run):
    """Score one target-disease pair and store the resulting association.

    Args:
        data: a 4-item sequence unpacked as
            ``(target, disease, evidence, is_direct)``.
        scorer: object whose ``score(...)`` method builds the association
            score from the evidence.
        loader: object whose ``put(...)`` method receives the finished
            association; only called when ``dry_run`` is falsy.
        r_server: handle forwarded to every ``lookup_data`` getter.
        lookup_data: provides ``available_genes``, ``available_hpa`` and
            ``available_efos`` lookups.
        datasources_to_datatypes: forwarded unchanged to ``scorer.score``.
        dry_run: when truthy the association is built but not stored.

    Returns:
        None in every case.

    Raises:
        KeyError: when loading the gene data for ``target`` raises it.
    """
    target, disease, evidence, is_direct = data

    logger = logging.getLogger(__name__)

    # nothing to score without evidence
    if not evidence:
        return

    score = scorer.score(target, disease, evidence, is_direct,
                         datasources_to_datatypes)
    # skip associations only with data with score 0
    if not score:
        logger.warning('Skipped association with score 0: %s-%s',
                       target, disease)
        return

    gene_data = Gene()
    try:
        gene_data.load_json(
            lookup_data.available_genes.get_gene(target, r_server))
    except KeyError:
        logger.debug('Cannot find gene code "%s" '
                     'in lookup table', target)
        # bare raise re-raises the active exception with its traceback
        raise
    score.set_target_data(gene_data)

    # create a hpa expression empty jsonserializable class
    # to fill from Redis cache lookup_data
    hpa_data = HPAExpression()
    try:
        hpa_data.update(
            lookup_data.available_hpa.get_hpa(target, r_server))
    except KeyError:
        # a KeyError here is deliberately ignored (best effort);
        # any other exception propagates untouched
        pass
    try:
        score.set_hpa_data(hpa_data)
    except KeyError:
        pass

    disease_data = EFO()
    try:
        disease_data.load_json(
            lookup_data.available_efos.get_efo(disease, r_server))
    except KeyError as e:
        # unlike the gene lookup, a KeyError here is logged and processing
        # continues with whatever disease_data holds at this point
        logger.debug('Cannot find EFO code "%s" '
                     'in lookup table', disease)
        logger.exception(e)

    score.set_disease_data(disease_data)

    element_id = '%s-%s' % (target, disease)
    if not dry_run:
        loader.put(Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME,
                   Const.ELASTICSEARCH_DATA_ASSOCIATION_DOC_NAME,
                   element_id, score)
Пример #2
0
    def merge_data(self, genes, loader, r_server, data_config):
        """Merge every Ensembl gene returned by the ES query into ``genes``.

        Args:
            genes: gene collection supporting ``in``, ``get_gene``,
                ``add_gene`` and ``get_stats``.
            loader: object whose ``es`` attribute is handed to ``ESQuery``.
            r_server: not used by this method.
            data_config: not used by this method.

        Raises:
            NotFoundError: re-raised (after logging) when counting the
                elements of the Ensembl index raises it.
        """
        esquery = ESQuery(loader.es)

        try:
            # the count itself is not needed; the call is kept because it
            # raises NotFoundError when the Ensembl index is missing
            esquery.count_elements_in_index(
                Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
        except NotFoundError as ex:
            self._logger.error(
                'no Ensembl index in ES. Skipping. Has the --ensembl step been run? Are you pointing to the correct index? %s',
                ex)
            # bare raise re-raises the active exception with its traceback
            raise

        for row in esquery.get_all_ensembl_genes():
            # update the gene already in the collection, or start a new one
            gene = genes.get_gene(row['id']) if row['id'] in genes else Gene()
            gene.load_ensembl_data(row)
            genes.add_gene(gene)

        self._clean_non_reference_genes(genes)

        self._logger.info("STATS AFTER ENSEMBL PARSING:\n" + genes.get_stats())
Пример #3
0
def score_producer(data,
        scorer, lookup_data, datasources_to_datatypes, dry_run):
    """Score one target-disease pair and return it in JSON-compatible form.

    Args:
        data: a 4-item sequence unpacked as
            ``(target, disease, evidence, is_direct)``.
        scorer: object whose ``score(...)`` method builds the association
            score from the evidence.
        lookup_data: provides ``available_genes``, ``available_hpa`` and
            ``available_efos`` lookups.
        datasources_to_datatypes: forwarded unchanged to ``scorer.score``.
        dry_run: not used by this function.

    Returns:
        ``(element_id, score.to_json())`` where ``element_id`` is
        ``'<target>-<disease>'``, or None when there is no evidence or the
        computed score is falsy.
    """
    target, disease, evidence, is_direct = data

    # nothing to score without evidence
    if not evidence:
        return None

    score = scorer.score(target, disease, evidence, is_direct,
                         datasources_to_datatypes)
    # skip associations only with data with score 0
    if not score:
        return None

    gene_data = Gene()
    gene_data_index = lookup_data.available_genes.get_gene(target)
    if gene_data_index is not None:
        gene_data.load_json(gene_data_index)
    score.set_target_data(gene_data)

    # create a hpa expression empty jsonserializable class
    hpa_data = HPAExpression()
    try:
        hpa_index = lookup_data.available_hpa.get_hpa(target)
        if hpa_index is not None:
            hpa_data.update(hpa_index)
    except KeyError:
        # a KeyError here is deliberately ignored (best effort);
        # any other exception propagates untouched
        pass
    try:
        score.set_hpa_data(hpa_data)
    except KeyError:
        pass

    disease_data = EFO()
    disease_data.load_json(
        lookup_data.available_efos.get_efo(disease))
    score.set_disease_data(disease_data)

    element_id = '%s-%s' % (target, disease)

    # convert the score into a JSON-compatible object
    # otherwise Python serialization consumes too much memory
    return (element_id, score.to_json())
Пример #4
0
    def merge_data(self, genes, loader, r_server, data_config):
        """Load the HGNC complete set and add one gene per document.

        The JSON document is read from ``data_config.hgnc_complete_set``;
        every entry under ``response.docs`` becomes a new ``Gene`` that is
        added to ``genes``. ``loader`` and ``r_server`` are not used here.
        """
        hgnc_url = data_config.hgnc_complete_set
        self._logger.info("HGNC parsing - requesting from URL %s", hgnc_url)

        with URLZSource(hgnc_url).open() as hgnc_stream:
            payload = json.load(hgnc_stream)
            docs = payload['response']['docs']

            for doc in docs:
                hgnc_gene = Gene()
                hgnc_gene.load_hgnc_data_from_json(doc)
                genes.add_gene(hgnc_gene)

            # stats are logged while the source is still open, as before
            stats = genes.get_stats()
            self._logger.info("STATS AFTER HGNC PARSING:\n" + stats)
Пример #5
0
    def merge_data(self, genes, es, r_server, data_config, es_config):
        """Merge every document of the Ensembl index into ``genes``.

        The index name comes from ``es_config.ens.name``. Each scanned
        document updates the gene already stored under its ``id``, or a new
        ``Gene`` when there is none. ``r_server`` and ``data_config`` are
        not used here.
        """
        ensembl_index = es_config.ens.name
        all_docs = Search().using(es).index(ensembl_index).query(MatchAll())

        for doc in all_docs.scan():
            gene_id = doc['id']
            # reuse the stored gene when present, otherwise start empty
            gene = genes.get_gene(gene_id) if gene_id in genes else Gene()
            self.load_ensembl_data(gene, doc)
            genes.add_gene(gene)

        self._clean_non_reference_genes(genes)

        stats = genes.get_stats()
        self._logger.info("STATS AFTER ENSEMBL PARSING:\n" + stats)
Пример #6
0
    def merge_data(self, genes, es, r_server, data_config, es_config):
        """Merge the Ensembl records of a JSON-lines file into ``genes``.

        The file is opened from ``data_config.ensembl_filename`` and each
        line is parsed as one JSON record. A record updates the gene
        already stored under its ``id``, or a new ``Gene`` when there is
        none. ``es``, ``r_server`` and ``es_config`` are not used here.
        """
        with URLZSource(data_config.ensembl_filename).open() as records:
            for raw_line in records:
                record = json.loads(raw_line)
                gene_id = record['id']

                # reuse the stored gene when present, otherwise start empty
                gene = genes.get_gene(gene_id) if gene_id in genes else Gene()
                self.load_ensembl_data(gene, record)
                genes.add_gene(gene)

        self._clean_non_reference_genes(genes)

        stats = genes.get_stats()
        self._logger.info("STATS AFTER ENSEMBL PARSING:\n" + stats)
Пример #7
0
 def _get_gene_obj(self, geneid):
     """Build a ``Gene`` for ``geneid`` loaded from ``available_genes``."""
     gene_obj = Gene(geneid)
     stored_json = self.available_genes.get_gene(geneid)
     gene_obj.load_json(stored_json)
     return gene_obj