Example #1
    def process_variant_to_gene_relationships(self, variant_nodes: list, writer: WriterDelegator):
        all_results = self.genetics_services.get_variant_to_gene(self.crawl_for_service, variant_nodes)
        for source_node_id, results in all_results.items():
            # convert the simple edges and nodes to rags objects and write them to the graph
            for (edge, node) in results:
                gene_node = KNode(node.id, type=node.type, name=node.name, properties=node.properties)
                if self.recreate_sv_node:
                    variant_node = KNode(source_node_id, type=node_types.SEQUENCE_VARIANT)
                    variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                    writer.write_node(variant_node)
                if gene_node.id not in self.written_genes:
                    writer.write_node(gene_node)
                    self.written_genes.add(gene_node.id)

                predicate = LabeledID(identifier=edge.predicate_id, label=edge.predicate_label)
                gene_edge = KEdge(source_id=source_node_id,
                                  target_id=gene_node.id,
                                  provided_by=edge.provided_by,
                                  ctime=edge.ctime,
                                  original_predicate=predicate,
                                  # standard_predicate=predicate,
                                  input_id=edge.input_id,
                                  properties=edge.properties)
                writer.write_edge(gene_edge)
            logger.info(f'added {len(results)} variant relationships for {source_node_id}')
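
The loop above assumes get_variant_to_gene returns a dict keyed by the source variant's node id, with each value a list of (edge, node) pairs. A minimal sketch of that assumed shape; the SimpleNode/SimpleEdge names and all values are hypothetical stand-ins for whatever the genetics services actually return:

from collections import namedtuple

# Hypothetical stand-ins; field names mirror the attributes read in the loop above.
SimpleNode = namedtuple('SimpleNode', 'id type name properties')
SimpleEdge = namedtuple('SimpleEdge',
                        'predicate_id predicate_label provided_by ctime input_id properties')

# Assumed structure of all_results: one entry per input variant node.
all_results = {
    'CAID:CA123456': [
        (SimpleEdge('RO:0002410', 'causally_related_to', 'example.service',
                    1590000000.0, 'CAID:CA123456', {}),
         SimpleNode('HGNC:1097', 'gene', 'BRAF', {})),
    ],
}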
Example #2
 def parse_nodes(self, limit=0):
     """
     Parse nodes.
     :param limit: for testing reads first n nodes from file
     :return: dict with node_id as key and KNode as value
     """
     print('parsing nodes...')
     limit_counter = 0
     with open(os.path.join(self.cord_dir, 'nodes.txt')) as nodes_file:
         reader = csv.DictReader(nodes_file, delimiter='\t')
         for raw_node in reader:
             # transform raw columns into KNode attributes
             labels = raw_node.get('semantic_type') or ''
             labels = labels.replace(']', '').replace('[', '').replace(
                 '\\', '').replace("'", '')
             labels = [label.strip() for label in labels.split(',') if label.strip()]
             if not labels:
                 labels = ['named_thing']
             node = KNode({
                 'id': raw_node.get('normalized_curie'),
                 'type': labels[0],
                 'name': raw_node.get('name'),
                 'properties': {
                     'input_term': raw_node.get('input_term')
                 }
             })
             node.add_export_labels(labels)
             limit_counter += 1
             if limit and limit_counter > limit:
                 break
             yield limit_counter - 1, node
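
parse_nodes is a generator, so callers can stream nodes without loading the whole file. A minimal usage sketch, assuming a parser instance and a writer with a write_node method are in scope:

# Stream the first 100 parsed nodes into the graph writer.
for index, node in parser.parse_nodes(limit=100):
    writer.write_node(node)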
Example #3
    def start_build(self):
        # Entry point
        variant_list = self.get_all_variants_and_synonymns()
        if not variant_list:
            logger.info('No sequence variant nodes found in the graph.')
            return
        variant_subset = []
        with self.writerDelegator as writer:
            # for each variant
            for var in variant_list:
                # check to see if we have all the data elements we need. element [0] is the ID, element [1] is the synonym list
                if len(var) == 2:
                    # create a variant node
                    variant_curie = var[0]

                    # get the synonym data from the graph DB call
                    variant_syn_set = set(var[1])

                    variant_node = KNode(variant_curie, type=node_types.SEQUENCE_VARIANT)
                    variant_node.add_synonyms(variant_syn_set)
                    variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])

                    variant_subset.append(variant_node)
                    if len(variant_subset) == 1000:
                        self.process_variant_to_gene_relationships(variant_nodes=variant_subset, writer=writer)
                        variant_subset = []
            if variant_subset:
                # for left overs
                self.process_variant_to_gene_relationships(variant_nodes=variant_subset, writer=writer)
Example #4
 def parse_dict_to_knode(nn_dict: dict) -> KNode:
     node = KNode(
         id=nn_dict.get('id', {}).get('identifier', ''),
         name=nn_dict.get('id', {}).get('label', ''),
         type=nn_dict.get('type', ['named_thing'])[0],
     )
     node.add_synonyms(
         set(
             map(lambda x: LabeledID(**x),
                 nn_dict.get('equivalent_identifiers', []))))
     node.add_export_labels(nn_dict.get('type', ['named_thing']))
     return node
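
For reference, a minimal sketch of the input shape parse_dict_to_knode expects, modeled on the keys it accesses; the identifiers and labels here are illustrative only:

# Illustrative input dict; keys mirror those read by parse_dict_to_knode.
nn_dict = {
    'id': {'identifier': 'CHEBI:15365', 'label': 'acetylsalicylic acid'},
    'type': ['chemical_substance', 'named_thing'],
    'equivalent_identifiers': [
        {'identifier': 'CHEBI:15365', 'label': 'acetylsalicylic acid'},
        {'identifier': 'DRUGBANK:DB00945', 'label': 'Aspirin'},
    ],
}
node = parse_dict_to_knode(nn_dict)  # KNode with id CHEBI:15365 and both synonyms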
Example #5
def test_write_node_with_export_labels():
    # assert that a node will be queued to its export types
    node = KNode('CURIE:1', type=node_types.NAMED_THING)
    all_types = [node_types.CHEMICAL_SUBSTANCE, node_types.NAMED_THING]
    node.add_export_labels(all_types)
    bf = BufferedWriter(rosetta_mock)
    bf.write_node(node)
    assert node.id in bf.written_nodes
    key = node.export_labels
    assert key in bf.node_queues
    queue = bf.node_queues[key]
    assert node.id in queue
Example #6
    def get_nodes_from_file(self, file_name, delimiter: str):
        if not file_name:
            return

        with open(file_name) as nodes_file:
            reader = csv.DictReader(nodes_file, delimiter=delimiter)
            for raw_node in reader:
                labels = [c for c in raw_node['category'].split('|') if c]
                if not labels:
                    labels = ['named_thing']
                node_id = raw_node['id']
                name = raw_node['name']
                node = KNode(node_id, type=labels[0], name=name)
                node.add_export_labels(labels)
                yield node
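
A minimal usage sketch, assuming a loader instance exposing this method and a writer with a write_node method; the file name is hypothetical:

# Stream nodes from a tab-delimited file straight into the writer.
for node in loader.get_nodes_from_file('nodes.tsv', '\t'):
    writer.write_node(node)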
Example #7
 def parse_drug_bank_items(self):
     import requests
     drug_bank_parsed_tsv = 'https://raw.githubusercontent.com/TranslatorIIPrototypes/CovidDrugBank/master/trials.txt'
     items = []
     tsv_file = requests.get(drug_bank_parsed_tsv).text.split('\n')
     reader = csv.DictReader(tsv_file, delimiter="\t")
     for row in reader:
         items.append(row)
     drug_ids = '&'.join([f"curie={item['source']}" for item in items])
     normalize_url = f"https://nodenormalization-sri.renci.org/get_normalized_nodes?{drug_ids}"
     response = requests.get(normalize_url).json()
     nodes = []
     export_labels_fallback = requests.get(
         'https://bl-lookup-sri.renci.org/bl/chemical_substance/ancestors?version=latest'
     ).json()
     export_labels_fallback.append('chemical_substance')
     for drug_id in response:
         node = None
         if response[drug_id] is None:
             node = KNode(drug_id, type='chemical_substance')
             node.add_export_labels(export_labels_fallback)
         else:
             # use the synonymized id so edges are merged
             preferred_curie = response[drug_id]['id']['identifier']
             node = KNode(preferred_curie, type="chemical_substance")
         nodes.append(node)
         self.writer.write_node(node)
     self.writer.flush()
     # manually write in_clinical_trial_for edges
     query = lambda source_id, target_id, count: f"""
     MATCH (a:chemical_substance{{id: '{source_id}'}}) , (b:disease{{id:'{target_id}'}})
     MERGE (a)-[e:in_clinical_trial_for{{id: apoc.util.md5([a.id, b.id, 'ROBOKOVID:in_clinical_trial_for']), predicate_id: 'ROBOKOVID:in_clinical_trial_for'}}]->(b)
     SET e.edge_source = "https://www.drugbank.ca/covid-19"
     SET e.relation_label = "in_clinical_trial_for"
     SET e.source_database = "drugbank"
     SET e.predicate_id = "ROBOKOVID:in_clinical_trial_for"
     SET e.relation = "in_clinical_trial_for"
     SET e.count = {count}
     """
     with self.rosetta.type_graph.driver.session() as session:
         for source_node, row in zip(nodes, items):
             q = query(source_node.id, row['object'],
                       row['count'])  # assumes MONDO:0100096 is in the graph
             session.run(q)
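
The query lambda above interpolates ids directly into the Cypher text. A hedged alternative sketch using the neo4j driver's query parameters instead of string formatting, which sidesteps quoting and injection issues (same MERGE statement, assuming the same session object):

# Sketch: the same edge write expressed with $-parameters.
PARAMETERIZED_QUERY = """
MATCH (a:chemical_substance {id: $source_id}), (b:disease {id: $target_id})
MERGE (a)-[e:in_clinical_trial_for {id: apoc.util.md5([a.id, b.id, 'ROBOKOVID:in_clinical_trial_for']),
                                    predicate_id: 'ROBOKOVID:in_clinical_trial_for'}]->(b)
SET e.edge_source = 'https://www.drugbank.ca/covid-19',
    e.relation_label = 'in_clinical_trial_for',
    e.source_database = 'drugbank',
    e.relation = 'in_clinical_trial_for',
    e.count = toInteger($count)
"""

with self.rosetta.type_graph.driver.session() as session:
    for source_node, row in zip(nodes, items):
        session.run(PARAMETERIZED_QUERY,
                    source_id=source_node.id,
                    target_id=row['object'],
                    count=row['count'])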
Example #8
    def parse_gwas_file(self, gwas_catalog):

        try:
            # get column headers
            file_headers = gwas_catalog[0].split('\t')
            pub_med_index = file_headers.index('PUBMEDID')
            p_value_index = file_headers.index('P-VALUE')
            snps_index = file_headers.index('SNPS')
            trait_ids_index = file_headers.index('MAPPED_TRAIT_URI')
        except (IndexError, ValueError) as e:
            logger.error(f'GWAS Catalog failed to parse expected column headers ({e})')
            return []

        corrupted_lines = 0
        missing_variant_ids = 0
        missing_phenotype_ids = 0
        variant_to_pheno_cache = defaultdict(set)
        progress_counter = 0
        total_lines = len(gwas_catalog)
        trait_uri_pattern = re.compile(r'[^,\s]+')
        snp_pattern = re.compile(r'[^,;x*\s]+')
        for line in gwas_catalog[1:]:

            line = line.split('\t')
            try:
                # get pubmed id
                pubmed_id = line[pub_med_index]
                # get p-value
                p_value = float(line[p_value_index])
                if p_value == 0:
                    p_value = sys.float_info.min
                # get all traits (possible phenotypes)
                trait_uris = trait_uri_pattern.findall(line[trait_ids_index])
                # find all sequence variants
                snps = snp_pattern.findall(line[snps_index])
            except (IndexError, ValueError) as e:
                corrupted_lines += 1
                logger.warning(f'GWASCatalog corrupted line: {e}')
                continue

            if not (trait_uris and snps):
                corrupted_lines += 1
                logger.warning(f'GWASCatalog corrupted line: {line}')
                continue
            else:
                traits = []
                for trait_uri in trait_uris:
                    try:
                        trait_id = trait_uri.rsplit('/', 1)[1]
                        # ids show up like EFO_123, Orphanet_123, HP_123
                        if trait_id.startswith('EFO'):
                            curie_trait_id = f'EFO:{trait_id[4:]}'
                        elif trait_id.startswith('Orp'):
                            curie_trait_id = f'ORPHANET:{trait_id[9:]}'
                        elif trait_id.startswith('HP'):
                            curie_trait_id = f'HP:{trait_id[3:]}'
                        elif trait_id.startswith('NCIT'):
                            curie_trait_id = f'NCIT:{trait_id[5:]}'
                        elif trait_id.startswith('MONDO'):
                            curie_trait_id = f'MONDO:{trait_id[6:]}'
                        elif trait_id.startswith('GO'):
                            # Biological process or activity
                            # 5k+ of these
                            missing_phenotype_ids += 1
                            continue
                        else:
                            missing_phenotype_ids += 1
                            logger.warning(
                                f'{trait_uri} not a recognized trait format')
                            continue

                        traits.append(curie_trait_id)

                    except IndexError as e:
                        logger.warning(
                            f'trait uri index error:({trait_uri}) not splittable'
                        )

                variant_nodes = set()
                for n, snp in enumerate(snps):
                    if snp.startswith('rs'):
                        dbsnp_curie = f'DBSNP:{snp}'
                        variant_node = KNode(dbsnp_curie,
                                             type=node_types.SEQUENCE_VARIANT)
                        # adding an export label, this will ensure that it will go into the proper queue
                        # hence we can do batch normalization in the writer.
                        variant_node.add_export_labels(
                            [node_types.SEQUENCE_VARIANT])
                        variant_nodes.add(variant_node)
                    else:
                        missing_variant_ids += 1

                if traits and variant_nodes:
                    props = {'p_value': p_value}
                    for variant_node in variant_nodes:
                        self.writer.write_node(variant_node)
                        for trait_id in traits:
                            variant_to_pheno_edge, phenotype_node = self.create_variant_to_phenotype_components(
                                variant_node,
                                trait_id,
                                None,
                                pubmed_id=pubmed_id,
                                properties=props)
                            self.writer.write_node(phenotype_node)
                            self.writer.write_edge(variant_to_pheno_edge)
            progress_counter += 1
            if progress_counter % 1000 == 0:
                percent_complete = (progress_counter / total_lines) * 100
                logger.info(f'GWASCatalog progress: {int(percent_complete)}%')
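
The prefix handling in the if/elif chain above could also be table-driven. A minimal sketch of that alternative, covering the same recognized prefixes (GO and anything unrecognized still map to nothing):

# Sketch: map each recognized trait-id prefix to its CURIE prefix.
TRAIT_PREFIXES = {
    'EFO': 'EFO',
    'Orphanet': 'ORPHANET',
    'HP': 'HP',
    'NCIT': 'NCIT',
    'MONDO': 'MONDO',
}

def trait_id_to_curie(trait_id: str):
    """Return a CURIE for ids like 'EFO_123', or None if unrecognized."""
    prefix, _, local_id = trait_id.partition('_')
    curie_prefix = TRAIT_PREFIXES.get(prefix)
    return f'{curie_prefix}:{local_id}' if curie_prefix else None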
Example #9
    def create_gtex_graph(self,
                          data_directory: str,
                          file_name: str,
                          namespace: str,
                          is_sqtl: bool = False) -> object:
        # init the return value
        ret_val: object = None

        # init a progress counter
        line_counter = 0

        try:
            # get the full path to the input file
            full_file_path = f'{data_directory}{file_name}'

            logger.info(
                f'Creating GTEx graph data elements from file: {full_file_path}'
            )

            # walk through the gtex data file and create/write nodes and edges to the graph
            with WriterDelegator(self.rosetta) as graph_writer:
                # init these outside of try catch block
                curie_hgvs = None
                curie_uberon = None
                curie_ensembl = None

                # open the file and start reading
                with open(full_file_path, 'r') as inFH:
                    # open up a csv reader
                    csv_reader = csv.reader(inFH)

                    # read the header
                    header_line = next(csv_reader)

                    # find relevant indices
                    tissue_name_index = header_line.index('tissue_name')
                    tissue_uberon_index = header_line.index('tissue_uberon')
                    hgvs_index = header_line.index('HGVS')
                    ensembl_id_index = header_line.index('gene_id')
                    pval_nominal_index = header_line.index('pval_nominal')
                    pval_slope_index = header_line.index('slope')

                    try:
                        # for the rest of the lines in the file
                        for line in csv_reader:
                            # increment the counter
                            line_counter += 1

                            # get the data elements
                            tissue_name = line[tissue_name_index]
                            uberon = line[tissue_uberon_index]
                            hgvs = line[hgvs_index]
                            ensembl = line[ensembl_id_index].split(".", 1)[0]
                            pval_nominal = line[pval_nominal_index]
                            slope = line[pval_slope_index]

                            # create curies for the various id values
                            curie_hgvs = f'HGVS:{hgvs}'
                            curie_uberon = f'UBERON:{uberon}'
                            curie_ensembl = f'ENSEMBL:{ensembl}'
                            # create variant, gene and GTEx nodes with the HGVS, ENSEMBL or UBERON expression as the id and name
                            variant_node = KNode(
                                curie_hgvs,
                                name=hgvs,
                                type=node_types.SEQUENCE_VARIANT)
                            variant_node.add_export_labels(
                                [node_types.SEQUENCE_VARIANT])
                            gene_node = KNode(curie_ensembl,
                                              name=ensembl,
                                              type=node_types.GENE)
                            gene_node.add_export_labels([node_types.GENE])
                            gtex_node = KNode(
                                curie_uberon,
                                name=tissue_name,
                                type=node_types.ANATOMICAL_ENTITY)

                            if is_sqtl:
                                # sqtl variant to gene always uses the same predicate
                                predicate = self.variant_gene_sqtl_predicate
                            else:
                                # for eqtl use the polarity of slope to get the direction of expression.
                                # positive value increases expression, negative decreases
                                try:
                                    if float(slope) > 0.0:
                                        predicate = self.increases_expression_predicate
                                    else:
                                        predicate = self.decreases_expression_predicate
                                except ValueError as e:
                                    logger.error(
                                        f"Error casting slope to a float on line {line_counter} (slope - {slope}) {e}"
                                    )
                                    continue

                            # get a MD5 hash int of the composite hyper edge ID
                            hyper_edge_id = self.gtu.get_hyper_edge_id(
                                uberon, ensembl, hgvs)

                            # set the properties for the edge
                            edge_properties = [
                                ensembl, pval_nominal, slope, namespace
                            ]

                            ##########################
                            # data details are ready. write all edges and nodes to the graph DB.
                            ##########################

                            # write out the sequence variant node
                            graph_writer.write_node(variant_node)

                            # write out the gene node
                            if gene_node.id not in self.written_genes:
                                graph_writer.write_node(gene_node)
                                self.written_genes.add(gene_node.id)

                            # write out the anatomical gtex node
                            if gtex_node.id not in self.written_anatomical_entities:
                                graph_writer.write_node(gtex_node)
                                self.written_anatomical_entities.add(
                                    gtex_node.id)

                            # associate the sequence variant node with an edge to the gtex anatomy node
                            self.gtu.write_new_association(
                                graph_writer, variant_node, gtex_node,
                                self.variant_anatomy_predicate, hyper_edge_id,
                                None, True)

                            # associate the gene node with an edge to the gtex anatomy node
                            self.gtu.write_new_association(
                                graph_writer, gene_node, gtex_node,
                                self.gene_anatomy_predicate, 0, None, False)

                            # associate the sequence variant node with an edge to the gene node. also include the GTEx properties
                            self.gtu.write_new_association(
                                graph_writer, variant_node, gene_node,
                                predicate, hyper_edge_id, edge_properties,
                                True)

                            # output some feedback for the user
                            if (line_counter % 250000) == 0:
                                logger.info(
                                    f'Processed {line_counter} variants.')

                            # reset written nodes list to avoid memory overflow
                            if len(self.written_anatomical_entities) == self.max_nodes:
                                self.written_anatomical_entities = set()
                            if len(self.written_genes) == self.max_nodes:
                                self.written_genes = set()
                    except (KeyError, IndexError) as e:
                        logger.error(
                            f'Exception caught trying to process variant: {curie_hgvs}-{curie_uberon}-{curie_ensembl} at data line: {line_counter}. Exception: {e}, Line: {line}'
                        )

        except Exception as e:
            logger.error(f'Exception caught: Exception: {e}')
            ret_val = e

        # output some final feedback for the user
        logger.info(f'Building complete. Processed {line_counter} variants.')

        # return to the caller
        return ret_val
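
Resetting written_genes and written_anatomical_entities once they reach max_nodes bounds memory at the cost of occasionally re-writing a node. A hedged alternative sketch that evicts the oldest ids instead of clearing everything; this is not what the code above does, just one possible refinement:

from collections import OrderedDict

class BoundedSeenSet:
    """Remember up to max_size ids, evicting the oldest when full."""

    def __init__(self, max_size: int):
        self.max_size = max_size
        self._seen = OrderedDict()

    def add(self, item) -> bool:
        """Add item; return True if it was not already present."""
        if item in self._seen:
            self._seen.move_to_end(item)
            return False
        if len(self._seen) >= self.max_size:
            self._seen.popitem(last=False)  # drop the oldest entry
        self._seen[item] = None
        return True

# Possible usage: write each gene node at most once per eviction window.
# seen_genes = BoundedSeenSet(max_nodes)
# if seen_genes.add(gene_node.id):
#     graph_writer.write_node(gene_node)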
Example #10
    def sequence_variant_to_sequence_variant(self, variant_node):
        ld_url = '/ld/human/'
        options_url = '?r2=0.8'
        population = '1000GENOMES:phase_3:MXL'

        return_results = []
        dbsnp_curie_ids = variant_node.get_synonyms_by_prefix('DBSNP')
        for dbsnp_curie in dbsnp_curie_ids:
            variant_id = Text.un_curie(dbsnp_curie)
            query_url = f'{self.url}{ld_url}{variant_id}/{population}{options_url}'
            query_response = requests.get(
                query_url, headers={"Content-Type": "application/json"})
            if query_response.status_code == 200:
                query_json = query_response.json()
                variant_results = self.parse_ld_variants_from_ensembl(
                    query_json)
                for variant_info in variant_results:
                    new_variant_id = variant_info[0]
                    r_squared = variant_info[1]
                    props = {'r2': r_squared}
                    new_variant_curie = f'DBSNP:{new_variant_id}'
                    new_variant_node = KNode(new_variant_curie,
                                             type=node_types.SEQUENCE_VARIANT)
                    new_variant_node.add_export_labels(
                        [node_types.SEQUENCE_VARIANT])
                    edge = self.create_edge(
                        variant_node,
                        new_variant_node,
                        'ensembl.sequence_variant_to_sequence_variant',
                        dbsnp_curie,
                        self.var_to_var_predicate,
                        url=query_url,
                        properties=props)
                    return_results.append((edge, new_variant_node))
                    # NOTE: an earlier revision also looked up CAID synonyms for
                    # each new variant via ClinGen and cached them in redis;
                    # that logic is disabled in this version.

            # elif query_response.status_code == 429:
            #     rate limiting could be handled here by waiting and retrying;
            #     see the sketch after this example
            else:
                logger.error(
                    f'Ensembl returned a non-200 response for {variant_node.id}: {query_response.status_code}'
                )

        return return_results
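
The commented-out branch above notes that a 429 response should be handled by waiting and retrying. A minimal sketch of that idea; the retry count and fallback delay are arbitrary choices, not values from the original code:

import time
import requests

def get_with_retry(url, retries=3, fallback_delay=1.0):
    """GET a URL, backing off when the service rate-limits with HTTP 429."""
    response = None
    for _ in range(retries):
        response = requests.get(url, headers={"Content-Type": "application/json"})
        if response.status_code != 429:
            break
        # Rate-limited APIs such as Ensembl typically send a Retry-After
        # header giving the wait time in seconds.
        delay = float(response.headers.get('Retry-After', fallback_delay))
        time.sleep(delay)
    return response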