def write_hpo_edge(self, fh: TextIO, subject: str, edge_label: str, object: str, relation: str) -> None:
    """Write one HPO edge row to *fh*.

    Columns are ['subject', 'edge_label', 'object', 'relation',
    'publications']; the publications column is always written blank.
    """
    row = [subject, edge_label, object, relation, ""]
    write_node_edge_item(fh=fh, header=self.edge_header, data=row)
def test_write_node_edge_item(self):
    # Write one row, close the handle, then read the file back and
    # check the first row splits into the expected columns.
    write_node_edge_item(fh=self.fh, header=self.header, data=self.valid_data)
    self.fh.close()
    self.assertTrue(os.path.exists(self.outfile))
    with open(self.outfile, 'r') as out_fh:
        first_row = out_fh.read().split('\n')[0]
    self.assertEqual(['id1234', '1234', 'biolink:Gene'], first_row.split('\t'))
def test_write_node_edge_item_with_tabs_in_data(self):
    # With sanitize_sep_char=True, embedded tab characters in the data
    # should be replaced by the literal string "0x9".
    write_node_edge_item(
        fh=self.fh,
        header=self.header,
        data=['id1234', '1234', 'biolink:Gene\tbiolink:Gene\t'],
        sanitize_sep_char=True)
    self.fh.close()
    self.assertTrue(os.path.exists(self.outfile))
    with open(self.outfile, 'r') as out_fh:
        first_row = out_fh.read().split('\n')[0]
    self.assertEqual(
        ['id1234', '1234', 'biolink:Gene0x9biolink:Gene0x9'],
        first_row.split('\t'))
def write_hpo_node(self, fh: TextIO, id: str, data: dict, node_type: str) -> None:
    """Write one HPO node row to *fh*.

    Columns are [id, name, category, comment, description]. Comment and
    definition ('def') are captured in case they are useful for ML; any
    field missing from *data* is written as "".
    """
    def _field(keys: list) -> str:
        # get_item_by_priority raises ItemInDictNotFound when none of
        # the given keys is present in data
        try:
            return get_item_by_priority(data, keys)
        except ItemInDictNotFound:
            return ""

    # deduplicated: the original repeated this try/except three times
    comment_field = _field(['comment'])
    description = _field(['def'])
    name_field = _field(['name'])
    write_node_edge_item(
        fh=fh,
        header=self.node_header,
        data=[id, name_field, node_type, comment_field, description])
def run(self, data_file: str = None):
    """Transform the SARS-CoV-2 GPI/GPA downloads into node/edge TSVs."""
    # file housekeeping
    os.makedirs(self.output_dir, exist_ok=True)
    gpi_file = os.path.join(self.input_base_dir, "uniprot_sars-cov-2.gpi")
    gpa_file = os.path.join(self.input_base_dir, "uniprot_sars-cov-2.gpa")

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge:
        # headers first
        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        # gene/protein nodes from the GPI file
        with open(gpi_file, 'r') as gpi_fh:
            for rec in _gpi12iterator(gpi_fh):
                write_node_edge_item(node, self.node_header,
                                     self.gpi_to_gene_node_data(rec))

        # annotation edges from the GPA file
        with open(gpa_file, 'r') as gpa_fh:
            for rec in _gpa11iterator(gpa_fh):
                write_node_edge_item(edge, self.edge_header,
                                     self.gpa_to_edge_data(rec))
def run(self, data_file: str = None):
    """Transform the SARS-CoV-2 GPI/GPA downloads into node/edge TSVs.

    Edge endpoints that were not present in the GPI file get a minimal
    placeholder node so the graph has no dangling edges.
    """
    # file housekeeping
    os.makedirs(self.output_dir, exist_ok=True)
    gpi_file = os.path.join(self.input_base_dir, "uniprot_sars-cov-2.gpi")
    gpa_file = os.path.join(self.input_base_dir, "uniprot_sars-cov-2.gpa")

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge:
        # write headers
        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        seen: set = set()

        def _ensure_node(node_id: str) -> None:
            # Emit a minimal placeholder node (guessed biolink category,
            # blank middle columns, provided_by) the first time an id is
            # encountered. Deduplicated: the original repeated this
            # stanza for the subject and for the object.
            if node_id not in seen:
                placeholder = [node_id, guess_bl_category(node_id)] \
                    + [""] * 5 + [self.source_name]
                write_node_edge_item(node, self.node_header, placeholder)
                seen.add(node_id)

        with open(gpi_file, 'r') as gpi_fh:
            for rec in _gpi12iterator(gpi_fh):
                node_data = self.gpi_to_gene_node_data(rec)
                seen.add(node_data[0])
                write_node_edge_item(node, self.node_header, node_data)

        with open(gpa_file, 'r') as gpa_fh:
            for rec in _gpa11iterator(gpa_fh):
                edge_data = self.gpa_to_edge_data(rec)
                # edge_data[0] is the subject id, edge_data[2] the object id
                _ensure_node(edge_data[0])
                _ensure_node(edge_data[2])
                write_node_edge_item(edge, self.edge_header, edge_data)
def parse_cooccurrence_record(self, node_handle: Any, edge_handle: Any, record: Dict) -> None:
    """Parse term-cooccurrences.

    Args:
        node_handle: File handle for nodes.csv.
        edge_handle: File handle for edges.csv.
        record: A dictionary corresponding to a row from a table.

    Returns:
        None.
    """
    terms = set()
    paper_id = record['document_id']
    if not pd.isna(record['entity_uris']):
        terms.update(record['entity_uris'].split('|'))

    # add a biolink:Publication for each paper
    if paper_id.endswith('.xml'):
        paper_id = paper_id.replace('.xml', '')
    paper_curie = f"CORD:{paper_id}"
    if paper_id not in self.seen:
        write_node_edge_item(
            fh=node_handle,
            header=self.node_header,
            data=[paper_curie, "", "biolink:Publication", ""])
        self.seen.add(paper_id)

    for t in terms:
        if len(t) == 2:
            # 2-character terms are country codes, not ontology classes
            if t in self.country_code_map:
                mapped_t = self.country_code_map[t][0]
                name = self.country_code_map[t][1]
                curie = self.contract_uri(mapped_t)
            else:
                name = ""
                curie = self.contract_uri(t)
            category = 'biolink:NamedThing'
        else:
            category = 'biolink:OntologyClass'
            curie = self.contract_uri(t)
            # BUG FIX: a stray trailing comma previously made this a
            # 1-tuple, so the downstream isinstance() workaround always
            # wrote a blank name for ontology terms.
            name = self.concept_name_map[t] if t in self.concept_name_map else ""

        # BUG FIX: the seen-check previously tested `t` but recorded
        # `curie`, so it never matched and term nodes were re-written on
        # every occurrence.
        if curie not in self.seen:
            # add a node for each term
            write_node_edge_item(
                fh=node_handle,
                header=self.node_header,
                data=[f"{curie}", name, category, ""])
            self.seen.add(curie)

        # simplified generation of edges between the term and the
        # publication: term -> correlated_with -> Publication, with
        # relation RO:0002610, one edge per (term, paper) pair
        write_node_edge_item(
            fh=edge_handle,
            header=self.edge_header,
            data=[
                f"{curie}",
                "biolink:correlated_with",
                f"{paper_curie}",
                f"RO:0002610",  # 'correlated with'
                f"{self.source_name} co-occurrences"
            ])
def parse_annotation_doc(self, node_handle, edge_handle, doc: Dict, subset: str = None) -> None:
    """Parse a JSON document corresponding to a publication.

    Args:
        node_handle: File handle for nodes.csv.
        edge_handle: File handle for edges.csv.
        doc: JSON document as dict.
        subset: The subset name for this dataset.

    Returns:
        None.
    """
    terms = set()
    paper_id = doc['paper_id']
    title = None
    if 'metadata' in doc:
        metadata = doc['metadata']
        title = metadata['title'].replace('\n', ' ')
        # extract hits from metadata
        terms.update(self.extract_termite_hits(metadata))

    if 'abstract' in doc:
        abstract = doc['abstract']
        # extract hits from abstract
        for x in abstract:
            terms.update(self.extract_termite_hits(x))

    if 'body_text' in doc:
        body_text = doc['body_text']
        # extract hits from body text
        for x in body_text:
            terms.update(self.extract_termite_hits(x))

    provided_by = f"{self.source_name}"
    if subset:
        provided_by += f" {subset}"

    # add a biolink:Publication for each paper
    write_node_edge_item(
        fh=node_handle,
        header=self.node_header,
        data=[f"CORD:{paper_id}", f"{title}", "biolink:Publication", ""])
    self.seen.add(paper_id)

    for t in terms:
        if len(t) == 2:
            # 2-character terms are country codes, not ontology classes
            if t in self.country_code_map:
                mapped_t = self.country_code_map[t][0]
                name = self.country_code_map[t][1]
                curie = self.contract_uri(mapped_t)
            else:
                name = ""
                curie = self.contract_uri(t)
            category = 'biolink:NamedThing'
        else:
            category = 'biolink:OntologyClass'
            curie = self.contract_uri(t)
            # BUG FIX: a stray trailing comma previously made this a
            # 1-tuple, so the downstream isinstance() workaround always
            # wrote a blank name for ontology terms.
            name = self.concept_name_map[t] if t in self.concept_name_map else ""

        # BUG FIX: the seen-check previously tested `t` but recorded
        # `curie`, so it never matched and term nodes were re-written on
        # every occurrence.
        if curie not in self.seen:
            # add a node for each term
            write_node_edge_item(fh=node_handle,
                                 header=self.node_header,
                                 data=[f"{curie}", name, category, ""])
            self.seen.add(curie)

        # add has_annotation edge between the term and the Publication
        write_node_edge_item(fh=edge_handle,
                             header=self.edge_header,
                             data=[
                                 f"{curie}",
                                 f"biolink:related_to",
                                 f"CORD:{paper_id}",
                                 "SIO:000255",
                                 provided_by
                             ])
def test_write_node_edge_item_bad_fh(self):
    # Passing something that is not a writable file handle must raise.
    with self.assertRaises(Exception):
        write_node_edge_item(fh='', header=self.header, data=self.valid_data)
def run(self, data_file: Optional[str] = None) -> None:
    """Method is called and performs needed transformations to process
    SARS-CoV-2 subset of ChEMBL.

    http://chembl.blogspot.com/2020/05/chembl27-sars-cov-2-release.html

    Args:
        data_file: data file to parse

    Returns:
        None.
    """
    self.node_header = ['id', 'name', 'category', 'provided_by']
    self.edge_header = [
        'id', 'subject', 'edge_label', 'object', 'relation', 'provided_by',
        'type'
    ]

    # ChEMBL molecules / assays / documents / activities
    molecule_nodes = self.parse_chembl_molecules(self.get_chembl_molecules())
    assay_nodes = self.parse_chembl_assay(self.get_chembl_assays())
    document_nodes = self.parse_chembl_document(self.get_chembl_documents())
    activity_edges = self.parse_chembl_activity(self.get_chembl_activities())

    self.node_header.extend(
        [x for x in self._node_header if x not in self.node_header])
    self.edge_header.extend(
        [x for x in self._edge_header if x not in self.edge_header])

    # PERF FIX: the sorted column lists were recomputed for every record
    # in every loop; compute them once.
    node_columns = sorted(self.node_header)
    edge_columns = sorted(self.edge_header)

    def _write_records(handle, columns, records) -> None:
        # one TSV row per record; columns missing from a record become ""
        for rec in records:
            write_node_edge_item(
                fh=handle,
                header=columns,
                data=[rec[x] if x in rec else '' for x in columns])

    # nodes for organisms in TAXON_MAP; the map is inverted (as in the
    # original) so each organism curie is emitted once
    taxon_nodes = [{
        'id': org_curie,
        'name': org_name,
        'category': 'biolink:OrganismTaxon'
    } for org_curie, org_name in {v: k for k, v in TAXON_MAP.items()}.items()]

    # BUG FIX: the output handles were opened without `with`/close() and
    # leaked; context managers guarantee they are flushed and closed.
    with open(self.output_node_file, 'w') as node_handle:
        node_handle.write("\t".join(node_columns) + "\n")
        _write_records(node_handle, node_columns, molecule_nodes)
        _write_records(node_handle, node_columns, assay_nodes)
        _write_records(node_handle, node_columns, document_nodes)
        _write_records(node_handle, node_columns, taxon_nodes)

    with open(self.output_edge_file, 'w') as edge_handle:
        edge_handle.write("\t".join(edge_columns) + "\n")
        _write_records(edge_handle, edge_columns, activity_edges)
def run(self, data_file: Optional[str] = None, chembl_data_files: Optional[dict] = None) -> None:
    """Method is called and performs needed transformations to process
    SARS-CoV-2 subset of ChEMBL.

    http://chembl.blogspot.com/2020/05/chembl27-sars-cov-2-release.html

    Args:
        data_file: NOT USED - preserves to placate mypy. Use "data_files" instead
        chembl_data_files: data files to parse

    Returns:
        None.
    """
    self.node_header = ['id', 'name', 'category', 'provided_by']
    self.edge_header = [
        'id', 'subject', 'predicate', 'object', 'relation', 'provided_by',
        'type'
    ]

    if chembl_data_files is None:
        chembl_data_files = {
            'molecules_data': 'data/raw/chembl_molecule_records.json',
            'assay_data': 'data/raw/chembl_assay_records.json',
            'document_data': 'data/raw/chembl_document_records.json',
            'activity_data': 'data/raw/chembl_activity_records.json'
        }

    # parse each ChEMBL dump
    molecule_nodes = self.parse_chembl_molecules(
        self.read_json(chembl_data_files['molecules_data']))
    assay_nodes = self.parse_chembl_assay(
        self.read_json(chembl_data_files['assay_data']))
    document_nodes = self.parse_chembl_document(
        self.read_json(chembl_data_files['document_data']))
    activity_edges = self.parse_chembl_activity(
        self.read_json(chembl_data_files['activity_data']))

    self.node_header.extend(
        [x for x in self._node_header if x not in self.node_header])
    self.edge_header.extend(
        [x for x in self._edge_header if x not in self.edge_header])

    # PERF FIX: the sorted column lists were recomputed for every record
    # in every loop; compute them once.
    node_columns = sorted(self.node_header)
    edge_columns = sorted(self.edge_header)

    def _write_records(handle, columns, records) -> None:
        # one TSV row per record; columns missing from a record become ""
        for rec in records:
            write_node_edge_item(
                fh=handle,
                header=columns,
                data=[rec[x] if x in rec else '' for x in columns])

    # nodes for organisms in TAXON_MAP; the map is inverted (as in the
    # original) so each organism curie is emitted once
    taxon_nodes = [{
        'id': org_curie,
        'name': org_name,
        'category': 'biolink:OrganismTaxon'
    } for org_curie, org_name in {v: k for k, v in TAXON_MAP.items()}.items()]

    # BUG FIX: the output handles were opened without `with`/close() and
    # leaked; context managers guarantee they are flushed and closed.
    with open(self.output_node_file, 'w') as node_handle:
        node_handle.write("\t".join(node_columns) + "\n")
        _write_records(node_handle, node_columns, molecule_nodes)
        _write_records(node_handle, node_columns, assay_nodes)
        _write_records(node_handle, node_columns, document_nodes)
        _write_records(node_handle, node_columns, taxon_nodes)

    with open(self.output_edge_file, 'w') as edge_handle:
        edge_handle.write("\t".join(edge_columns) + "\n")
        _write_records(edge_handle, edge_columns, activity_edges)
def parse_annotation_doc(self, node_handle, edge_handle, doc: Dict, subset: str = None) -> None:
    """Parse a JSON document corresponding to a publication.

    Args:
        node_handle: File handle for nodes.csv.
        edge_handle: File handle for edges.csv.
        doc: JSON document as dict.
        subset: The subset name for this dataset.

    Returns:
        None.
    """
    paper_id = doc['paper_id']
    metadata = doc['metadata']
    abstract = doc['abstract']
    body_text = doc['body_text']

    terms = set()
    provided_by = f"{self.source_name}"
    if subset:
        provided_by += f" {subset}"

    # extract hits from metadata
    terms.update(self.extract_termite_hits(metadata))
    # extract hits from abstract
    for x in abstract:
        terms.update(self.extract_termite_hits(x))
    # extract hits from body text
    for x in body_text:
        terms.update(self.extract_termite_hits(x))

    # add a biolink:Publication for each paper
    write_node_edge_item(fh=node_handle,
                         header=self.node_header,
                         data=[
                             f"CORD:{paper_id}",
                             f"{metadata['title']}",
                             "biolink:Publication",
                             ""
                         ])
    self.seen.add(paper_id)

    # TODO: use CURIE for terms
    for t in terms:
        if t not in self.seen:
            # BUG FIX: direct indexing raised KeyError for terms absent
            # from concept_name_map; fall back to "" like the
            # co-occurrence parser does.
            term_name = self.concept_name_map[t] if t in self.concept_name_map else ""
            # 2-character terms are country codes, hence NamedThing
            write_node_edge_item(fh=node_handle,
                                 header=self.node_header,
                                 data=[
                                     f"{t}",
                                     f"{term_name}",
                                     "biolink:OntologyClass"
                                     if len(t) != 2 else "biolink:NamedThing",
                                     ""
                                 ])
            self.seen.add(t)

        # add has_annotation edge between the term and the Publication
        write_node_edge_item(fh=edge_handle,
                             header=self.edge_header,
                             data=[
                                 f"{t}",
                                 f"biolink:related_to",
                                 f"CORD:{paper_id}",
                                 "SIO:000255",
                                 provided_by
                             ])
def parse_cooccurrence_record(self, node_handle: Any, edge_handle: Any, record: Dict) -> None:
    """Parse term-cooccurrences.

    Args:
        node_handle: File handle for nodes.csv.
        edge_handle: File handle for edges.csv.
        record: A dictionary corresponding to a row from a table.

    Returns:
        None.
    """
    terms = set()
    paper_id = record['document_id']
    if not pd.isna(record['entity_uris']):
        terms.update(record['entity_uris'].split('|'))

    # add a biolink:Publication for each paper
    if paper_id not in self.seen:
        write_node_edge_item(
            fh=node_handle,
            header=self.node_header,
            data=[f"CORD:{paper_id}", "", "biolink:Publication", ""])
        self.seen.add(paper_id)

    for t in terms:
        if t not in self.seen:
            # add a node for each term (2-character terms are country codes)
            write_node_edge_item(
                fh=node_handle,
                header=self.node_header,
                data=[
                    f"{t}",
                    self.concept_name_map[t]
                    if t in self.concept_name_map else "",
                    "biolink:OntologyClass"
                    if len(t) != 2 else "biolink:NamedThing",
                    ""
                ])
            self.seen.add(t)

    # a single InformationContentEntity represents this co-occurrence set
    information_entity = uuid.uuid1()
    # BUG FIX: the node id was previously a *second* uuid.uuid1() call,
    # so the node never matched the edges that reference
    # information_entity; use the same id for node and edges.
    write_node_edge_item(fh=node_handle,
                         header=self.node_header,
                         data=[
                             f"{information_entity}",
                             "",
                             "biolink:InformationContentEntity",
                             ""
                         ])
    # add has_annotation edge between co-occurrence entity and publication
    write_node_edge_item(
        fh=edge_handle,
        header=self.edge_header,
        data=[
            f"{information_entity}",
            "biolink:related_to",
            f"{record['document_id']}",
            "SIO:000255",  # 'has annotation'
            f"{self.source_name}"
        ])
    for t in terms:
        # add has_member edges between co-occurrence entity and each term
        write_node_edge_item(
            fh=edge_handle,
            header=self.edge_header,
            data=[
                f"{information_entity}",
                "biolink:related_to",
                f"{t}",
                f"SIO:000059",  # 'has member'
                f"{self.source_name}"
            ])
def run(self, data_file: Optional[str] = None):
    """Transform the TTD target/drug download into node and edge TSVs.

    Writes gene (protein) nodes, drug nodes, and drug->gene
    'interacts_with' edges for every target that has both UNIPROID and
    DRUGINFO entries.
    """
    ttd_file_name = os.path.join(self.input_base_dir,
                                 "P1-01-TTD_target_download.txt")
    ttd_data = self.parse_ttd_file(ttd_file_name)

    gene_node_type = "biolink:Protein"
    drug_id_prefix = "TTD:"
    drug_node_type = "biolink:Drug"
    drug_gene_edge_label = "biolink:interacts_with"
    drug_gene_edge_relation = "RO:0002436"  # molecularly interacts with
    uniprot_curie_prefix = "UniProtKB:"

    self.node_header = ['id', 'name', 'category', 'TTD_ID', 'provided_by']
    self.edge_header = [
        'subject', 'edge_label', 'object', 'relation', 'provided_by',
        'target_type'
    ]

    # make name to id map for uniprot names of human proteins
    dat_gz_id_file = os.path.join(self.input_base_dir,
                                  "HUMAN_9606_idmapping.dat.gz")
    name_2_id_map = uniprot_make_name_to_id_mapping(dat_gz_id_file)

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge:
        # write headers (change default node/edge headers if necessary)
        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        for target_id, data in ttd_data.items():
            # skip items that don't refer to UNIPROT gene targets or
            # don't have drug info
            if 'UNIPROID' not in data or 'DRUGINFO' not in data:
                continue

            uniproids: list = self.get_uniproids(data, name_2_id_map,
                                                 uniprot_curie_prefix)
            gene_name = self.get_gene_name(data)

            # gene nodes - ['id', 'name', 'category', 'TTD_ID', 'provided_by']
            for this_id in uniproids:
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=[
                                         this_id, gene_name, gene_node_type,
                                         target_id, self.source_name
                                     ])

            # PERF FIX: hoisted out of the drug loop — the target type
            # depends only on the target record, not on the drug
            targ_type = self.get_targ_type(data)

            for this_drug in data['DRUGINFO']:
                this_drug_curie = drug_id_prefix + this_drug[0]

                # drug node
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=[
                                         this_drug_curie, this_drug[1],
                                         drug_node_type, this_drug[0],
                                         self.source_name
                                     ])

                # edges for target gene ids <-> drug
                for this_id in uniproids:
                    write_node_edge_item(fh=edge,
                                         header=self.edge_header,
                                         data=[
                                             this_drug_curie,
                                             drug_gene_edge_label, this_id,
                                             drug_gene_edge_relation,
                                             self.source_name, targ_type
                                         ])
def run(self) -> None:
    """Transform the TTD target/drug download into node and edge TSVs.

    Writes a gene (protein) node per target, drug nodes, and drug->gene
    'interacts_with' edges for every target that has both UNIPROID and
    DRUGINFO entries.
    """
    # append ttd id for drug targets and drugs
    self.node_header.append("TTD_ID")
    ttd_file_name = os.path.join(self.input_base_dir,
                                 "P1-01-TTD_target_download.txt")
    ttd_data = self.parse_ttd_file(ttd_file_name)

    gene_node_type = "biolink:Protein"
    drug_node_type = "biolink:Drug"
    drug_gene_edge_label = "biolink:interacts_with"
    drug_gene_edge_relation = "RO:0002436"  # molecularly interacts with
    uniprot_curie_prefix = "UniProtKB:"

    self.edge_header = [
        'subject', 'edge_label', 'object', 'relation', 'target_type'
    ]

    # make name to id map for uniprot names of human proteins
    dat_gz_id_file = os.path.join(self.input_base_dir,
                                  "HUMAN_9606_idmapping.dat.gz")
    name_2_id_map = uniprot_make_name_to_id_mapping(dat_gz_id_file)

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge:
        # write headers (change default node/edge headers if necessary)
        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        for target_id, data in ttd_data.items():
            # skip items that don't refer to UNIPROT gene targets or
            # don't have drug info
            if 'UNIPROID' not in data:
                logging.info(
                    "Skipping item that doesn't refer to UNIPROT gene")
                continue
            if 'DRUGINFO' not in data:
                logging.info(
                    "Skipping item that doesn't have any drug info")
                continue

            uniproid = self.get_uniproid(data, name_2_id_map,
                                         uniprot_curie_prefix)
            gene_name = self.get_gene_name(data)

            # gene - ['id', 'name', 'category', 'ttd id for this target']
            write_node_edge_item(
                fh=node,
                header=self.node_header,
                data=[uniproid, gene_name, gene_node_type, target_id])

            # PERF FIX: hoisted out of the drug loop — the target type
            # depends only on the target record, not on the drug
            targ_type = self.get_targ_type(data)

            for this_drug in data['DRUGINFO']:
                # drug node
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=[
                                         this_drug[0], this_drug[1],
                                         drug_node_type, this_drug[0]
                                     ])

                # make edge for target <-> drug
                # BUG FIX: the edge was previously written as
                # [target_id, ..., uniproid] — it never referenced the
                # drug and emitted the same edge once per drug. Connect
                # the drug node to the gene node instead (as the later
                # version of this transform does).
                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=[
                                         this_drug[0],
                                         drug_gene_edge_label, uniproid,
                                         drug_gene_edge_relation, targ_type
                                     ])