def test_Abundance(self):
    """Check that the string form of an abundance quotes its name only when required."""
    # Pairs of (expected BEL string, name given to the abundance)
    cases = [
        ('a(CHEBI:water)', 'water'),
        ('a(CHEBI:"test name")', 'test name'),
    ]
    for expected, name in cases:
        node = Abundance(namespace='CHEBI', name=name)
        self.assertEqual(expected, str(node))
def test_str_has_both(self):
    """Check the BEL form of an abundance built from a namespace and an identifier.

    The expected string contains the identifier after it is passed through ``ensure_quotes``.
    """
    namespace, identifier = n(), n()
    actual = Abundance(namespace=namespace, identifier=identifier).as_bel()
    fields = {
        'namespace': namespace,
        'identifier': ensure_quotes(identifier),
    }
    expected = 'a({namespace}:{identifier})'.format(**fields)
    self.assertEqual(expected, actual)
def test_str_has_identifier(self):
    """Check the BEL form of an abundance built from a namespace and an identifier.

    The expected string contains the identifier verbatim.
    """
    namespace, identifier = n(), n()
    actual = Abundance(namespace=namespace, identifier=identifier).as_bel()
    fields = {
        'namespace': namespace,
        'identifier': identifier,
    }
    expected = 'a({namespace}:{identifier})'.format(**fields)
    self.assertEqual(expected, actual)
def setUp(self):
    """Create ``self.graph`` and add two INCHIKEY abundances and two EGID proteins to it."""
    self.graph: BELGraph = BELGraph()

    chemicals = [
        Abundance('INCHIKEY', inchikey)
        for inchikey in ('AAAAZQPHATYWOK-YRBRRWAQNA-N', 'AAABHMIRDIOYOK-NPVYFSBINA-N')
    ]
    for chemical in chemicals:
        self.graph.add_node_from_data(chemical)

    targets = [
        Protein('EGID', entrez_id)
        for entrez_id in ('2740', '2778')
    ]
    for target in targets:
        self.graph.add_node_from_data(target)
def test_simple(self):
    """Test converting simple abundance dictionaries with a name, an identifier, or both."""
    namespace, name, identifier = n(), n(), n()

    def _convert(concept):
        # Wrap the concept in an abundance dictionary and run the conversion under test.
        return _simple_po_to_dict({
            FUNCTION: ABUNDANCE,
            CONCEPT: concept,
        })

    # Name only
    self.assertEqual(
        Abundance(namespace=namespace, name=name),
        _convert({NAMESPACE: namespace, NAME: name}),
    )

    # Name and identifier
    self.assertEqual(
        Abundance(namespace=namespace, name=name, identifier=identifier),
        _convert({NAMESPACE: namespace, NAME: name, IDENTIFIER: identifier}),
    )

    # Identifier only
    self.assertEqual(
        Abundance(namespace=namespace, identifier=identifier),
        _convert({NAMESPACE: namespace, IDENTIFIER: identifier}),
    )

    # A concept with neither a name nor an identifier is rejected
    with self.assertRaises(ValueError):
        _convert({NAMESPACE: namespace})
def as_bel(self) -> Optional[BaseEntity]:
    """Convert this term to a BEL node.

    :return: the result of ``gobp`` for a ``biological_process`` term; for a
     ``cellular_component`` term, a ``NamedComplexAbundance`` when ``is_complex``
     is truthy and an ``Abundance`` otherwise; ``None`` for any other namespace.
    """
    if self.namespace == 'biological_process':
        return gobp(name=self.name, identifier=self.go_id)

    if self.namespace == 'cellular_component':
        # Complexes and other cellular components differ only in the node class
        node_cls = NamedComplexAbundance if self.is_complex else Abundance
        return node_cls(namespace='go', name=self.name, identifier=self.go_id)

    return None
def test_reaction(self):
    """Add an identified reaction and count the resulting nodes and edges."""
    reactant_ids = ('17478', '15377', '57540')
    product_ids = ('29067', '15378', '57945')

    def _chebi(identifier):
        # Every participant is a chebi abundance given by identifier only.
        return Abundance(namespace='chebi', identifier=identifier)

    graph = BELGraph()
    reaction = Reaction(
        namespace='rhea',
        identifier='44104',
        reactants=[_chebi(chebi_id) for chebi_id in reactant_ids],
        products=[_chebi(chebi_id) for chebi_id in product_ids],
    )
    graph.add_node_from_data(reaction)

    # Expected counts after adding the reaction with its six participants
    self.assertEqual(7, graph.number_of_nodes())
    self.assertEqual(6, graph.number_of_edges())
adgrb_complex = ComplexAbundance([adgrb1, adgrb2]) achlorhydria = Pathology(namespace='MESHD', name='Achlorhydria') akt1_rna = akt1.get_rna() akt1_gene = akt1_rna.get_gene() akt_methylated = akt1_gene.with_variants(GeneModification('Me')) akt1_phe_508_del = akt1_gene.with_variants(Hgvs('p.Phe508del')) cftr = hgnc('CFTR') cftr_protein_unspecified_variant = cftr.with_variants(HgvsUnspecified()) cftr_protein_phe_508_del = cftr.with_variants(Hgvs('p.Phe508del')) adenocarcinoma = Pathology('MESHD', 'Adenocarcinoma') interleukin_23_complex = NamedComplexAbundance('GO', 'interleukin-23 complex') oxygen_atom = Abundance(namespace='CHEBI', name='oxygen atom') hydrogen_peroxide = Abundance('CHEBI', 'hydrogen peroxide') tmprss2_gene = Gene('HGNC', 'TMPRSS2') tmprss2_erg_gene_fusion = GeneFusion( partner_5p=tmprss2_gene, range_5p=EnumeratedFusionRange('c', 1, 79), partner_3p=Gene('HGNC', 'ERG'), range_3p=EnumeratedFusionRange('c', 312, 5034) ) bcr_jak2_gene_fusion = GeneFusion( partner_5p=Gene('HGNC', 'BCR'), range_5p=EnumeratedFusionRange('c', '?', 1875), partner_3p=Gene('HGNC', 'JAK2'),
from pybel.dsl import ( Abundance, BiologicalProcess, ComplexAbundance, NamedComplexAbundance, Pathology, Protein, ProteinModification, ) from pybel.language import activity_mapping from pybel.testing.constants import test_jgif_path from tests.constants import TestGraphMixin logging.getLogger('pybel.parser').setLevel(20) calcium = Abundance('SCHEM', 'Calcium') calcineurin_complex = NamedComplexAbundance('SCOMP', 'Calcineurin Complex') foxo3 = Protein('HGNC', 'FOXO3') tcell_proliferation = BiologicalProcess( 'GO', 'CD8-positive, alpha-beta T cell proliferation') il15 = Protein('HGNC', 'IL15') il2rg = Protein('MGI', 'Il2rg') jgif_expected_nodes = { calcium, calcineurin_complex, foxo3, tcell_proliferation, il15, il2rg, Protein('HGNC', 'CXCR6'), Protein('HGNC', 'IL15RA'),
from pybel.typing import EdgeData


def _rel(x):
    """Return an edge data dictionary whose only entry is the relation ``x``."""
    return {RELATION: x}


def _rela(x, y=None):
    """Return an edge data dictionary with relation ``x`` and ``activity(y)`` as its object entry."""
    return {RELATION: x, OBJECT: activity(y)}


def _assoc(y):
    """Return an edge data dictionary for an ``ASSOCIATION`` relation with association type ``y``."""
    return {RELATION: ASSOCIATION, 'association_type': y}


# Module-level nodes, one or more per entity class, identified by short
# placeholder strings.
a1 = Abundance('CHEBI', '1')
p1 = Protein('HGNC', '1')
pf1 = Protein('INTERPRO', '1')
d1 = Pathology('MESH', '1')
b1 = BiologicalProcess('GO', '1')
b2 = BiologicalProcess('GO', '2')
m1 = MicroRna('MIRBASE', '1')
r1 = Rna('HGNC', '1')
r2 = Rna('HGNC', '2')
nca1 = NamedComplexAbundance('FPLX', '1')
pop1 = Population('taxonomy', '1')

# Proteins given by identifier keyword rather than by a second positional argument
p2 = Protein('HGNC', identifier='9236')
p3 = Protein('HGNC', identifier='9212')
# RNA and gene derived from p3
r3 = p3.get_rna()
g3 = r3.get_gene()
def get_neurommsig_bel(
    df: pd.DataFrame,
    disease: str,
    nift_values: Mapping[str, str],
) -> BELGraph:
    """Assemble the NeuroMMSig BEL graph for one disease.

    For every row, the gene is associated with each non-blank SNP and with each
    clinical feature found in ``nift_values``. Each such clinical feature is also
    associated with all of the row's clinical SNPs. Clinical features that are not
    found, or whose capitalization had to be fixed, are reported as warnings.

    :param df: a data frame that is grouped by ``PATHWAY_COLUMN_NAME``, sorted by
     ``GENE_COLUMN_NAME`` and sliced with the module-level ``columns``
    :param disease: the disease name, used in the graph's name and description and as
     the ``MeSHDisease`` annotation on every association
    :param nift_values: a dictionary of lower-cased to normal names in NIFT
    :return: the populated graph
    """
    graph = BELGraph(
        name=f'NeuroMMSigDB for {disease}',
        description=f'SNP and Clinical Features for Subgraphs in {disease}',
        authors='Daniel Domingo-Fernández, Charles Tapley Hoyt, Mufassra Naz, Aybuge Altay, Anandhi Iyappan',
        contact='*****@*****.**',
        version=time.strftime('%Y%m%d'),
    )

    unknown_features = set()
    recapitalized = set()
    canonical_names = set(nift_values.values())

    def _associate(source, target) -> None:
        # All associations share the canned provenance and the disease annotation.
        graph.add_association(
            source,
            target,
            evidence=CANNED_EVIDENCE,
            citation=CANNED_CITATION,
            annotations={
                'MeSHDisease': disease,
            },
        )

    for _pathway, pathway_df in df.groupby(PATHWAY_COLUMN_NAME):
        rows = pathway_df.sort_values(GENE_COLUMN_NAME)[columns].itertuples()

        for _index, gene, _pubmeds, lit_snps, gwas_snps, ld_block_snps, clinical_features, clinical_snps in rows:
            gene = ensure_quotes(gene)

            # Gene -> SNP associations over all four SNP sources
            every_snp = itt.chain(
                lit_snps or [],
                gwas_snps or [],
                ld_block_snps or [],
                clinical_snps or [],
            )
            for snp in every_snp:
                if snp.strip():
                    _associate(Gene('HGNC', gene), Gene('DBSNP', snp))

            for clinical_feature in clinical_features or []:
                if not clinical_feature.strip():
                    continue

                lowered = clinical_feature.lower()
                if lowered not in nift_values:
                    unknown_features.add(clinical_feature)
                    continue

                if clinical_feature not in canonical_names:
                    # fix capitalization
                    canonical = nift_values[lowered]
                    recapitalized.add((clinical_feature, canonical))
                    clinical_feature = canonical

                _associate(Gene('HGNC', gene), Abundance('NIFT', clinical_feature))

                for clinical_snp in clinical_snps or []:
                    _associate(Gene('DBSNP', clinical_snp), Abundance('NIFT', clinical_feature))

    if unknown_features:
        logger.warning('Missing Features in %s', disease)
        for feature in unknown_features:
            logger.warning(feature)

    if recapitalized:
        logger.warning('Fixed capitalization')
        for broken, fixed in recapitalized:
            logger.warning('%s -> %s', broken, fixed)

    return graph
def as_cas_bel(self) -> Abundance:
    """Build the PyBEL abundance that identifies this drug by its CAS number."""
    # See https://www.ebi.ac.uk/miriam/main/datatypes/MIR:00000237
    cas_identifier = self.cas_number
    return Abundance(namespace='cas', identifier=cas_identifier)
# -*- coding: utf-8 -*- from bio2bel_hmdb.enrich import * from pybel import BELGraph from pybel.dsl import Abundance, Pathology, Protein from tests.constants import DatabaseMixin hmdb_tuple1 = Abundance('HMDB', 'HMDB00008') protein_tuple = Protein('UP', 'P50440') # test enriching with tissues hmdb_tuple2 = Abundance('HMDB', 'HMDB00064') disease_tuple = Pathology('HMDB_D', 'Lung Cancer') class TestEnrich(DatabaseMixin): def test_enrich_metabolites_proteins(self): g = BELGraph() g.add_node_from_data(hmdb_tuple1) self.assertEqual(1, g.number_of_nodes()) self.assertEqual(0, g.number_of_edges()) enrich_metabolites_proteins(g, self.manager) self.assertEqual(4, g.number_of_nodes()) self.assertEqual(3, g.number_of_edges()) self.assertTrue(g.has_edge(protein_tuple, hmdb_tuple1)) def test_enrich_metabolites_diseases(self): g = BELGraph() g.add_node_from_data(hmdb_tuple2)
def test_abundance_as_bel(self):
    """Check the BEL form of an abundance whose name needs no quotation."""
    expected_bel = 'a(HGNC:YFG)'
    node = Abundance(namespace='HGNC', name='YFG')
    self.assertEqual(expected_bel, node.as_bel())
def as_drugbank_bel(self) -> Abundance:
    """Build the PyBEL abundance that identifies this drug by name and DrugBank identifier."""
    fields = dict(
        namespace=MODULE_NAME,
        name=self.name,
        identifier=self.drugbank_id,
    )
    return Abundance(**fields)
def test_abundance_as_bel_quoted(self):
    """Check the BEL form of an abundance whose name has to be quoted."""
    expected_bel = 'a(HGNC:"YFG-1")'
    node = Abundance(namespace='HGNC', name='YFG-1')
    self.assertEqual(expected_bel, node.as_bel())
def test_as_tuple(self):
    """Check that a node and its BEL string have the same hash."""
    node = Abundance(namespace=n(), name=n())
    node_hash = hash(node)
    bel_hash = hash(node.as_bel())
    self.assertEqual(node_hash, bel_hash)
def test_abundance_as_no_quotes(self):
    """Check that a namespace with a dash and a name with dots are emitted without quotes."""
    expected_bel = 'a(a-c:d.e.f)'
    node = Abundance(namespace='a-c', name='d.e.f')
    self.assertEqual(expected_bel, node.as_bel())
# Module-level nodes for the tests below. The digit in each variable name
# matches the node's name; the letter indicates the entity class.
g1 = Gene(namespace=HGNC, name='1')
r1 = Rna(namespace=HGNC, name='1')
p1 = Protein(HGNC, name='1')

g2 = Gene(HGNC, name='2')
r2 = Rna(HGNC, name='2')
p2 = Protein(HGNC, name='2')

g3 = Gene(namespace=HGNC, name='3')
r3 = Rna(namespace=HGNC, name='3')
p3 = Protein(namespace=HGNC, name='3')

g4 = Gene(namespace=HGNC, name='4')
m4 = MicroRna(namespace=HGNC, name='4')

a5 = Abundance(namespace=CHEBI, name='5')
p5 = Pathology(namespace=GO, name='5')


class TestCollapseProteinInteractions(unittest.TestCase):
    """Tests on graphs containing protein-protein edges."""

    def test_protein_interaction_1(self):
        # NOTE(review): only the graph setup is visible here; the call under
        # test and its assertions may lie outside this excerpt.
        graph = BELGraph()

        # Two proteins plus an abundance and a pathology
        graph.add_node_from_data(p1)
        graph.add_node_from_data(p2)
        graph.add_node_from_data(a5)
        graph.add_node_from_data(p5)

        # Two edges from p1 to p2 with different relations
        graph.add_qualified_edge(p1, p2, relation=POSITIVE_CORRELATION, citation=n(), evidence=n())
        graph.add_qualified_edge(p1, p2, relation=INCREASES, citation=n(), evidence=n())
def as_smiles_bel(self) -> Abundance:
    """Build the PyBEL abundance that identifies this drug by its SMILES string."""
    smiles_identifier = self.smiles
    return Abundance(namespace='smiles', identifier=smiles_identifier)
def test_reaction(self):
    """Check the string form of a reaction with one reactant and one product."""
    reactant = Abundance(namespace='CHEBI', name='A')
    product = Abundance(namespace='CHEBI', name='B')
    node = Reaction(reactants=[reactant], products=[product])
    expected_bel = 'rxn(reactants(a(CHEBI:A)), products(a(CHEBI:B)))'
    self.assertEqual(expected_bel, str(node))
def as_inchikey_bel(self) -> Abundance:
    """Build the PyBEL abundance that identifies this drug by its InChI-key."""
    # See https://www.ebi.ac.uk/miriam/main/datatypes/MIR:00000387
    inchikey_identifier = self.inchikey
    return Abundance(namespace='inchikey', identifier=inchikey_identifier)
def as_pubchem_compound_bel(self) -> Abundance:
    """Build the PyBEL abundance that identifies this drug by its PubChem compound identifier."""
    pubchem_identifier = self.pubchem_compound_id
    return Abundance(namespace='pubchem.compound', identifier=pubchem_identifier)
def normalize_graph_names(graph: BELGraph, database: str) -> None:
    """Normalize the names of the nodes in the graph.

    Each named node is mapped to one replacement node (or, where a name holds
    several entities, to a set of replacement nodes) with a lower-cased,
    stripped name. Both mappings are then passed to ``relabel_nodes`` and
    ``multi_relabel`` together with the graph.

    :param graph: the graph whose nodes are inspected and relabeled
    :param database: the source database; compared against ``REACTOME``,
     ``WIKIPATHWAYS`` and the literal ``'wikipathways'`` to select
     database-specific handling
    """
    # Victim to Survivor (one to one node) mapping
    one_to_one_mapping = {}
    # Victim to Survivors (one to many nodes) mapping
    one_to_many_mapping = defaultdict(set)

    for node in graph.nodes():
        # Skip ListAbundances and Reactions since they do not have a name
        if isinstance(node, ListAbundance) or isinstance(
                node, Reaction) or not node.name:
            continue

        # Normalize names: Lower case name and strip quotes or white spaces
        lower_name = node.name.lower().strip('"').strip()

        # Dealing with Genes/miRNAs
        if isinstance(node, CentralDogma):

            ##################
            # miRNA entities #
            ##################

            if lower_name.startswith("mir"):

                # Reactome preprocessing to flat multiple identifiers
                if database == REACTOME:
                    reactome_cell = munge_reactome_gene(lower_name)
                    if isinstance(reactome_cell, list):
                        # NOTE(review): the loop variable rebinds lower_name, so
                        # the code after the loop sees the last list element.
                        for lower_name in reactome_cell:
                            one_to_many_mapping[node].add(
                                MicroRna(
                                    node.namespace,
                                    name=lower_name.replace("mir-", "mir"),
                                    identifier=node.identifier,
                                ),
                            )

                    # Drop a trailing " genes" / " gene" suffix
                    if lower_name.endswith(' genes'):
                        lower_name = lower_name[:-len(' genes')]
                    elif lower_name.endswith(' gene'):
                        lower_name = lower_name[:-len(' gene')]

                    # NOTE(review): with this nesting, a node whose cell is a
                    # list ends up in both mappings, and this MicroRna is built
                    # without an identifier - confirm both are intended.
                    one_to_one_mapping[node] = MicroRna(
                        node.namespace,
                        name=lower_name.replace(
                            "mir-", "mir"),  # Special case for Reactome
                    )
                    continue

                # KEGG and Reactome
                # NOTE(review): the Reactome branch above always continues, so
                # this looks reachable only for other databases; it also uses
                # node.name rather than lower_name - confirm.
                one_to_one_mapping[node] = MicroRna(
                    node.namespace,
                    name=node.name.replace("mir-", "mir"),
                    identifier=node.identifier,
                )

            ##################
            # Genes entities #
            ##################

            else:
                # Reactome preprocessing to flat multiple identifiers
                if database == REACTOME:
                    reactome_cell = munge_reactome_gene(lower_name)
                    if isinstance(reactome_cell, list):
                        for lower_name in reactome_cell:
                            if lower_name in BLACK_LIST_REACTOME:  # Filter entities in black list
                                continue
                            elif lower_name.startswith(
                                    "("):  # remove redundant parentheses
                                lower_name = lower_name.strip("(").strip(")")

                            one_to_many_mapping[node].add(
                                Protein(node.namespace,
                                        name=lower_name,
                                        identifier=node.identifier),
                            )
                    else:
                        one_to_one_mapping[node] = Protein(
                            node.namespace,
                            name=lower_name,
                            identifier=node.identifier)

                    # NOTE(review): assumed to end the Reactome case for both
                    # branches above - confirm against the original nesting.
                    continue

                # WikiPathways and KEGG do not require any processing of genes
                elif database == WIKIPATHWAYS and lower_name in WIKIPATHWAYS_BIOL_PROCESS:
                    one_to_one_mapping[node] = BiologicalProcess(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                one_to_one_mapping[node] = Protein(node.namespace,
                                                   name=lower_name,
                                                   identifier=node.identifier)

        #######################
        # Metabolite entities #
        #######################

        elif isinstance(node, Abundance):

            # NOTE(review): this compares against the literal 'wikipathways'
            # while the gene branch uses the WIKIPATHWAYS constant - confirm
            # the two are equal.
            if database == 'wikipathways':
                # Biological processes that are captured as abundance in
                # BEL since they were characterized wrong in WikiPathways
                if lower_name in WIKIPATHWAYS_BIOL_PROCESS:
                    one_to_one_mapping[node] = BiologicalProcess(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Abundances to BiologicalProcesses
                elif (node.namespace in {'WIKIDATA', 'WIKIPATHWAYS', 'REACTOME'}
                      and lower_name not in WIKIPATHWAYS_METAB):
                    one_to_one_mapping[node] = BiologicalProcess(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Fix naming in duplicate entity
                if lower_name in WIKIPATHWAYS_NAME_NORMALIZATION:
                    lower_name = WIKIPATHWAYS_NAME_NORMALIZATION[lower_name]

            elif database == REACTOME:
                # Curated proteins that were coded as metabolites
                if lower_name in REACTOME_PROT:
                    one_to_one_mapping[node] = Protein(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Flat multiple identifiers (this is not trivial because most of ChEBI names contain commas,
                # so a clever way to fix some of the entities is to check that all identifiers contain letters)
                elif "," in lower_name and all(
                        string.isalpha()
                        for string in lower_name.split(",")):
                    for string in lower_name.split(","):
                        one_to_many_mapping[node].add(
                            Abundance(node.namespace,
                                      name=string,
                                      identifier=node.identifier),
                        )
                    continue

            # Default for abundances: same class, normalized name
            one_to_one_mapping[node] = Abundance(node.namespace,
                                                 name=lower_name,
                                                 identifier=node.identifier)

        #################################
        # Biological Processes entities #
        #################################

        elif isinstance(node, BiologicalProcess):
            # KEGG normalize name by removing the title prefix
            if lower_name.startswith('title:'):
                lower_name = lower_name[len('title:'):]

            one_to_one_mapping[node] = BiologicalProcess(
                node.namespace,
                name=lower_name,
                identifier=node.identifier,
            )

    # Apply the one-to-one mapping first, then the one-to-many mapping
    relabel_nodes(graph, one_to_one_mapping)
    multi_relabel(graph, one_to_many_mapping)