def get_common_genes(disease_pairs, networks, writing_files):
    """Add the genes shared by both diseases of each pair to the pair's network.

    For every disease pair the network at the same index in ``networks`` gets
    both disease nodes, one gene node per gene that ``get_genes`` reports for
    both diseases, and an ``ASSOCIATES_WITH`` edge from that gene to each
    disease.  If ``writing_files`` is truthy and at least one common gene was
    found, the gene ids are written (sorted, one per line) to
    ``../analysis/disease_pairs/<d1>_<d2>/<d1>_<d2>_common_genes.tsv`` where
    ``:`` in the disease ids is replaced by ``-``.

    :param disease_pairs: sequence of two-element sequences of disease ids
    :param networks: sequence of networks, index-aligned with ``disease_pairs``
    :param writing_files: whether the per-pair TSV file is written
    :return: list of the updated networks, one per disease pair
    """
    new_networks = []
    for index, disease_pair in enumerate(disease_pairs):
        network = networks[index]
        d1_genes, d2_genes = get_genes(disease_pair)
        common_genes = d1_genes.intersection(d2_genes)

        d1 = Disease([disease_pair[0]], [])
        network.add_node(d1)
        d2 = Disease([disease_pair[1]], [])
        network.add_node(d2)

        for g_id in common_genes:
            gene = Gene([g_id], [])
            network.add_node(gene)
            network.add_edge(Edge(gene, d1, 'ASSOCIATES_WITH', {}))
            network.add_edge(Edge(gene, d2, 'ASSOCIATES_WITH', {}))

        if len(common_genes) > 0 and writing_files:
            temp_id1 = disease_pair[0].replace(':', '-')
            temp_id2 = disease_pair[1].replace(':', '-')
            path = '../analysis/disease_pairs/' + temp_id1 + '_' + temp_id2
            # makedirs also creates missing parent directories; plain mkdir
            # raised FileNotFoundError on a fresh checkout.
            os.makedirs(path, exist_ok=True)
            with io.open(path + '/' + temp_id1 + '_' + temp_id2 + '_common_genes.tsv', 'w', encoding='utf-8',
                         newline='') as common_genes_file:
                common_genes_file.write('#Common genes of ' + disease_pair[0] + ' and ' + disease_pair[1] + '\n')
                # sorted so that repeated runs produce identical files
                for gene in sorted(common_genes):
                    common_genes_file.write(gene + '\n')

        new_networks.append(network)
    print('Done getting genes')
    return new_networks
def get_common_variants(disease_pairs, networks, writing_files):
    """Add variants linking both diseases of each pair to the pair's network.

    Two kinds of variants are collected per disease pair:

    * variants directly connected to both diseases (recorded as
      ``'disease associated'``), and
    * variants connected to a gene that ``get_genes`` reports for both
      diseases (recorded as ``'<variant>-<gene>'`` / ``'gene associated'``).

    The matching nodes and edges are added to the network at the same index
    in ``networks``.  If ``writing_files`` is truthy and at least one variant
    was found, the collected rows are written to
    ``../analysis/disease_pairs/<d1>_<d2>/<d1>_<d2>_common_variants.tsv``.

    :param disease_pairs: sequence of two-element sequences of disease ids
    :param networks: sequence of networks, index-aligned with ``disease_pairs``
    :param writing_files: whether the per-pair TSV file is written
    :return: list of the updated networks, one per disease pair
    """
    new_networks = []
    for index, disease_pair in enumerate(disease_pairs):
        network = networks[index]
        d1 = Disease([disease_pair[0]], [])
        network.add_node(d1)
        d2 = Disease([disease_pair[1]], [])
        network.add_node(d2)

        common_variants = []  # each variant is an array: [label, origin]
        query = """ MATCH (d1:Disease)--(v:Variant)--(d2:Disease) WHERE {d1_id} in d1.ids AND {d2_id} in d2.ids RETURN v.`_id` """
        results = session.run(query, parameters={'d1_id': disease_pair[0], 'd2_id': disease_pair[1]})
        for result in results:
            v_id = result['v.`_id`']
            common_variants.append([v_id, 'disease associated'])
            variant = Variant([v_id], [])
            network.add_node(variant)
            network.add_edge(Edge(d1, variant, 'ASSOCIATES_WITH', {}))
            network.add_edge(Edge(d2, variant, 'ASSOCIATES_WITH', {}))

        # variants associated to common genes
        d1_genes, d2_genes = get_genes(disease_pair)
        common_genes_ids = d1_genes.intersection(d2_genes)
        for gene_id in common_genes_ids:
            query = """ MATCH (g:Gene)-[a]-(v:Variant) WHERE {g_id} in g.ids RETURN v.`_id`, type(a) """
            results = session.run(query, parameters={'g_id': gene_id})
            for result in results:
                v_id = result['v.`_id`']
                edge_type = result['type(a)']  # can be CODES or EQTL
                variant_pair = v_id + '-' + gene_id
                common_variants.append([variant_pair, 'gene associated'])
                variant = Variant([v_id], [])
                network.add_node(variant)
                gene = Gene([gene_id], [])
                network.add_node(gene)
                network.add_edge(Edge(gene, variant, edge_type, {}))
                network.add_edge(Edge(gene, d1, 'ASSOCIATES_WITH', {}))
                network.add_edge(Edge(gene, d2, 'ASSOCIATES_WITH', {}))

        new_networks.append(network)

        if len(common_variants) > 0 and writing_files:
            temp_id1 = disease_pair[0].replace(':', '-')
            temp_id2 = disease_pair[1].replace(':', '-')
            path = '../analysis/disease_pairs/' + temp_id1 + '_' + temp_id2
            # makedirs also creates missing parent directories; plain mkdir
            # raised FileNotFoundError on a fresh checkout.
            os.makedirs(path, exist_ok=True)
            with io.open(path + '/' + temp_id1 + '_' + temp_id2 + '_common_variants.tsv', 'w', encoding='utf-8',
                         newline='') as common_variants_file:
                common_variants_file.write(
                    '#Common variants associated with ' + disease_pair[0] + ' and ' + disease_pair[1] + '\n')
                for variant in common_variants:
                    common_variants_file.write(variant[0] + '\t' + variant[1] + '\n')
    print('Done getting variants')
    return new_networks
def new_gene(self, size: int, value_bounds: tuple, name: str):
    """Create a float gene with ``size`` random values and store it in ``self.genes``.

    Every value is drawn as ``midpoint +/- half_range * random.random()``, so
    it lies inside the interval spanned by the two bounds.  The bounds may be
    given in either order; the ``Gene`` receives them exactly as passed in.

    :param size: number of values to generate
    :param value_bounds: the two interval limits, read as ``value_bounds[0]``
        and ``value_bounds[1]``
    :param name: key under which the gene is stored; also used as its ``geneID``
    :raises Exception: if ``name`` is already a key of ``self.genes``
    """
    if name in self.genes:
        raise Exception("gene with this name already present in genome !")

    val_plusminus = math.fabs(value_bounds[0] - value_bounds[1]) / 2
    # Anchor the midpoint on the smaller bound.  Using value_bounds[0]
    # unconditionally put the midpoint above the interval for (high, low).
    val_midpoint = min(value_bounds[0], value_bounds[1]) + val_plusminus

    new_values = []
    for _ in range(size):
        # two draws per value: first the sign, then the magnitude
        sign = 1 if random.random() >= 0.5 else -1
        new_values.append(val_midpoint + sign * val_plusminus * random.random())

    self.genes[name] = Gene(val_type=float, geneID=name, initial_values=new_values,
                            value_bounds=(value_bounds[0], value_bounds[1]))
def setUp(self):
    """Build the gene fixtures used by the tests.

    ``geneA_u``, ``geneB_u`` and ``geneC_u`` are created with
    ``static_values=True``; ``geneD`` and ``geneE`` are created without that
    argument.
    """
    # NOTE(review): val_type=str is passed for the numeric fixtures as well -
    # confirm this is intended.
    def static_gene(gene_id, values):
        return Gene(val_type=str, geneID=gene_id, initial_values=values, static_values=True)

    def plain_gene(gene_id, values):
        return Gene(val_type=str, geneID=gene_id, initial_values=values)

    self.geneA_u = static_gene("test_gene", ['A', 'B', 'C', 'D', 'E', 'F'])
    self.geneB_u = static_gene("test_gene", ['B', 'F', 'C', 'A', 'D', 'E'])
    self.geneC_u = static_gene("test_gene2", [0, 1, 2, 3, 4, 5])
    self.geneD = plain_gene("test_gene3", [0.3, 0.5, 0.2, 0.8, 0.98, 0.05])
    self.geneE = plain_gene("test_gene3", [0.13, 0.1, 0.52, 0.98, 0.25, 0.65])
def _regulating_rna_ids(gene_id):
    """Return the ids of all RNAs connected to ``gene_id`` by a REGULATES edge."""
    query = """ MATCH (g:Gene)-[:REGULATES]-(r:RNA) WHERE {gene_id} IN g.ids RETURN distinct(r.`_id`) """
    results = session.run(query, parameters={'gene_id': gene_id})
    return [result['(r.`_id`)'] for result in results]


def _group_genes_by_rna(gene_ids):
    """Map each RNA id to the list of genes from ``gene_ids`` it is connected to."""
    genes_by_rna = {}
    for gene_id in gene_ids:
        for rna_id in _regulating_rna_ids(gene_id):
            genes_by_rna.setdefault(rna_id, []).append(gene_id)
    return genes_by_rna


def get_common_rnas(disease_pairs, networks, writing_files):
    """Add RNAs that regulate genes of both diseases of each pair to the pair's network.

    An RNA counts as common when it regulates a gene associated with both
    diseases, or when it regulates at least one gene that is only associated
    with the first disease and at least one that is only associated with the
    second.  For every common RNA the RNAs regulating it (excluding those
    whose ``label_id`` contains ``"MRNA"``) are added as well.

    If ``writing_files`` is truthy and at least one common RNA was found, a
    table sorted by the number of regulated genes is written to
    ``../analysis/disease_pairs/<d1>_<d2>/<d1>_<d2>_common_rnas.tsv``.

    :param disease_pairs: sequence of two-element sequences of disease ids
    :param networks: sequence of networks, index-aligned with ``disease_pairs``
    :param writing_files: whether the per-pair TSV file is written
    :return: list of the updated networks, one per disease pair
    """
    new_networks = []
    for index, disease_pair in enumerate(disease_pairs):
        network = networks[index]
        d1 = Disease([disease_pair[0]], [])
        network.add_node(d1)
        d2 = Disease([disease_pair[1]], [])
        network.add_node(d2)

        d1_genes_ids, d2_genes_ids = get_genes(disease_pair)
        # this differentiation is done to get the correct number of regulated, in this subgraph present genes
        d1_only_genes_ids = d1_genes_ids.difference(d2_genes_ids)
        d2_only_genes_ids = d2_genes_ids.difference(d1_genes_ids)
        common_genes_ids = d1_genes_ids.intersection(d2_genes_ids)

        # dict with the RNA id as key and the regulated genes as an array as value
        common_rnas = {}
        for gene_id in common_genes_ids:
            for rna_id in _regulating_rna_ids(gene_id):
                common_rnas.setdefault(rna_id, []).append(gene_id)
                gene = Gene([gene_id], [])
                network.add_node(gene)
                rna = RNA([rna_id], [])
                network.add_node(rna)
                network.add_edge(Edge(rna, gene, 'REGULATES', {}))
                network.add_edge(Edge(gene, d1, 'ASSOCIATES_WITH', {}))
                network.add_edge(Edge(gene, d2, 'ASSOCIATES_WITH', {}))

        rnas_d1_only_genes = _group_genes_by_rna(d1_only_genes_ids)
        rnas_d2_only_genes = _group_genes_by_rna(d2_only_genes_ids)

        for rna_id, g1_ids in rnas_d1_only_genes.items():
            if rna_id in common_rnas:
                # common_rnas have already been added to the network, here only the list of regulated genes
                # is extended (no nodes or edges are added for these genes)
                common_rnas[rna_id] = common_rnas[rna_id] + g1_ids
            elif rna_id in rnas_d2_only_genes:
                # RNA regulates genes associated to d1 and genes associated to d2, RNA does not regulate a
                # common gene.  The entry is removed from the d2 dict so the loop below does not see it again.
                g2_ids = rnas_d2_only_genes.pop(rna_id)
                common_rnas[rna_id] = g1_ids + g2_ids
                rna = RNA([rna_id], [])
                network.add_node(rna)
                for g_id in g1_ids:
                    gene = Gene([g_id], [])
                    network.add_node(gene)
                    network.add_edge(Edge(gene, d1, 'ASSOCIATES_WITH', {}))
                    network.add_edge(Edge(rna, gene, 'REGULATES', {}))
                for g_id in g2_ids:
                    gene = Gene([g_id], [])
                    network.add_node(gene)
                    network.add_edge(Edge(gene, d2, 'ASSOCIATES_WITH', {}))
                    network.add_edge(Edge(rna, gene, 'REGULATES', {}))

        for rna_id, g2_ids in rnas_d2_only_genes.items():
            if rna_id in common_rnas:
                # common_rnas have already been added to the network, here only the list of regulated genes
                # is extended
                common_rnas[rna_id] = common_rnas[rna_id] + g2_ids

        # for each RNA add an array of RNAs, which regulate this RNA. MRNAs are not included
        for rna_id in common_rnas:
            second_rnas = []
            query = """MATCH (r:RNA)-[:REGULATES]-(n:RNA) WHERE {r_id} IN r.ids AND NOT n.label_id CONTAINS "MRNA" RETURN distinct(n.`_id`) """
            results = session.run(query, parameters={'r_id': rna_id})
            rna = RNA([rna_id], [])
            network.add_node(rna)
            for result in results:
                second_rna_id = result['(n.`_id`)']
                second_rnas.append(second_rna_id)
                second_rna = RNA([second_rna_id], [])
                network.add_node(second_rna)
                network.add_edge(Edge(second_rna, rna, 'REGULATES', {}))
            # the value of common_rnas is now changed to an array where at the first position the array with the
            # regulated genes from this subgraph is stored and at the second position the array with RNAs
            # regulating the RNA is stored (only values are replaced, so iterating the dict stays valid)
            common_rnas[rna_id] = [common_rnas[rna_id], second_rnas]

        new_networks.append(network)

        if len(common_rnas) > 0 and writing_files:
            temp_id1 = disease_pair[0].replace(':', '-')
            temp_id2 = disease_pair[1].replace(':', '-')
            path = '../analysis/disease_pairs/' + temp_id1 + '_' + temp_id2
            # makedirs also creates missing parent directories; plain mkdir
            # raised FileNotFoundError on a fresh checkout.
            os.makedirs(path, exist_ok=True)
            with io.open(path + '/' + temp_id1 + '_' + temp_id2 + '_common_rnas.tsv', 'w', encoding='utf-8',
                         newline='') as common_rnas_file:
                common_rnas_file.write('#Common rnas of ' + disease_pair[0] + ' and ' + disease_pair[1] +
                                       '\tsorted by number of regulated genes\tRegulated genes\tRNAs regulating the RNA\n')
                # sort by the number of genes in this subgraph which are regulated by the RNA
                for key, value in sorted(common_rnas.items(), key=lambda item: len(item[1][0]), reverse=True):
                    # joined directly instead of stripping brackets/quotes from str(list), which also
                    # removed those characters from inside the ids themselves
                    regulated_genes = ', '.join(str(g_id) for g_id in value[0])
                    second_rnas = ', '.join(str(r_id) for r_id in value[1])
                    common_rnas_file.write(key + '\t' + str(len(value[0])) + '\t' + regulated_genes + '\t' +
                                           second_rnas + '\n')
    print('Done getting RNAs')
    return new_networks
def get_common_drugs(disease_pairs, networks, writing_files):
    """Add drugs connected to both diseases of each pair to the pair's network.

    Five queries are run per disease pair; a drug is recorded when

    1. / 2. it has a direct relationship to both diseases (the edge to the
       first disease is taken from query 1, the edge to the second from
       query 2),
    3. / 4. it has a direct relationship to one disease and targets a gene
       associated with the other, or
    5. it targets a gene associated with the first disease and a gene
       associated with the second (the query does not require the two genes
       to differ).

    If ``writing_files`` is truthy and at least one drug was found, the drug
    ids are written (sorted, one per line) to
    ``../analysis/disease_pairs/<d1>_<d2>/<d1>_<d2>_common_drugs.tsv``.

    :param disease_pairs: sequence of two-element sequences of disease ids
    :param networks: sequence of networks, index-aligned with ``disease_pairs``
    :param writing_files: whether the per-pair TSV file is written
    :return: list of the updated networks, one per disease pair
    """
    new_networks = []
    for index, disease_pair in enumerate(disease_pairs):
        network = networks[index]
        d1 = Disease([disease_pair[0]], [])
        network.add_node(d1)
        d2 = Disease([disease_pair[1]], [])
        network.add_node(d2)
        common_drugs = set()

        # the drug INDICATES, CONTRAINDICATES or INDUCES both diseases
        query = """ MATCH (d1:Disease)-[a]-(n:Drug)--(d2:Disease) WHERE {d1_id} IN d1.ids AND {d2_id} IN d2.ids RETURN distinct(type(a)), n.`_id` """
        results = session.run(query, parameters={'d1_id': disease_pair[0], 'd2_id': disease_pair[1]})
        for result in results:
            drug_id = result['n.`_id`']
            edge_type = result['(type(a))']
            common_drugs.add(drug_id)
            drug = Drug([drug_id], [])
            network.add_node(drug)
            network.add_edge(Edge(drug, d1, edge_type, {}))

        query = """ MATCH (d1:Disease)--(n:Drug)-[a]-(d2:Disease) WHERE {d1_id} IN d1.ids AND {d2_id} IN d2.ids RETURN distinct(type(a)), n.`_id` """
        results = session.run(query, parameters={'d1_id': disease_pair[0], 'd2_id': disease_pair[1]})
        for result in results:
            drug_id = result['n.`_id`']
            edge_type = result['(type(a))']
            common_drugs.add(drug_id)
            drug = Drug([drug_id], [])
            network.add_node(drug)
            network.add_edge(Edge(drug, d2, edge_type, {}))

        # the drug targets a gene of one disease and is associated to the other disease
        query = """ MATCH (d1:Disease)-[a]-(n:Drug)-[:TARGETS]-(g:Gene)-[:ASSOCIATES_WITH]-(d2:Disease) WHERE {d1_id} IN d1.ids AND {d2_id} IN d2.ids RETURN distinct(type(a)), n.`_id`, g.`_id` """
        results = session.run(query, parameters={'d1_id': disease_pair[0], 'd2_id': disease_pair[1]})
        for result in results:
            drug_id = result['n.`_id`']
            edge_type = result['(type(a))']
            common_drugs.add(drug_id)
            drug = Drug([drug_id], [])
            network.add_node(drug)
            network.add_edge(Edge(drug, d1, edge_type, {}))
            gene_id = result['g.`_id`']
            gene = Gene([gene_id], [])
            network.add_node(gene)
            network.add_edge(Edge(drug, gene, 'TARGETS', {'actions': []}))
            network.add_edge(Edge(gene, d2, 'ASSOCIATES_WITH', {}))

        query = """ MATCH (d2:Disease)-[a]-(n:Drug)-[:TARGETS]-(g:Gene)-[:ASSOCIATES_WITH]-(d1:Disease) WHERE {d1_id} IN d1.ids AND {d2_id} IN d2.ids RETURN distinct(type(a)), n.`_id`, g.`_id` """
        results = session.run(query, parameters={'d1_id': disease_pair[0], 'd2_id': disease_pair[1]})
        for result in results:
            drug_id = result['n.`_id`']
            edge_type = result['(type(a))']
            common_drugs.add(drug_id)
            drug = Drug([drug_id], [])
            network.add_node(drug)
            network.add_edge(Edge(drug, d2, edge_type, {}))
            gene_id = result['g.`_id`']
            gene = Gene([gene_id], [])
            network.add_node(gene)
            network.add_edge(Edge(drug, gene, 'TARGETS', {'actions': []}))
            network.add_edge(Edge(gene, d1, 'ASSOCIATES_WITH', {}))

        # the drug targets one gene which is associated to both diseases or the drug targets two different genes
        # where each gene is associated to one of the diseases
        query = """ MATCH (d1:Disease)-[:ASSOCIATES_WITH]-(g1:Gene)-[:TARGETS]-(n:Drug)-[:TARGETS]-(g2:Gene)- [:ASSOCIATES_WITH]-(d2:Disease) WHERE {d1_id} IN d1.ids AND {d2_id} IN d2.ids RETURN n.`_id`, g1.`_id`, g2.`_id` """
        results = session.run(query, parameters={'d1_id': disease_pair[0], 'd2_id': disease_pair[1]})
        for result in results:
            drug_id = result['n.`_id`']
            common_drugs.add(drug_id)
            g1_id = result['g1.`_id`']
            g2_id = result['g2.`_id`']
            g1 = Gene([g1_id], [])
            network.add_node(g1)
            network.add_edge(Edge(g1, d1, 'ASSOCIATES_WITH', {}))
            drug = Drug([drug_id], [])
            network.add_node(drug)
            network.add_edge(Edge(drug, g1, 'TARGETS', {'actions': []}))
            g2 = Gene([g2_id], [])
            network.add_node(g2)
            network.add_edge(Edge(drug, g2, 'TARGETS', {'actions': []}))
            network.add_edge(Edge(g2, d2, 'ASSOCIATES_WITH', {}))

        new_networks.append(network)

        if len(common_drugs) > 0 and writing_files:
            temp_id1 = disease_pair[0].replace(':', '-')
            temp_id2 = disease_pair[1].replace(':', '-')
            path = '../analysis/disease_pairs/' + temp_id1 + '_' + temp_id2
            # makedirs also creates missing parent directories; plain mkdir
            # raised FileNotFoundError on a fresh checkout.
            os.makedirs(path, exist_ok=True)
            with io.open(path + '/' + temp_id1 + '_' + temp_id2 + '_common_drugs.tsv', 'w', encoding='utf-8',
                         newline='') as common_drugs_file:
                common_drugs_file.write('#Common drugs of ' + disease_pair[0] + ' and ' + disease_pair[1] + '\n')
                # sorted so that repeated runs produce identical files
                for drug in sorted(common_drugs):
                    common_drugs_file.write(drug + '\n')
    print('Done getting drugs')
    return new_networks
# Build the OMIM gene-disease association graph from the filtered CSV export
# and save it as JSON.
network = Network()

# Column layout of filtered_associations.csv:
# 0 Location
# 1 Phenotype
# 2 Phenotype MIM number
# 3 Inheritance
# 4 Phenotype mapping key
# 5 Gene/Locus
# 6 Gene/Locus MIM number
with io.open('../data/OMIM/filtered_associations.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    next(reader, None)  # skip the header row
    for row in reader:
        disease = Disease(['OMIM:%s' % row[2]], [])
        network.add_node(disease)
        gene = Gene(['HGNC:%s' % row[5]], [])  # , 'OMIM:%s' % row[6]
        network.add_node(gene)
        rel = {
            'source': 'OMIM',
            'location': row[0],
            'phenotype': row[1],
            # column 3 holds the inheritance; column 2 (used here before) is
            # the phenotype MIM number already stored as the disease id
            'inheritance': row[3],
            'phenotype_mapping_key': row[4]
        }
        network.add_edge(Edge(gene, disease, 'ASSOCIATES_WITH', rel))

network.save('../data/OMIM/graph.json')
import io
import csv
from model.network import Network
from model.disease import Disease
from model.gene import Gene

# Build the HPO disease/gene graph from the OMIM frequent-features annotation
# file, downloading the file first when it is not present locally.
file = '../data/HPO/OMIM_FREQUENT_FEATURES_diseases_to_genes_to_phenotypes.txt'
url = 'http://compbio.charite.de/jenkins/job/hpo.annotations.monthly/lastSuccessfulBuild/artifact/annotation/' + \
      'OMIM_FREQUENT_FEATURES_diseases_to_genes_to_phenotypes.txt'

if not os.path.exists(file):
    print('Database does not exist. Trying to download...')
    # Download into a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated file that the existence check above
    # would accept as complete on the next run.
    partial_file = file + '.part'
    with urllib.request.urlopen(url) as response, open(partial_file, 'wb') as f:
        f.write(response.read())
    os.replace(partial_file, file)

network = Network()

with io.open(file, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t', quotechar='"')
    next(reader, None)  # skip the first row
    for row in reader:
        disease = Disease([row[0]], [])
        network.add_node(disease)
        gene = Gene(['HGNC:%s' % row[1], 'Entrez:%s' % row[2]], [])
        network.add_node(gene)
        # read but not used yet, see TODO
        hpo_id = row[3]
        hpo_term_name = row[4]
        # TODO

network.save('../data/HPO/graph.json')
def add_rna(name, type, node_lookup):
    """Return the network node for an interactor, creating it on first use.

    Nodes are cached in ``node_lookup`` under ``name + '$' + type``.  On a
    cache miss an id is resolved depending on ``type``:

    * ``mRNA``, ``DNA``, ``TF``, ``protein``, ``RBP``: ``check_hgnc_id(name)``
    * ``miRNA``: ``get_mirna_id(name)``
    * anything else: ``get_rna_ids(name)``; when that yields no RNAcentral id
      (the string ``'None'``), ``check_hgnc_id(name)`` is used instead

    If the resolved id is the string ``'None'`` nothing is created.  Otherwise
    a node of the class matching ``type`` is created, added to the global
    ``network`` and stored in ``node_lookup``.

    :param name: interactor name
    :param type: interactor type string (parameter name kept for callers,
        although it shadows the builtin)
    :param node_lookup: dict used as cache; updated in place
    :return: the node, or ``None`` when no id could be resolved
    """
    key = name + '$' + type
    if key in node_lookup:
        return node_lookup[key]

    gene_like_types = ('DNA', 'TF', 'protein', 'RBP')

    if type == 'mRNA' or type in gene_like_types:
        interactor_id = check_hgnc_id(name)
    elif type == 'miRNA':
        interactor_id = get_mirna_id(name)
    else:
        rnacentral_id, interactor_id = get_rna_ids(name)
        if rnacentral_id == 'None':
            interactor_id = check_hgnc_id(name)

    if interactor_id == 'None':
        return None

    if type in gene_like_types:
        node = Gene([interactor_id], [])
    elif type == 'miRNA':
        node = MiRNA([interactor_id], [name])
    elif type == 'mRNA':
        node = MRNA([interactor_id], [])
    else:
        # node class per RNA type; unknown types fall back to the generic RNA
        rna_classes = {
            'circRNA': CircRNA,
            'eRNA': ERNA,
            'lncRNA': LncRNA,
            'ncRNA': NcRNA,
            'piRNA': PiRNA,
            'pseudo': Pseudogene,
            'ribozyme': Ribozyme,
            'rRNA': RRNA,
            'scaRNA': ScaRNA,
            'scRNA': ScRNA,
            'snoRNA': SnoRNA,
            'snRNA': SnRNA,
        }
        node_class = rna_classes.get(type, RNA)
        # the RNAcentral id is only part of the node ids when one was found
        if rnacentral_id == 'None':
            node_ids = [interactor_id]
        else:
            node_ids = [rnacentral_id, interactor_id]
        node = node_class(node_ids, [])

    network.add_node(node)
    node_lookup[key] = node
    return node
class TestGene(unittest.TestCase):
    """Tests for ``Gene.mutate`` and ``Gene.crossover_singlepoint`` with patched randomness."""

    def setUp(self):
        """Build the gene fixtures.

        ``geneA_u``, ``geneB_u`` and ``geneC_u`` are created with
        ``static_values=True``; ``geneD`` and ``geneE`` without that argument.
        """
        # NOTE(review): val_type=str is passed for the numeric fixtures as
        # well - confirm this is intended.
        def static_gene(gene_id, values):
            return Gene(val_type=str, geneID=gene_id, initial_values=values, static_values=True)

        def plain_gene(gene_id, values):
            return Gene(val_type=str, geneID=gene_id, initial_values=values)

        self.geneA_u = static_gene("test_gene", ['A', 'B', 'C', 'D', 'E', 'F'])
        self.geneB_u = static_gene("test_gene", ['B', 'F', 'C', 'A', 'D', 'E'])
        self.geneC_u = static_gene("test_gene2", [0, 1, 2, 3, 4, 5])
        self.geneD = plain_gene("test_gene3", [0.3, 0.5, 0.2, 0.8, 0.98, 0.05])
        self.geneE = plain_gene("test_gene3", [0.13, 0.1, 0.52, 0.98, 0.25, 0.65])

    @patch('model.gene.random.randrange')
    @patch('model.gene.random.random')
    def test_staticValues_mutation_1(self, mf_random, mf_randrange):
        """Mutating a static-values gene swaps positions of existing values."""
        # setup
        mf_random.side_effect = [0, 0, 1]  # determines how many swaps are done (2)
        mf_randrange.side_effect = [0, 1, 1, 4]  # specifying swap indices
        expected = ['B', 'E', 'C', 'D', 'A', 'F']

        # test
        self.geneA_u.mutate(0.5)
        self.assertEqual(expected, self.geneA_u.values)

    @patch('model.gene.random.randrange')
    @patch('model.gene.random.random')
    def test_values_mutation_1(self, mf_random, mf_randrange):
        """Mutating geneD changes the values at the patched indices."""
        # setup
        mf_randrange.side_effect = [0, 2, 4]  # indices
        mf_random.side_effect = [
            0, 0, 0, 1,  # setting number of mutations
            0, 0.5,  # sign and magnitude of change to value
            0, 1,
            1, 1
        ]
        expected = [0.275, 0.5, 0.15, 0.8, 1, 0.05]

        # test
        self.geneD.mutate(0.5)
        rounded_values = [round(x, 3) for x in self.geneD.values]
        self.assertEqual(expected, rounded_values)

    @patch('model.gene.random.randrange')
    @patch('model.gene.random.random')
    def test_values_mutation_2(self, mf_random, mf_randrange):
        """Single-point crossover of geneD with geneE for three patched indices."""
        # setup
        mf_randrange.side_effect = [0, 2, 4]  # indices
        # one (first child, second child) pair of expected values per crossover call
        expected_children = [
            ([0.13, 0.1, 0.52, 0.98, 0.25, 0.65], [0.3, 0.5, 0.2, 0.8, 0.98, 0.05]),
            ([0.3, 0.5, 0.52, 0.98, 0.25, 0.65], [0.13, 0.1, 0.2, 0.8, 0.98, 0.05]),
            ([0.3, 0.5, 0.2, 0.8, 0.25, 0.65], [0.13, 0.1, 0.52, 0.98, 0.98, 0.05]),
        ]

        # test
        for expected_first, expected_second in expected_children:
            genes = self.geneD.crossover_singlepoint(self.geneE)
            self.assertEqual(expected_first, genes[0].values)
            self.assertEqual(expected_second, genes[1].values)
# 11 - Has CPIC Dosing Guideline # 12 - Chromosome # 13 - Chromosomal Start - GRCh37.p13 # 14 - Chromosomal Stop - GRCh37.p13 # 15 - Chromosomal Start - GRCh38.p7 # 16 - Chromosomal Stop - GRCh38.p7 for row in reader: gene_ids = {'PharmGKB:%s' % row[0]} if row[2]: gene_ids.add('HGNC:%s' % row[2]) for ensembl_id in split_list(row[3]): gene_ids.add('Ensembl:%s' % ensembl_id) if row[5]: gene_ids.add('HGNC:%s' % row[5]) gene_ids.update(process_gene_cross_references(split_list(row[10]))) gene = Gene(gene_ids, [row[4]]) network.add_node(gene) with open_file_in_zip('../data/PharmGKB/variants.zip', 'variants.tsv') as f: reader = csv.reader(f, delimiter='\t', quotechar='"') next(reader, None) for row in reader: variant_ids = {'PharmGKB:%s' % row[0]} if row[1]: variant_ids.add('dbSNP:%s' % row[1]) variant = Variant(variant_ids, []) variant.attributes['location'] = row[4] network.add_node(variant) if row[2] and len(row[2]) > 0: for gene_id in [ 'PharmGKB:%s' % x.strip() for x in row[2].split(',')
import shutil  # imported here so this block stands alone; used for the streamed copies below

# Build the UniProtKB id-mapping graph, downloading and extracting the mapping
# file first when it is not present locally.
id_mapping_file = '../data/UniprotKB/HUMAN_9606_idmapping.dat'
id_mapping_zip_file = '../data/UniprotKB/HUMAN_9606_idmapping.dat.gz'
id_mapping_url = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz'

if not os.path.exists(id_mapping_file):
    print('Database does not exist. Trying to download and extract...')
    # Both steps stream into a '.part' file that is renamed when complete:
    # nothing is held in memory as a whole, and an interrupted run cannot
    # leave a truncated file that the existence checks would accept later.
    if not os.path.exists(id_mapping_zip_file):
        print('Downloading latest archive...')
        partial_zip_file = id_mapping_zip_file + '.part'
        with urllib.request.urlopen(id_mapping_url) as response, open(partial_zip_file, 'wb') as f:
            shutil.copyfileobj(response, f)
        os.replace(partial_zip_file, id_mapping_zip_file)
    print('Extracting database file...')
    partial_file = id_mapping_file + '.part'
    with gzip.open(id_mapping_zip_file, 'rb') as f:
        with open(partial_file, 'wb') as out_file:
            shutil.copyfileobj(f, out_file)
    os.replace(partial_file, id_mapping_file)

network = Network()

with io.open(id_mapping_file, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t', quotechar='"')
    for row in reader:
        # row[0] is used as the UniProtKB accession, row[1] selects the kind
        # of mapping and row[2] is the mapped value
        if row[1] == 'HGNC':
            # the mapped value is used as an id unchanged
            gene = Gene(['UniProtKB:%s' % row[0], row[2]], [])
            network.add_node(gene)
        elif row[1] == 'Gene_Name':
            gene = Gene(['UniProtKB:%s' % row[0], 'HGNC:%s' % row[2]], [])
            network.add_node(gene)

network.save('../data/UniprotKB/graph.json')
external_id_lookup = {} for row in external_id_results: external_id_lookup[row[0]] = [x for x in row[1::] if x] network = Network() for row in targets_results: drug_ids = ['DrugBank:%s' % row[0]] if row[0] in external_id_lookup: drug_ids.extend(external_id_lookup[row[0]]) drug = Drug(drug_ids, [row[1]]) network.add_node(drug) gene_ids = ['HGNC:%s' % row[2]] if row[4]: gene_ids.append(row[4]) gene = Gene(gene_ids, [row[3]]) network.add_node(gene) rel = { 'source': 'DrugBank', 'known_action': row[5] == 1, 'actions': row[6].split(',') if row[6] else [], 'simplified_action': row[7] } network.add_edge(Edge(drug, gene, 'TARGETS', rel)) for row in interactions_results: drug1 = Drug(['DrugBank:%s' % row[0]], [row[1]]) network.add_node(drug1) drug2 = Drug(['DrugBank:%s' % row[2]], [row[3]]) network.add_node(drug2) rel = { 'source': 'DrugBank',
# Build the HuGE Navigator gene-disease graph, downloading the export first
# when it is not present locally.
if not os.path.exists(file):
    print('Database does not exist. Trying to download...')
    data = urllib.parse.urlencode({
        'downLoadType': 'all_pheno',
        'Mysubmit': 'Download'
    }).encode()
    with urllib.request.urlopen(urllib.request.Request(url, data=data)) as response:
        with open(file, 'wb') as f:
            f.write(response.read())

network = Network()

with io.open(file, 'r', encoding='utf-8', newline='') as f:
    # the first four lines are skipped
    for skip in range(0, 4):
        f.readline()
    for line in f:
        # Each tab-separated entry is parsed as "<name>(<id>)": the last
        # character is dropped and the rest is split at the LAST '(' so that
        # names which themselves contain parentheses keep their id in
        # position 1.
        parts = [[y.strip() for y in x[:-1].rsplit('(', 1)] for x in line.strip().split('\t')]
        if len(parts) > 1:
            # first entry is the disease, all following entries are genes
            disease = Disease(['UMLS:%s' % parts[0][1]], [parts[0][0]])
            network.add_node(disease)
            for part in parts[1::]:
                gene = Gene(['HGNC:%s' % part[0], 'Entrez:%s' % part[1]], [part[0]])
                network.add_node(gene)
                rel = {'source': 'HuGE Navigator'}
                network.add_edge(Edge(gene, disease, 'ASSOCIATES_WITH', rel))

network.save('../data/HuGE-Navigator/graph.json')
def get_given_drugs_related_info(disease_pairs, drugs, top_n=10):
    """Summarise how given drugs connect to each disease pair.

    For every disease pair and every drug listed for it, a results text file
    and a graph JSON file are written below
    ``../analysis/disease_pairs/<id1>_<id2>/`` and the graph is drawn with
    ``draw_drug_subgraph``. The graph database is queried through the
    module-level ``session``.

    :param disease_pairs: sequence of disease id pairs; element 0 and 1 of
        each pair are id strings (':' is replaced by '-' for file names).
    :param drugs: sequence parallel to ``disease_pairs``; ``drugs[i]`` is the
        iterable of drug id strings examined for ``disease_pairs[i]``.
    :param top_n: maximum number of entries kept in each ranked list (common
        genes, disease variants, gene variants, regulating RNAs).
    :return: a list with one entry per disease pair, each a list holding one
        network per drug.
    """
    all_networks = []  # contains a list for each disease pair
    for index, disease_pair in enumerate(disease_pairs):
        networks_per_drug = []  # contains a network for each drug
        pair_drugs_ids = drugs[index]
        id1, id2 = disease_pair[0], disease_pair[1]
        temp_id1 = id1.replace(':', '-')
        temp_id2 = id2.replace(':', '-')
        path = '../analysis/disease_pairs/' + temp_id1 + '_' + temp_id2
        for drug_id in pair_drugs_ids:
            # Also creates missing parent directories; an existing directory is fine.
            os.makedirs(path, exist_ok=True)

            network = Network()
            d1 = Disease([id1], [])
            network.add_node(d1)
            d2 = Disease([id2], [])
            network.add_node(d2)
            drug = Drug([drug_id], [])
            network.add_node(drug)
            diseases = ((id1, d1), (id2, d2))

            temp_drug_id = drug_id.replace(':', '-')
            file_prefix = path + '/' + temp_id1 + '_' + temp_id2 + '_' + temp_drug_id
            with io.open(file_prefix + '_results.txt', 'w', encoding='utf-8', newline='') as results_file:
                results_file.write('In this file all information about the connection between ' + id1 +
                                   ' and ' + id2 + ' and the drug ' + drug_id + ' is summarized:\n')

                # the drug INDICATES, CONTRAINDICATES or INDUCES the disease
                query = """
                    MATCH (d:Disease)-[a]-(n:Drug)
                    WHERE {d_id} IN d.ids AND {n_id} in n.ids
                    RETURN distinct(type(a))
                    """
                for disease_id, disease in diseases:
                    results = session.run(query, parameters={'d_id': disease_id, 'n_id': drug_id})
                    for result in results:
                        relation = result['(type(a))']
                        results_file.write(drug_id + ' ' + relation + ' ' + disease_id + '\n')
                        network.add_edge(Edge(drug, disease, relation, {}))

                # the drug targets a gene which is associated to the disease
                query = """
                    MATCH (n:Drug)-[:TARGETS]-(g:Gene)-[:ASSOCIATES_WITH]-(d:Disease)
                    WHERE {d_id} IN d.ids AND {n_id} in n.ids
                    RETURN g.`_id`
                    """
                targeted_genes = []  # one set of gene ids per disease, same order as `diseases`
                for disease_id, disease in diseases:
                    gene_ids = set()
                    results = session.run(query, parameters={'d_id': disease_id, 'n_id': drug_id})
                    for result in results:
                        gene_id = result['g.`_id`']
                        gene_ids.add(gene_id)
                        gene = Gene([gene_id], [])
                        network.add_node(gene)
                        # TODO (carried over): the 'actions' attribute is left empty
                        network.add_edge(Edge(drug, gene, 'TARGETS', {'actions': []}))
                        network.add_edge(Edge(gene, disease, 'ASSOCIATES_WITH', {}))
                    targeted_genes.append(gene_ids)
                d1_genes, d2_genes = targeted_genes

                # genes associated to the drug and both diseases
                common_drug_genes = d1_genes.intersection(d2_genes)
                # relevant_genes are all genes associated to at least one disease and the drug;
                # below, the common genes with the most disease associated references are added
                relevant_genes = d1_genes.union(d2_genes)

                for disease_id, gene_ids in ((id1, d1_genes), (id2, d2_genes)):
                    if gene_ids:
                        results_file.write(drug_id + ' targets following ' + str(len(gene_ids)) +
                                           ' genes which are associated to ' + disease_id + ': ' +
                                           ', '.join(gene_ids) + '\n')
                if common_drug_genes:
                    results_file.write('The disease pair has ' + str(len(common_drug_genes)) +
                                       ' common genes which are targeted by the drug: ' +
                                       ', '.join(common_drug_genes) + '\n')

                # add the common genes with the most disease associated references
                # no given num_pmids is similar to num_pmids = 0
                all_d1_genes, all_d2_genes = get_genes(disease_pair)
                all_common_genes = all_d1_genes.intersection(all_d2_genes)
                if all_common_genes:
                    results_file.write('The disease pair has ' + str(len(all_common_genes)) +
                                       ' common genes, not considering the connection to the drug.'
                                       ' Following genes have the most references regarding their connection to both diseases:\n')
                    query = """
                        MATCH (d:Disease)-[a]-(g:Gene)
                        WHERE {g_id} IN g.ids AND {d_id} IN d.ids
                        RETURN a.num_pmids
                        """
                    relevant_common_genes = []  # [gene id, summed num_pmids over both diseases]
                    for gene_id in all_common_genes:
                        num_pmids = 0
                        for disease_id, _ in diseases:
                            results = session.run(query, parameters={'g_id': gene_id, 'd_id': disease_id})
                            for result in results:  # multiple edges to the same gene
                                count = result['a.num_pmids']
                                if count is not None:
                                    num_pmids = num_pmids + count
                        relevant_common_genes.append([gene_id, num_pmids])
                    # sort by number of pmids and keep the best-referenced genes
                    relevant_common_genes.sort(key=lambda item: item[1], reverse=True)
                    relevant_common_genes = relevant_common_genes[:top_n]
                    results_file.write(str(relevant_common_genes)[1:-1].replace('\'', '') + '\n')
                    for gene_id, _ in relevant_common_genes:
                        gene = Gene([gene_id], [])
                        network.add_node(gene)
                        network.add_edge(Edge(gene, d1, 'ASSOCIATES_WITH', {}))
                        network.add_edge(Edge(gene, d2, 'ASSOCIATES_WITH', {}))
                        relevant_genes.add(gene_id)

                # add the common disease associated variants with most references
                # no given num_pmids is similar to num_pmids = 0
                query = """
                    MATCH (d1:Disease)-[a]-(v:Variant)--(d2:Disease)
                    WHERE {d1_id} in d1.ids AND {d2_id} in d2.ids
                    RETURN distinct(a.num_pmids), v.`_id`
                    """
                disease_variants = {}  # variant id -> summed num_pmids
                # Run the query once per direction so the edges on both diseases' sides are counted.
                for first_id, second_id in ((id1, id2), (id2, id1)):
                    results = session.run(query, parameters={'d1_id': first_id, 'd2_id': second_id})
                    for record in results:
                        num_pmids = record['(a.num_pmids)']
                        if num_pmids is None:
                            num_pmids = 0
                        var_id = record['v.`_id`']
                        disease_variants[var_id] = disease_variants.get(var_id, 0) + num_pmids

                top_disease_variants = sorted(disease_variants.items(),
                                              key=lambda item: item[1], reverse=True)[:top_n]
                for var_id, num_pmids in top_disease_variants:
                    variant = Variant([var_id], [])
                    network.add_node(variant)
                    network.add_edge(Edge(variant, d1, 'ASSOCIATES_WITH', {}))
                    network.add_edge(Edge(variant, d2, 'ASSOCIATES_WITH', {}))
                dvs = ', '.join(var_id + ':' + str(num_pmids) + ' PMIDs'
                                for var_id, num_pmids in top_disease_variants)

                # add the gene associated variants with smallest pvalues
                # if no pvalue is given, pvalue is set to 1
                query = """
                    MATCH (g:Gene)-[a]-(v:Variant)
                    WHERE {g_id} in g.ids
                    RETURN v.`_id`, a.pvalue, type(a)
                    """
                gene_variants = []  # (variant id, gene id, pvalue, edge type)
                for gene_id in relevant_genes:
                    results = session.run(query, parameters={'g_id': gene_id})
                    for record in results:
                        pvalue = record['a.pvalue']
                        pvalue = 1 if pvalue is None else float(pvalue)
                        # Ids are kept separate: gene ids may themselves contain '-'.
                        gene_variants.append((record['v.`_id`'], gene_id, pvalue, record['type(a)']))
                gene_variants.sort(key=lambda item: item[2])
                gene_variants = gene_variants[:top_n]
                for v_id, g_id, pvalue, edge_type in gene_variants:
                    variant = Variant([v_id], [])
                    network.add_node(variant)
                    gene = Gene([g_id], [])
                    network.add_node(gene)
                    network.add_edge(Edge(gene, variant, edge_type, {'pvalue': pvalue}))
                # Rendered as "[variant-gene, pvalue, type], ..." in the results file.
                gvs = str([[v_id + '-' + g_id, pvalue, edge_type]
                           for v_id, g_id, pvalue, edge_type in gene_variants])
                gvs = gvs[1:-1].replace('\'', '')

                if disease_variants or gene_variants:
                    results_file.write('The disease pair has at least ' + str(len(top_disease_variants)) +
                                       ' variants associated to both diseases: ' + dvs +
                                       ' and at least ' + str(len(gene_variants)) +
                                       ' gene associated variants: ' + gvs + '\n')

                # dict with RNA id as key and the set of relevant genes it is connected to as value
                query = """
                    MATCH (r:RNA)--(g:Gene)
                    WHERE {g_id} in g.ids AND NOT r.label_id CONTAINS "MRNA"
                    return r.`_id`
                    """
                relevant_rnas = {}
                for gene_id in relevant_genes:
                    results = session.run(query, parameters={'g_id': gene_id})
                    for result in results:
                        relevant_rnas.setdefault(result['r.`_id`'], set()).add(gene_id)

                # Rank by number of regulated genes only; ties keep insertion order.
                ranked_rnas = sorted(relevant_rnas.items(), key=lambda item: len(item[1]), reverse=True)
                # only add and print RNAs which regulate more than one gene
                multi_gene_rnas = [(rna_id, genes) for rna_id, genes in ranked_rnas
                                   if len(genes) > 1][:top_n]
                if multi_gene_rnas:
                    results_file.write('RNAs with the number and names of the genes they regulate: \n')
                for rna_id, genes in multi_gene_rnas:
                    for gene_id in genes:
                        rna = RNA([rna_id], [])
                        network.add_node(rna)
                        gene = Gene([gene_id], [])
                        network.add_node(gene)
                        network.add_edge(Edge(rna, gene, 'REGULATES', {}))
                    results_file.write(rna_id + '\t' + str(len(genes)) + '\t' + ', '.join(genes) + '\n')

                # append regulating RNAs to the one RNA which regulates the most genes, MRNAs are not added
                if multi_gene_rnas:
                    top_rna_id = multi_gene_rnas[0][0]
                    most_relevant_rna = RNA([top_rna_id], [])
                    network.add_node(most_relevant_rna)
                    # Node label -> node class; built here because it is only needed on this path.
                    rna_classes = {
                        'CircRNA': CircRNA, 'ERNA': ERNA, 'LncRNA': LncRNA, 'MiRNA': MiRNA,
                        'NcRNA': NcRNA, 'PiRNA': PiRNA, 'Pseudogene': Pseudogene,
                        'Ribozyme': Ribozyme, 'RRNA': RRNA, 'ScaRNA': ScaRNA, 'ScRNA': ScRNA,
                        'SnoRNA': SnoRNA, 'SnRNA': SnRNA,
                    }
                    query = """
                        MATCH (r:RNA)--(n:RNA)
                        WHERE {r_id} in r.ids AND NOT n.label_id CONTAINS "MRNA"
                        RETURN n.`_id`, labels(n)
                        """
                    results = session.run(query, parameters={'r_id': top_rna_id})
                    regulating_ids = []
                    for result in results:
                        rna_id = result['n.`_id`']
                        for label in result['labels(n)']:
                            if label == 'RNA':
                                continue
                            # Labels without a dedicated class fall back to a generic RNA node,
                            # so a node from a previous iteration is never reused by accident.
                            rna = rna_classes.get(label, RNA)([rna_id], [])
                            network.add_node(rna)
                            network.add_edge(Edge(rna, most_relevant_rna, 'REGULATES', {}))
                            regulating_ids.append(rna_id)
                    results_file.write(top_rna_id + ' is the RNA which regulates the most genes in this subgraph. It is regulated by ' +
                                       ', '.join(regulating_ids) + '.\n')

            json_file = file_prefix + '_graph.json'
            network.save(json_file)
            draw_drug_subgraph(json_file)
            networks_per_drug.append(network)
        all_networks.append(networks_per_drug)
    return all_networks
mirna_name = row[1] gene_hgnc_id = 'HGNC:' + row[3] gene_entrez_id = int(row[4]) gene_entrez_id = 'Entrez:' + str(gene_entrez_id) pmid = int(row[8]) pmid = str(pmid) with io.open(mirna_to_URS_mapping_file, 'r', encoding='utf-8', newline='') as mapping_file: mapping_reader = csv.reader(mapping_file, delimiter='\t') next(mapping_reader, None) for mapping_row in mapping_reader: if mirna_name == mapping_row[2]: mirna_rnacentral_id = mapping_row[0] mirna = MiRNA([mirna_rnacentral_id], [mirna_name]) network.add_node(mirna) gene = Gene([gene_hgnc_id, gene_entrez_id], []) network.add_node(gene) if (mirna_rnacentral_id + '$' + gene_hgnc_id) in edge_source_target_lookup: edges = network.get_edges_from_to(mirna, gene, 'REGULATES') for edge in edges: pmid = edge.attributes['pmid'] + ', ' + str(pmid) network.delete_edge(edge) e = Edge(mirna, gene, 'REGULATES', {'source': 'miRTarBase', 'pmid': pmid}) network.add_edge(e) edge_source_target_lookup.append(mirna_rnacentral_id + '$' + gene_hgnc_id) else: e = Edge(mirna, gene, 'REGULATES', {'source': 'miRTarBase', 'pmid': pmid}) network.add_edge(e) edge_source_target_lookup.append(mirna_rnacentral_id + '$' + gene_hgnc_id) break network.save('data/miRTarBase/graph.json')
with urllib.request.urlopen(url) as response, open(file, 'wb') as f: f.write(response.read()) network = Network() with io.open(file, 'r', encoding='utf-8', newline='') as f: reader = csv.reader(f, delimiter='\t', quotechar='"') next(reader, None) for row in reader: row = [x.strip() for x in row] if not row[0] or not row[7] or not row[8]: continue gene_ids = {'HGNC:%s' % row[0]} if row[2]: gene_ids.add('Entrez:%s' % row[2]) gene = Gene(gene_ids, []) network.add_node(gene) drug_name = row[7].replace('(%s)' % row[8], '').replace(row[8], '').strip() drug = Drug(['ChEMBL:%s' % row[8]], [drug_name] if drug_name else []) network.add_node(drug) rel = { 'source': 'DGIdb,%s' % row[3], 'actions': [row[4]], } if row[9]: pubmed_ids = ','.join( ['PMID:%s' % x for x in row[9].strip().split(',')]) rel['source'] += ',%s' % pubmed_ids network.add_edge(Edge(drug, gene, 'TARGETS', rel))
reader = csv.reader(f, delimiter='\t', quotechar='"') next(reader, None) for row in reader: if row[3] == 'H**o sapiens' and row[6] == 'H**o sapiens' and float( row[7]) > 0.9: interactor_a_name = row[1] interactor_a_type = row[2] interactor_b_name = row[4] interactor_b_type = row[5] interactor_a = add_rna(interactor_a_name, interactor_a_type, node_lookup) interactor_b = add_rna(interactor_b_name, interactor_b_type, node_lookup) if interactor_a is not None and interactor_b is not None: if interactor_a_type == 'mRNA': gene = Gene([interactor_a.id], []) network.add_node(gene) e = Edge(gene, interactor_a, 'TRANSCRIBES', {}) network.add_edge(e) elif interactor_b_type == 'mRNA': gene = Gene([interactor_b.id], []) network.add_node(gene) e = Edge(gene, interactor_b, 'TRANSCRIBES', {}) network.add_edge(e) e = Edge(interactor_a, interactor_b, 'REGULATES', {'source': 'RNAInter'}) network.add_edge(e) network.save('../data/RNAInter/graph.json')
def value_empty(s: str) -> bool:
    """Return True if *s* is falsy or, once stripped, is just the '-' placeholder."""
    if not s:
        return True
    return s.strip() == '-'


network = Network()

# First input file: one variant per row, linked to every gene listed in column 13.
with io.open(file_cis, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t', quotechar='"')
    next(reader, None)  # skip the header row
    for row in reader:
        # Rows need both a variant id (column 1) and a gene list (column 13).
        has_variant = not value_empty(row[1])
        if has_variant and not value_empty(row[13]):
            variant = Variant(['dbSNP:%s' % row[1]], [])
            network.add_node(variant)
            for gene_id in row[13].split(','):
                gene = Gene(['HGNC:%s' % gene_id], [])
                network.add_node(gene)
                # A fresh attribute dict is built for every edge.
                rel = dict(
                    source='PMID:24013639',
                    pvalue=row[0],
                    snp_chr=row[2],
                    cis_trans=row[7],
                )
                network.add_edge(Edge(gene, variant, 'EQTL', rel))

# Second input file: here the gene list is read from column 16.
with io.open(file_trans, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t', quotechar='"')
    next(reader, None)  # skip the header row
    for row in reader:
        if value_empty(row[1]) or value_empty(row[16]):
            continue
gene_ensembl = row[1].split(':') gene_ensembl_id = gene_ensembl[1] gene_hgnc_id = 'None' with io.open(gene_mapping_file, 'r', encoding='utf-8', newline='') as gm: gene_mapping_reader = csv.reader(gm, delimiter='\t') next(gene_mapping_reader, None) for gene_mapping_row in gene_mapping_reader: if gene_mapping_row[2] == gene_ensembl_id: gene_hgnc_id = 'HGNC:' + gene_mapping_row[1] break if gene_hgnc_id != 'None' and 'gene' in row[21]: gene_uniprotkb_id = re.split('[:(]', row[5]) gene_uniprotkb_id = 'UniProtKB:' + gene_uniprotkb_id[1] gene_ensembl_id = 'Ensembl:' + gene_ensembl_id gene = Gene([gene_hgnc_id, gene_uniprotkb_id, gene_ensembl_id], []) network.add_node(gene) pmid = row[8].split(':') pmid = pmid[1] source_database = row[12] source_database = source_database.replace('\"', '') if (mirna_rnacentral_id + '$' + gene_hgnc_id) in edge_source_target_lookup: reg_edges = network.get_edges_from_to( mirna, gene, 'REGULATES') for reg_edge in reg_edges: if reg_edge.attributes['source'] == ( 'EBI-GOA-miRNA, ' + source_database): pmid = reg_edge.attributes['pmid'] + ', ' + pmid network.delete_edge(reg_edge) e = Edge(
# 2 - DSI # 3 - DPI # 4 - diseaseId # 5 - diseaseName # 6 - diseaseType # 7 - diseaseClass # 8 - diseaseSemanticType # 9 - score # 10 - EI # 11 - YearInitial # 12 - YearFinal # 13 - NofPmids # 14 - NofSnps # 15 - source if int(row[13]) >= PUBMED_COUNT_THRESHOLD: gene = Gene(['HGNC:%s' % row[1]], []) network.add_node(gene) disease = Disease(['UMLS:%s' % row[4]], [row[5]]) network.add_node(disease) rel = { 'source': 'DisGeNet,%s' % row[15], 'num_pmids': int(row[13]), 'num_snps': int(row[14]), 'score': row[9] } network.add_edge(Edge(gene, disease, 'ASSOCIATES_WITH', rel)) with io.open('../data/DisGeNet/curated_variant_disease_associations.tsv', 'r', encoding='utf-8', newline='') as f: