def process_gene_interaction(self, limit):
    """
    Add pairwise gene-gene interaction associations to the graph.

    The gene interaction file includes identified interactions
    that are between two or more genes (gene products).
    In the case of interactions with >2 genes, this requires creating
    groups of genes that are involved in the interaction.
    From the wormbase help list:

    In the example WBInteraction000007779 it would likely be misleading
    to suggest that lin-12 interacts with (suppresses in this case)
    smo-1 ALONE or that lin-12 suppresses let-60 ALONE; the observation
    in the paper (see Table V in paper PMID:15990876) was that a lin-12
    allele (heterozygous lin-12(n941/+)) could suppress the "multivulva"
    phenotype induced synthetically by simultaneous perturbation of
    BOTH smo-1 (by RNAi) AND let-60 (by the n2021 allele).
    So this is necessarily a three-gene interaction.

    Therefore, we can create groups of genes based on their "status" of
    Effector | Effected.

    Status:  IN PROGRESS
    Only rows with exactly two interactors are loaded for now;
    every other row is logged and skipped.

    The gzipped, tab-delimited file is read row by row. The first five
    columns are unpacked as interaction number, type, subtype, summary
    and citation; the code then assumes three columns per interactor,
    with the gene identifiers at columns 5 and 8.

    :param limit: when not in test mode, stop after a loaded row once
        more than this many rows (comment rows included) have been read;
        None disables the limit
    :return: None

    """

    raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    gu = GraphUtils(curie_map.get())

    logger.info("Processing gene interaction associations")
    line_counter = 0

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar="'")

        for row in filereader:
            line_counter += 1
            # comment rows start with '#'
            if re.match(r'#', ''.join(row)):
                continue

            (interaction_num, interaction_type, interaction_subtype,
             summary, citation) = row[0:5]
            interaction_id = 'WormBase:'+interaction_num

            # TODO deal with subtypes
            interaction_type_id = None
            if interaction_type == 'Genetic':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'genetically_interacts_with']
            elif interaction_type == 'Physical':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'molecularly_interacts_with']
            elif interaction_type == 'Regulatory':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'regulates']
            else:
                # NOTE(review): the row is still processed below with
                # interaction_type_id left as None - confirm that
                # InteractionAssoc tolerates a missing relationship.
                logger.info(
                    "An interaction type I don't understand %s",
                    interaction_type)

            # everything after the five leading columns is interactor data
            num_interactors = (len(row) - 5) / 3
            if num_interactors != 2:
                logger.info(
                    "Skipping interactions with !=2 participants:\n %s",
                    str(row))
                continue

            gene_a_id = 'WormBase:'+row[5]
            gene_b_id = 'WormBase:'+row[8]

            # in test mode, keep the row if either gene is a test gene
            if self.testMode \
                    and gene_a_id not in self.test_ids['gene'] \
                    and gene_b_id not in self.test_ids['gene']:
                continue

            assoc = InteractionAssoc(
                self.name, gene_a_id, gene_b_id, interaction_type_id)
            assoc.set_association_id(interaction_id)
            assoc.add_association_to_graph(g)
            assoc_id = assoc.get_association_id()
            # citation is not a pmid or WBref - get this some other way
            gu.addDescription(g, assoc_id, summary)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _get_interactions(self, limit):
    """
    Read the interactions archive and add one gene-gene interaction
    association to the graph for every accepted row.

    The first member of the zip archive is read as tab-delimited text
    with exactly 15 columns per row; lines starting with '#' are skipped.
    The gene numbers are taken from the 'locuslink:<digits>' part of the
    two interactor columns and emitted as NCBIGene identifiers.

    In test mode a row is kept only if both gene numbers are in
    self.test_ids; otherwise it is kept only if both taxon ids are in
    self.tax_ids.

    :param limit: when not in test mode, stop after an accepted row once
        more than this many non-comment lines have been read;
        None disables the limit
    :return: None
    :raises ValueError: if a row does not split into exactly 15 fields
    :raises AttributeError: if an expected 'locuslink:' or 'MI:' pattern
        is missing from a row

    """

    logger.info("getting interactions")
    line_counter = 0
    f = '/'.join((self.rawdir, self.files['interactions']['file']))

    # the context managers close the archive and its member
    # even when a row fails to parse
    with ZipFile(f, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]

        with myzip.open(fname, 'r') as csvfile:
            for raw_line in csvfile:
                # the member is opened in binary mode; decode once per line
                line = raw_line.decode()
                # skip comment lines
                if line.startswith('#'):
                    logger.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.strip()

                (interactor_a, interactor_b, alt_ids_a, alt_ids_b,
                 aliases_a, aliases_b, detection_method, pub_author,
                 pub_id, taxid_a, taxid_b, interaction_type, source_db,
                 interaction_id, confidence_val) = line.split('\t')

                # get the actual gene ids,
                # typically formated like: gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_a).groups()[0]
                gene_b_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_b).groups()[0]

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or \
                            (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    g = self.graph
                    # when not in test mode, filter by taxon
                    if int(re.sub(r'taxid:', '', taxid_a.rstrip())) \
                            not in self.tax_ids or \
                            int(re.sub(r'taxid:', '', taxid_b.rstrip())) \
                            not in self.tax_ids:
                        continue

                gene_a = 'NCBIGene:'+gene_a_num
                gene_b = 'NCBIGene:'+gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r'MI:\d+', interaction_type).group()
                rel = self._map_MI_to_RO(int_type)

                # scrub pubmed-->PMID prefix
                pub_id = re.sub(r'pubmed', 'PMID', pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r'MI:\d+', detection_method).group()
                evidence = self._map_MI_to_ECO(det_code)

                # note that the interaction_id is some kind of internal
                # biogrid identifier that does not map to a public URI.
                # we will construct a monarch identifier from this
                assoc = InteractionAssoc(self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph(g)
                assoc.load_all_properties(g)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

    return
def process_gene_interaction(self, limit):
    """
    Load two-gene interactions from the gene interaction file.

    The file lists interactions between two or more genes (or their
    products). Interactions among more than two genes would need the
    participants grouped, as explained on the wormbase help list:

    For WBInteraction000007779 it would be misleading to say that lin-12
    suppresses smo-1 ALONE or let-60 ALONE. The paper (Table V of
    PMID:15990876) observed that a lin-12 allele (heterozygous
    lin-12(n941/+)) suppresses the "multivulva" phenotype produced by
    perturbing BOTH smo-1 (by RNAi) AND let-60 (by the n2021 allele) at
    the same time, so the interaction necessarily involves three genes.

    Such groups could be built from each gene's "status" of
    Effector | Effected.

    Status:  IN PROGRESS
    For now only rows with exactly two interactors are loaded.

    :param limit: outside test mode, stop after a loaded row once more
        than this many rows have been read; None means no limit
    :return: None

    """

    source_path = '/'.join(
        (self.rawdir, self.files['gene_interaction']['file']))

    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing gene interaction associations")

    # interaction type column -> key into the interaction object properties
    property_key_by_type = {
        'Genetic': 'genetically_interacts_with',
        'Physical': 'molecularly_interacts_with',
        'Regulatory': 'regulates',
    }

    line_counter = 0
    with gzip.open(source_path, 'rb') as handle:
        text_stream = io.TextIOWrapper(handle, newline="")
        reader = csv.reader(text_stream, delimiter='\t', quotechar="'")

        for row in reader:
            line_counter += 1

            # comment rows begin with '#'
            if re.match(r'#', ''.join(row)):
                continue

            (interaction_num, interaction_type, interaction_subtype,
             summary, citation) = row[:5]
            interaction_id = 'WormBase:' + interaction_num

            # TODO deal with subtypes
            property_key = property_key_by_type.get(interaction_type)
            if property_key is None:
                interaction_type_id = None
                logger.info(
                    "An interaction type I don't understand %s",
                    interaction_type)
            else:
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        property_key]

            # three columns per participant follow the five leading ones
            num_interactors = (len(row) - 5) / 3
            if num_interactors != 2:
                logger.info(
                    "Skipping interactions with !=2 participants:\n %s",
                    str(row))
                continue

            gene_a_id = 'WormBase:' + row[5]
            gene_b_id = 'WormBase:' + row[8]

            # test mode keeps a row when at least one gene is a test gene
            if self.testMode:
                if gene_a_id not in self.test_ids['gene'] \
                        and gene_b_id not in self.test_ids['gene']:
                    continue

            assoc = InteractionAssoc(
                graph, self.name, gene_a_id, gene_b_id,
                interaction_type_id)
            assoc.set_association_id(interaction_id)
            assoc.add_association_to_graph()
            # citation is not a pmid or WBref - get this some other way
            model.addDescription(assoc.get_association_id(), summary)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _get_interactions(self, limit):
    """
    Read the interactions archive and add one gene-gene interaction
    association to the graph for every accepted row.

    The first member of the zip archive is read as tab-delimited text
    with exactly 15 columns per row; lines starting with '#' are skipped.
    The gene numbers are taken from the 'locuslink:<digits>' part of the
    two interactor columns and emitted as NCBIGene identifiers.

    In test mode a row is kept only if both gene numbers are in
    self.test_ids; otherwise it is kept only if the trailing part of
    both taxon ids (after the last ':') is in self.tax_ids.

    :param limit: when not in test mode, stop after an accepted row once
        more than this many non-comment lines have been read;
        None disables the limit
    :return: None
    :raises ValueError: if a row does not split into exactly 15 fields
    :raises AttributeError: if an expected 'locuslink:' or 'MI:' pattern
        is missing from a row

    """

    LOG.info("getting interactions")
    line_counter = 0
    f = '/'.join((self.rawdir, self.files['interactions']['file']))

    # the context managers close the archive and its member
    # even when a row fails to parse
    with ZipFile(f, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]

        with myzip.open(fname, 'r') as csvfile:
            for raw_line in csvfile:
                # the member is opened in binary mode; decode once per line
                line = raw_line.decode()
                # skip comment lines
                if line.startswith('#'):
                    LOG.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.strip()

                (interactor_a, interactor_b, alt_ids_a, alt_ids_b,
                 aliases_a, aliases_b, detection_method, pub_author,
                 pub_id, taxid_a, taxid_b, interaction_type, source_db,
                 interaction_id, confidence_val) = line.split('\t')
                taxid_a = taxid_a.rstrip()
                taxid_b = taxid_b.rstrip()

                # get the actual gene ids,
                # typically formated like: gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_a).groups()[0]
                gene_b_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_b).groups()[0]

                if self.test_mode:
                    graph = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or \
                            (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    graph = self.graph
                    # when not in test mode, filter by taxon
                    if taxid_a.split(':')[-1] not in self.tax_ids or \
                            taxid_b.split(':')[-1] not in self.tax_ids:
                        continue

                gene_a = 'NCBIGene:' + gene_a_num
                gene_b = 'NCBIGene:' + gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r'MI:\d+', interaction_type).group()
                rel = self.resolve(int_type, False)
                # resolve handing back its input means no mapping was
                # found; fall back to the generic relation
                if rel == int_type:
                    rel = self.globaltt['interacts with']

                # scrub pubmed-->PMID prefix
                pub_id = re.sub(r'pubmed', 'PMID', pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r'MI:\d+', detection_method).group()
                evidence = self.resolve(det_code, False)
                # same fallback as above, for the evidence code
                if evidence == det_code:
                    evidence = self.globaltt["experimental evidence"]

                # note that the interaction_id is some kind of internal
                # biogrid identifier that does not map to a public URI.
                # we will construct a monarch identifier from this
                assoc = InteractionAssoc(
                    graph, self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph()

                if not self.test_mode and (
                        limit is not None and line_counter > limit):
                    break

    return