def process_rnai_phenotypes(self, limit=None):
    """
    Add gene-to-phenotype associations from the RNAi phenotype file.

    Each tab-delimited row is expected to hold five columns: gene number,
    alternate gene symbol, phenotype label, phenotype id, and a
    space-delimited list of ``RNAi|paper`` pairs.  For every well-formed
    pair the RNAi reagent and the reagent-targeted gene are added through
    the Genotype helper, and a G2PAssoc between the reagent-targeted gene
    and the phenotype is added to the graph with the paper as its source.

    As a side effect, ``self.rnai_gene_map`` is extended with the set of
    gene numbers seen for each RNAi number.

    :param limit: when not None (and not in test mode), stop reading once
        the row counter exceeds this value
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['rnai_pheno']['file']))
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing RNAi phenotype associations")
    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (gene_num, gene_alt_symbol, phenotype_label, phenotype_id,
             rnai_and_refs) = row
            # Example rows:
            # WBGene00001908 F17E9.9 locomotion variant
            #   WBPhenotype:0000643
            #   WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395
            # WBGene00001908 F17E9.9 avoids bacterial lawn
            #   WBPhenotype:0000402 WBRNAi00095640|WBPaper00040984
            # WBGene00001908 F17E9.9
            #   RAB-11 recycling endosome localization variant
            #   WBPhenotype:0002107 WBRNAi00090830|WBPaper00041129

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            gene_id = 'WormBase:' + gene_num

            # rnai_and_refs looks like
            # WBRNAi00008687|WBPaper00005654 WBRNAi00025197|WBPaper00006395
            # i.e. space delimited between RNAi sets;
            # then each RNAi should have a paper
            for s in re.split(r' ', rnai_and_refs):
                parts = s.split('|')
                if len(parts) != 2:
                    # Check the length *before* unpacking: unpacking first
                    # raised ValueError and the warning was never reached.
                    logger.warning(
                        "There's an unexpected number of items in %s", s)
                    continue
                rnai_num, ref_num = parts

                if rnai_num not in self.rnai_gene_map:
                    self.rnai_gene_map[rnai_num] = set()
                # to use for looking up later
                self.rnai_gene_map[rnai_num].add(gene_num)

                rnai_id = 'WormBase:' + rnai_num
                geno.addGeneTargetingReagent(
                    rnai_id, None, geno.genoparts['RNAi_reagent'], gene_id)

                # make the "allele" of the gene
                # that is targeted by the reagent
                allele_id = self.make_reagent_targeted_gene_id(
                    gene_num, rnai_num, self.nobnodes)
                allele_label = gene_alt_symbol + '<' + rnai_num + '>'
                geno.addReagentTargetedGene(
                    rnai_id, gene_id, allele_id, allele_label)

                assoc = G2PAssoc(self.name, allele_id, phenotype_id)
                assoc.add_source('WormBase:' + ref_num)
                # eco_id = 'ECO:0000019'  # RNAi evidence  # TODO unused
                assoc.add_association_to_graph(g)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
def process_allele_phenotype(self, limit=None):
    """
    Add variant-to-phenotype associations from the allele phenotype file.

    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed
    per phenotype and paper.  This indicates that each variant is
    individually associated with the given phenotype,
    as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Rows whose joined text starts with '!' are treated as header lines.
    Rows flagged ``NOT`` are skipped, as are rows with no allele listed.

    :param limit: when not None (and not in test mode), stop reading once
        the count of non-header rows exceeds this value
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing Allele phenotype associations")
    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1
            (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_id = None
            if eco_symbol == 'IMP':
                eco_id = 'ECO:0000015'
            elif eco_symbol.strip() != '':
                logger.warning(
                    "Encountered an ECO code we don't have: %s", eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and the
            # person are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # with_or_from is None when a person was found but no variant.
            # re.split() never returns an empty list, so the old
            # len(...) == 0 test could not detect a missing allele column;
            # test the value itself instead.
            if with_or_from is None or with_or_from.strip() == '':
                logger.error(
                    "Missing alleles from phenotype assoc at line %d",
                    line_counter)
                continue

            gene_id = 'WormBase:' + gene_num
            for a in re.split(r'\|', with_or_from):
                allele_num = re.sub(r'WB:', '', a.strip())
                allele_id = 'WormBase:' + allele_num

                if re.search(r'WBRNAi', allele_id):
                    # make the reagent-targeted gene,
                    # & annotate that instead of the RNAi item directly
                    rnai_num = re.sub(r'WormBase:', '', allele_id)
                    rnai_id = allele_id
                    rtg_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num, self.nobnodes)
                    geno.addReagentTargetedGene(
                        rnai_id, 'WormBase:' + gene_num, rtg_id)
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'],
                        gene_id)
                    allele_id = rtg_id
                elif re.search(r'WBVar', allele_id):
                    # this may become deprecated by using wormmine
                    # make the allele to gene relationship
                    # the WBVars are really sequence alterations
                    # the public name will come from elsewhere
                    geno.addSequenceAlteration(allele_id, None)
                    vl_id = '_' + '-'.join((gene_num, allele_num))
                    if self.nobnodes:
                        vl_id = ':' + vl_id
                    geno.addSequenceAlterationToVariantLocus(
                        allele_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    logger.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(self.name, allele_id, phenotype_id)

                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                if ref is not None and ref != '':
                    ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                    r = Reference(ref)
                    if re.search(r'Person', ref):
                        r.setType(r.ref_types['person'])
                        # also add
                        # inferred from background scientific knowledge
                        assoc.add_evidence('ECO:0000001')
                    r.addRefToGraph(g)
                    assoc.add_source(ref)

                assoc.add_association_to_graph(g)
            # finish looping through all alleles

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
def process_allele_phenotype(self, limit=None):
    """
    Add variant-to-phenotype associations from the allele phenotype file.

    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed
    per phenotype and paper.  This indicates that each variant is
    individually associated with the given phenotype,
    as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Rows whose joined text starts with '!' are treated as header lines.
    Rows flagged ``NOT`` are skipped, as are rows with no allele listed.

    :param limit: when not None (and not in test mode), stop reading once
        the count of non-header rows exceeds this value
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing Allele phenotype associations")
    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1
            (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_id = None
            if eco_symbol == 'IMP':
                eco_id = 'ECO:0000015'
            elif eco_symbol.strip() != '':
                logger.warning("Encountered an ECO code we don't have: %s",
                               eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and the
            # person are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # with_or_from is None when a person was found but no variant.
            # re.split() never returns an empty list, so the old
            # len(...) == 0 test could not detect a missing allele column;
            # test the value itself instead.
            if with_or_from is None or with_or_from.strip() == '':
                logger.error(
                    "Missing alleles from phenotype assoc at line %d",
                    line_counter)
                continue

            gene_id = 'WormBase:' + gene_num
            for a in re.split(r'\|', with_or_from):
                allele_num = re.sub(r'WB:', '', a.strip())
                allele_id = 'WormBase:' + allele_num

                if re.search(r'WBRNAi', allele_id):
                    # make the reagent-targeted gene,
                    # & annotate that instead of the RNAi item directly
                    rnai_num = re.sub(r'WormBase:', '', allele_id)
                    rnai_id = allele_id
                    rtg_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num)
                    geno.addReagentTargetedGene(
                        rnai_id, 'WormBase:' + gene_num, rtg_id)
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'],
                        gene_id)
                    allele_id = rtg_id
                elif re.search(r'WBVar', allele_id):
                    # this may become deprecated by using wormmine
                    # make the allele to gene relationship
                    # the WBVars are really sequence alterations
                    # the public name will come from elsewhere
                    geno.addSequenceAlteration(allele_id, None)
                    vl_id = '_:' + '-'.join((gene_num, allele_num))
                    geno.addSequenceAlterationToVariantLocus(
                        allele_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    logger.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)

                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                if ref is not None and ref != '':
                    ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                    reference = Reference(g, ref)
                    if re.search(r'Person', ref):
                        reference.setType(
                            reference.ref_types['person'])
                        # also add
                        # inferred from background scientific knowledge
                        assoc.add_evidence('ECO:0000001')
                    reference.addRefToGraph()
                    assoc.add_source(ref)

                assoc.add_association_to_graph()
            # finish looping through all alleles

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
def process_rnai_phenotypes(self, limit=None):
    """
    Add gene-to-phenotype associations from the RNAi phenotype file.

    Each tab-delimited row is expected to hold five columns: gene number,
    alternate gene symbol, phenotype label, phenotype id, and a
    space-delimited list of ``RNAi|paper`` pairs.  For every well-formed
    pair the RNAi reagent and the reagent-targeted gene are added through
    the Genotype helper, and a G2PAssoc between the reagent-targeted gene
    and the phenotype is added to the graph with the paper as its source.

    As a side effect, ``self.rnai_gene_map`` is extended with the set of
    gene numbers seen for each RNAi number.

    :param limit: when not None (and not in test mode), stop reading once
        the row counter exceeds this value
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['rnai_pheno']['file']))
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing RNAi phenotype associations")
    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (gene_num, gene_alt_symbol, phenotype_label, phenotype_id,
             rnai_and_refs) = row
            # Example rows:
            # WBGene00001908 F17E9.9 locomotion variant
            #   WBPhenotype:0000643
            #   WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395
            # WBGene00001908 F17E9.9 avoids bacterial lawn
            #   WBPhenotype:0000402 WBRNAi00095640|WBPaper00040984
            # WBGene00001908 F17E9.9
            #   RAB-11 recycling endosome localization variant
            #   WBPhenotype:0002107 WBRNAi00090830|WBPaper00041129

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            gene_id = 'WormBase:' + gene_num

            # rnai_and_refs looks like
            # WBRNAi00008687|WBPaper00005654 WBRNAi00025197|WBPaper00006395
            # i.e. space delimited between RNAi sets;
            # then each RNAi should have a paper
            for s in re.split(r' ', rnai_and_refs):
                parts = s.split('|')
                if len(parts) != 2:
                    # Check the length *before* unpacking: unpacking first
                    # raised ValueError and the warning was never reached.
                    logger.warning(
                        "There's an unexpected number of items in %s", s)
                    continue
                rnai_num, ref_num = parts

                if rnai_num not in self.rnai_gene_map:
                    self.rnai_gene_map[rnai_num] = set()
                # to use for looking up later
                self.rnai_gene_map[rnai_num].add(gene_num)

                rnai_id = 'WormBase:' + rnai_num
                geno.addGeneTargetingReagent(
                    rnai_id, None, geno.genoparts['RNAi_reagent'], gene_id)

                # make the "allele" of the gene
                # that is targeted by the reagent
                allele_id = self.make_reagent_targeted_gene_id(
                    gene_num, rnai_num)
                allele_label = gene_alt_symbol + '<' + rnai_num + '>'
                geno.addReagentTargetedGene(
                    rnai_id, gene_id, allele_id, allele_label)

                assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)
                assoc.add_source('WormBase:' + ref_num)
                # eco_id = 'ECO:0000019'  # RNAi evidence  # TODO unused
                assoc.add_association_to_graph()

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
def process_rnai_phenotypes(self, limit=None):
    """
    Add gene-to-phenotype associations from the RNAi phenotype file.

    Column positions are looked up by name in
    ``self.files['rnai_pheno']['columns']``.  The ``rnai_and_refs`` column
    is a space-delimited list of ``RNAi|paper`` pairs.  For every
    well-formed pair the RNAi reagent and the reagent-targeted gene are
    added through the Genotype helper, and a G2PAssoc between the
    reagent-targeted gene and the phenotype is added to the graph with the
    paper as its source.

    As a side effect, ``self.rnai_gene_map`` is extended with the set of
    gene numbers seen for each RNAi number.

    :param limit: when not None, stop reading once the csv reader's line
        number exceeds this value
    :return: None
    """
    src_key = 'rnai_pheno'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing: %s", self.files[src_key]['file'])
    graph = self.graph
    geno = Genotype(graph)
    col = self.files[src_key]['columns']
    collen = len(col)

    # Resolve the column positions once instead of on every row.
    gene_num_idx = col.index('gene_num')
    gene_alt_symbol_idx = col.index('gene_alt_symbol')
    phenotype_id_idx = col.index('phenotype_id')
    rnai_and_refs_idx = col.index('rnai_and_refs')
    # Shortest row that can still be indexed safely.
    min_len = 1 + max(gene_num_idx, gene_alt_symbol_idx,
                      phenotype_id_idx, rnai_and_refs_idx)

    with open(raw, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # no header row to check
        for row in reader:
            if len(row) != collen:
                LOG.error('In %s line %i expected %i colums but got %s.',
                          self.files[src_key]['file'], reader.line_num,
                          collen, row)
                if len(row) < min_len:
                    # Indexing a short row would raise IndexError; skip it.
                    continue

            gene_num = row[gene_num_idx]
            gene_alt_symbol = row[gene_alt_symbol_idx]
            phenotype_id = row[phenotype_id_idx]
            rnai_and_refs = row[rnai_and_refs_idx]
            gene_curie = 'WormBase:' + gene_num

            # Example rows:
            # WBGene00001908 F17E9.9 locomotion variant
            #   WBPhenotype:0000643
            #   WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395
            # WBGene00001908 F17E9.9 avoids bacterial lawn
            #   WBPhenotype:0000402 WBRNAi00095640|WBPaper00040984
            # WBGene00001908 F17E9.9
            #   RAB-11 recycling endosome localization variant
            #   WBPhenotype:0002107 WBRNAi00090830|WBPaper00041129
            #
            # rnai_and_refs looks like
            # WBRNAi00008687|WBPaper00005654 WBRNAi00025197|WBPaper00006395
            # i.e. space delimited between RNAi sets;
            # then each RNAi should have a paper
            for rnais in re.split(r' ', rnai_and_refs):
                pair = rnais.split('|')
                if len(pair) != 2:
                    # Skip malformed sets so they neither crash the unpack
                    # nor reuse the ids left over from an earlier set.
                    LOG.warning(
                        "There's an unexpected number of items in %s",
                        rnais)
                    continue
                rnai_num, ref_num = pair

                if rnai_num not in self.rnai_gene_map:
                    self.rnai_gene_map[rnai_num] = set()
                # to use for looking up later
                self.rnai_gene_map[rnai_num].add(gene_num)

                rnai_curie = 'WormBase:' + rnai_num
                geno.addGeneTargetingReagent(
                    rnai_curie, None, self.globaltt['RNAi_reagent'],
                    gene_curie)

                # make the "allele" of the gene
                # that is targeted by the reagent
                allele_id = self.make_reagent_targeted_gene_id(
                    gene_num, rnai_num)
                allele_label = gene_alt_symbol + '<' + rnai_num + '>'
                geno.addReagentTargetedGene(
                    rnai_curie, gene_curie, allele_id, allele_label)

                assoc = G2PAssoc(graph, self.name, allele_id, phenotype_id)
                assoc.add_source('WormBase:' + ref_num)
                # eco_id = 'ECO:0000019'  # RNAi evidence  # TODO unused
                assoc.add_association_to_graph()

            if limit is not None and reader.line_num > limit:
                break