def process_gene_desc(self, limit):
    """
    Process the WormBase gene-description file and add textual
    definitions/descriptions for each gene to the graph.

    The concise description becomes the gene's definition; the
    provisional, automated, detailed, and gene-class descriptions are
    added as free-text descriptions tagged with their type, unless they
    duplicate the concise description, start with 'none', or are empty.

    :param limit: maximum number of data rows to process (None for all)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    gu = GraphUtils(curie_map.get())
    logger.info("Processing Gene descriptions")
    line_counter = 0
    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar='\"')
        for row in filereader:
            # skip comment lines
            if re.match(r'\#', ''.join(row)):
                continue
            line_counter += 1
            if line_counter == 1:
                continue  # skip the header row
            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row
            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            gene_id = 'WormBase:'+gene_num

            if concise_description != 'none available':
                gu.addDefinition(g, gene_id, concise_description)

            # add the other descriptions tagged with their type,
            # skipping any that duplicate the concise description,
            # start with 'none', or are empty
            descs = {
                'provisional': provisional_description,
                'automated': automated_description,
                'detailed': detailed_description,
                'gene class': gene_class_description
            }

            for d in descs:
                text = descs.get(d)
                if text == concise_description \
                        or re.match(r'none', text) or text == '':
                    continue  # don't use it
                text = ' '.join((text, '['+d+']'))
                gu.addDescription(g, gene_id, text)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
def _process_straininfo(self, limit):
    """
    Process the MPD strain-info file, adding each strain as an
    individual of the mouse taxon (NCBITaxon:10090), together with its
    synonyms, vendor cross-references, and panel description.

    :param limit: unused here; the whole file is always processed
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    # FIX: the previous message said "Processing measurements ...",
    # which belongs to a different method
    logger.info("Processing strain info ...")

    raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

    tax_id = 'NCBITaxon:10090'

    gu = GraphUtils(curie_map.get())

    with open(raw, 'r') as f:
        reader = csv.reader(f, delimiter=',', quotechar='\"')
        f.readline()  # read the header row; skip
        for row in reader:
            (strain_name, vendor, stocknum, panel, mpd_strainid,
             straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
            # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
            # create the strain as an instance of the taxon
            if self.testMode and \
                    'MPD:'+str(mpd_strainid) not in self.test_ids:
                continue

            strain_id = 'MPD-strain:'+str(mpd_strainid)
            gu.addIndividualToGraph(g, strain_id, strain_name, tax_id)
            if mpdshortname.strip() != '':
                gu.addSynonym(g, strain_id, mpdshortname.strip())
            self.idlabel_hash[strain_id] = strain_name
            # make it equivalent to the vendor+stock
            if stocknum != '':
                if vendor == 'J':
                    jax_id = 'JAX:'+stocknum
                    gu.addSameIndividual(g, strain_id, jax_id)
                elif vendor == 'Rbrc':
                    # reiken
                    reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                    gu.addSameIndividual(g, strain_id, reiken_id)
                else:
                    if url != '':
                        gu.addXref(g, strain_id, url, True)
                    if vendor != '':
                        gu.addXref(
                            g, strain_id, ':'.join((vendor, stocknum)),
                            True)

            # add the panel information
            if panel != '':
                desc = panel+' [panel]'
                gu.addDescription(g, strain_id, desc)
                # TODO make the panels as a resource collection

    return
def _process_ortholog_classes(self, limit=None):
    """
    This method adds the KEGG orthology classes to the graph.

    Triples created:
    <orthology_class_id> is a class
    <orthology_class_id> has label <orthology_symbols>
    <orthology_class_id> has description <orthology_description>

    :param limit: maximum number of rows to process (None for all)
    :return: None
    """
    logger.info("Processing ortholog classes")
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (orthology_class_id, orthology_class_name) = row

            if self.testMode and \
                    orthology_class_id not in \
                    self.test_ids['ortholog_classes']:
                continue

            # FIXME: What's the proper route for this?
            # The orthology class is essentially a KEGG gene ID
            # that is species agnostic.
            # Add the ID and label as a class.
            # Would it be considered a gene as well?
            other_labels = re.split(';', orthology_class_name)
            # the first one is the label we'll use
            orthology_label = other_labels[0]
            orthology_class_id = 'KEGG-'+orthology_class_id.strip()
            orthology_type = OrthologyAssoc.terms['gene_family']
            gu.addClassToGraph(g, orthology_class_id, orthology_label,
                               orthology_type)
            if len(other_labels) > 1:
                # add the rest as synonyms,
                # FIX: skipping the first, which is already the label
                for s in other_labels[1:]:
                    gu.addSynonym(g, orthology_class_id, s)

                # add the last one as the description
                gu.addDescription(g, orthology_class_id,
                                  other_labels[-1])

            if (not self.testMode) and \
                    (limit is not None and line_counter > limit):
                break

    logger.info("Done with ortholog classes")
    return
def process_gene_interaction(self, limit):
    """
    The gene interaction file includes identified interactions,
    that are between two or more gene (products).
    In the case of interactions with >2 genes, this requires creating
    groups of genes that are involved in the interaction.
    From the wormbase help list: In the example WBInteraction000007779
    it would likely be misleading to suggest that lin-12 interacts with
    (suppresses in this case) smo-1 ALONE or that lin-12 suppresses
    let-60 ALONE; the observation in the paper; see Table V in paper
    PMID:15990876 was that a lin-12 allele (heterozygous lin-12(n941/+))
    could suppress the "multivulva" phenotype induced synthetically by
    simultaneous perturbation of BOTH smo-1 (by RNAi) AND let-60
    (by the n2021 allele).
    So this is necessarily a three-gene interaction.

    Therefore, we can create groups of genes based on their "status" of
    Effector | Effected.

    Status:  IN PROGRESS

    :param limit: maximum number of rows to process (None for all)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    gu = GraphUtils(curie_map.get())
    logger.info("Processing gene interaction associations")
    line_counter = 0

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar="'")

        for row in filereader:
            line_counter += 1
            if re.match(r'#', ''.join(row)):
                continue

            (interaction_num, interaction_type, interaction_subtype,
             summary, citation) = row[0:5]
            # FIX: removed a leftover debugging print(row)
            interaction_id = 'WormBase:'+interaction_num

            # TODO deal with subtypes
            interaction_type_id = None
            if interaction_type == 'Genetic':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'genetically_interacts_with']
            elif interaction_type == 'Physical':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'molecularly_interacts_with']
            elif interaction_type == 'Regulatory':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'regulates']
            else:
                logger.info(
                    "An interaction type I don't understand %s",
                    interaction_type)

            num_interactors = (len(row) - 5) / 3
            if num_interactors != 2:
                logger.info(
                    "Skipping interactions with !=2 participants:\n %s",
                    str(row))
                continue

            gene_a_id = 'WormBase:'+row[5]
            gene_b_id = 'WormBase:'+row[8]

            if self.testMode \
                    and gene_a_id not in self.test_ids['gene'] \
                    and gene_b_id not in self.test_ids['gene']:
                continue

            assoc = InteractionAssoc(
                self.name, gene_a_id, gene_b_id, interaction_type_id)
            assoc.set_association_id(interaction_id)
            assoc.add_association_to_graph(g)
            assoc_id = assoc.get_association_id()
            # citation is not a pmid or WBref - get this some other way
            gu.addDescription(g, assoc_id, summary)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def process_feature_loc(self, limit):
    """
    Process the WormBase feature-location (GFF-style) file, adding
    genes, variations, and other selected sequence features to the
    graph together with their chromosomal start/end locations.
    Also accumulates a strain-to-variant map for later processing.

    :param limit: maximum number of feature rows to process
        (None for all)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    gu = GraphUtils(curie_map.get())
    logger.info("Processing Feature location and attributes")
    line_counter = 0
    geno = Genotype(g)
    strain_to_variant_map = {}
    build_num = self.version_num
    build_id = 'WormBase:'+build_num
    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar='\"')
        for row in filereader:
            if re.match(r'\#', ''.join(row)):
                continue
            (chrom, db, feature_type_label, start, end, score, strand,
             phase, attributes) = row
            # example rows:
            # I interpolated_pmap_position gene 1 559768 . . . ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
            # I WormBase gene 3747 3909 . - . ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
            # I absolute_pmap_position gene 4119 10230 . . . ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

            if feature_type_label not in [
                    'gene', 'point_mutation', 'deletion',
                    'RNAi_reagent', 'duplication', 'enhancer',
                    'binding_site', 'biological_region',
                    'complex_substitution', 'substitution',
                    'insertion', 'inverted_repeat']:
                # note biological_regions include balancers
                # other options here: promoter, regulatory_region,
                # reagent
                continue
            line_counter += 1

            attribute_dict = {}
            if attributes != '':
                attribute_dict = dict(
                    item.split("=")
                    for item in re.sub(
                        r'"', '', attributes).split(";"))

            fid = flabel = desc = None
            if 'ID' in attribute_dict:
                fid = attribute_dict.get('ID')
                if re.search(r'WB(Gene|Var|sf)', fid):
                    fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                elif re.match(r'(gmap|landmark)', fid):
                    continue
                else:
                    logger.info('other identifier %s', fid)
                    fid = None
            elif 'variation' in attribute_dict:
                fid = 'WormBase:'+attribute_dict.get('variation')
                flabel = attribute_dict.get('public_name')
                sub = attribute_dict.get('substitution')
                ins = attribute_dict.get('insertion')
                # if it's a variation:
                # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                desc = ''
                if sub is not None:
                    desc = 'substitution='+sub
                if ins is not None:
                    desc = 'insertion='+ins

                # keep track of the strains with this variation,
                # for later processing
                strain_list = attribute_dict.get('strain')
                if strain_list is not None:
                    for s in re.split(r',', strain_list):
                        if s.strip() not in strain_to_variant_map:
                            strain_to_variant_map[s.strip()] = set()
                        strain_to_variant_map[s.strip()].add(fid)

            # if feature_type_label == 'RNAi_reagent':
            #     Target=WBRNAi00096030 1 4942
            # this will tell us where the RNAi is actually binding
            #     target = attribute_dict.get('Target')  # TODO unused
            #     rnai_num = re.split(r' ', target)[0]  # TODO unused
            # it will be the reagent-targeted-gene that has a position,
            # (i think)
            # TODO finish the RNAi binding location

            name = attribute_dict.get('Name')
            polymorphism = attribute_dict.get('polymorphism')

            if fid is None:
                if name is not None and re.match(r'WBsf', name):
                    fid = 'WormBase:'+name
                    name = None
                else:
                    continue

            if self.testMode \
                    and re.sub(r'WormBase:', '', fid) \
                    not in self.test_ids['gene']+self.test_ids['allele']:
                continue

            # these really aren't that interesting
            if polymorphism is not None:
                continue

            if name is not None and not re.search(name, fid):
                if flabel is None:
                    flabel = name
                else:
                    gu.addSynonym(g, fid, name)

            if desc is not None:
                gu.addDescription(g, fid, desc)

            alias = attribute_dict.get('Alias')

            biotype = attribute_dict.get('biotype')
            note = attribute_dict.get('Note')
            other_name = attribute_dict.get('other_name')
            for n in [alias, other_name]:
                if n is not None:
                    # FIX: previously always added other_name,
                    # even when iterating over alias
                    gu.addSynonym(g, fid, n)

            ftype = self.get_feature_type_by_class_and_biotype(
                feature_type_label, biotype)

            chr_id = makeChromID(chrom, build_id, 'CHR')
            geno.addChromosomeInstance(chrom, build_id, build_num)

            f = Feature(fid, flabel, ftype)
            f.addFeatureStartLocation(start, chr_id, strand)
            # FIX: was passing start to the end location too
            f.addFeatureEndLocation(end, chr_id, strand)

            feature_is_class = False
            if feature_type_label == 'gene':
                feature_is_class = True

            f.addFeatureToGraph(g, True, None, feature_is_class)

            if note is not None:
                gu.addDescription(g, fid, note)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

            # RNAi reagents:
            # I RNAi_primary RNAi_reagent 4184 10232 . + . Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
            # I RNAi_primary RNAi_reagent 4223 10147 . + . Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
            # I RNAi_primary RNAi_reagent 5693 9391 . + . Target=WBRNAi00066135 1 3699 +;laboratory=CH

            # TODO TF binding sites and network:
            # I TF_binding_site_region TF_binding_site 1861 2048 . + . Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
            # I TF_binding_site_region TF_binding_site 3403 4072 . + . Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

    return
def _process_data(self, raw, limit=None):
    """
    This function will process the data files from Coriell.
    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository a CLO_0000008 #repository
    label : NIGMS Human Genetic Cell Repository
    foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id a owl:NamedIndividual
        foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant.
        also there's the sex here

    pub_id mentions cell_line_id

    :param raw: path to the raw csv file to process
    :param limit: maximum number of rows to process (None for all)
    :return: None
    """
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())

    if self.testMode:      # set the graph to build
        g = self.testgraph
    else:
        g = self.graph

    line_counter = 0
    geno = Genotype(g)
    du = DipperUtil()

    gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            if not row:
                pass  # skip empty rows
            else:
                line_counter += 1

                (catalog_id, description, omim_number, sample_type,
                 cell_line_available, dna_in_stock, dna_ref, gender,
                 age, race, ethnicity, affected, karyotype, relprob,
                 mutation, gene, family_id, collection, url,
                 cat_remark, pubmed_ids, family_member, variant_id,
                 dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                # 2,,18343,H**o sapiens

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ##########   BUILD REQUIRED VARIABLES   ##########

                # Make the cell line ID
                cell_line_id = 'Coriell:'+catalog_id.strip()

                # Map the cell/sample type
                cell_type = self._map_cell_type(sample_type)

                # Make a cell line label
                line_label = \
                    collection.partition(' ')[0]+'-'+catalog_id.strip()

                # Map the repository/collection
                repository = self._map_collection(collection)

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from
                # them, we must make sure that the genotype is attached
                # to the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                # famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_person'
                if self.nobnodes:
                    patient_id = ':'+patient_id
                if family_id != '':
                    patient_id = \
                        '-'.join((patient_id, family_id, family_member))
                else:
                    # make an anonymous patient
                    patient_id = \
                        '-'.join((patient_id, catalog_id.strip()))

                # properties of the individual patients: sex, family
                # id, member/relproband, description. descriptions are
                # really long and ugly SCREAMING text, so need to clean
                # up the control cases are so odd with this labeling
                # scheme; but we'll deal with it as-is for now.
                short_desc = (description.split(';')[0]).capitalize()
                if affected == 'Yes':
                    affected = 'affected'
                elif affected == 'No':
                    affected = 'unaffected'
                gender = gender.lower()
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = \
                        ' '.join(
                            (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = \
                        ' '.join(
                            (patient_label.strip(), 'of proband with',
                             short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = 'CLO:0000031'

                gu.addIndividualToGraph(
                    g, cell_line_id, line_label, cell_line_reagent_id)

                # add the equivalent id == dna_ref
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:'+dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    gu.addIndividualToGraph(
                        g, equiv_cell_line, None, cell_line_reagent_id)
                    gu.addSameIndividual(
                        g, cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                gu.addMember(g, repository, cell_line_id)

                if cat_remark != '':
                    gu.addDescription(g, cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                # this would give a BNode that is an instance of Age.
                # but i don't know how to connect
                # the age node to the cell line? we need to ask @mbrush
                # age_id = '_'+re.sub('\s+','_',age)
                # gu.addIndividualToGraph(
                #     g,age_id,age,self.terms['age'])
                # gu.addTriple(
                #     g,age_id,self.properties['has_measurement'],age,
                #     True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                gu.addPerson(g, patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity
                # field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS
                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for
                # now.
                # if race != '':
                #    mapped_race = self._map_race(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.terms['race'],mapped_race)
                #        gu.addSubclass(
                #           g,self.terms['ethnic_group'],mapped_race)

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if family_id != '':
                    family_comp_id = 'CoriellFamily:'+family_id

                    family_label = \
                        ' '.join(('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    gu.addIndividualToGraph(
                        g, family_comp_id, family_label,
                        geno.genoparts['family'])

                    # Add the patient as a member of the family
                    gu.addMemberOf(g, patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal
                # variation - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                if species is None or species == '':
                    species = 'H**o sapiens'
                taxon = self._map_species(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = du.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = \
                        '_'+re.sub('MONARCH:', '',
                                   self.make_id(karyotype))
                    if self.nobnodes:
                        karyotype_id = ':'+karyotype_id
                    # add karyotype as karyotype_variation_complement
                    gu.addIndividualToGraph(
                        g, karyotype_id, karyotype,
                        geno.genoparts[
                            'karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = \
                        self._get_affected_chromosomes_from_karyotype(
                            karyotype)
                    for c in karyo_chrs:
                        chr_id = makeChromID(c, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = \
                            '-'.join((karyotype_id, c))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr'+str(c)
                        f = Feature(
                            karyotype_feature_id,
                            karyotype_feature_label,
                            geno.genoparts['sequence_alteration'])
                        f.addFeatureStartLocation(None, chr_id)
                        f.addFeatureToGraph(g)
                        f.loadAllProperties(g)
                        geno.addParts(
                            karyotype_feature_id, karyotype_id,
                            geno.object_properties[
                                'has_alternate_part'])

                # NOTE(review): vl is only assigned when gene != '';
                # later branches that read vl assume a gene was present
                # on such rows — confirm against source data
                if gene != '':
                    vl = gene+'('+mutation+')'

                # fix the variant_id so it's always in the same order
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' \
                        and not self._is_normal_karyotype(karyotype):
                    mutation = mutation.strip()
                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = \
                            '_' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((vl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_' + variant_id.replace(';', '-')
                    gvc_label = vl
                else:
                    # wildtype?
                    pass

                if gvc_id is not None and gvc_id != karyotype_id \
                        and self.nobnodes:
                    gvc_id = ':'+gvc_id

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = \
                    geno.object_properties['has_alternate_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = \
                        geno.object_properties['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None \
                        and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the
                    # genotype. we don't necessarily know their
                    # zygosity, just that they are part of the genotype
                    # variant ids are from OMIM, so prefix as such we
                    # assume that the sequence alts will be defined in
                    # OMIM not here
                    # TODO sort the variant_id list, if the omim prefix
                    # is the same, then assume it's the locus make a
                    # hashmap of the omim id to variant id list;
                    # then build the genotype hashmap is also useful
                    # for removing the "genes" from the list of
                    # "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for v in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        m = re.match(r'(\d+)\.+(.*)', v.strip())
                        if m is not None and len(m.groups()) == 2:
                            (locus_num, var_num) = m.groups()

                        if locus_num is not None \
                                and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for o in omim_map:
                        # gene_id = 'OMIM:' + o  # TODO unused
                        vslc_id = \
                            '_' + '-'.join(
                                [o + '.' + a
                                 for a in omim_map.get(o)])
                        if self.nobnodes:
                            vslc_id = ':'+vslc_id
                        vslc_label = vl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        gu.addIndividualToGraph(
                            g, vslc_id, vslc_label,
                            geno.genoparts[
                                'variant_single_locus_complement'])
                        for v in omim_map.get(o):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:'+o+'.'+v
                            geno.addSequenceAlteration(allele1_id,
                                                       None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                geno.zygosity['indeterminate'],
                                geno.object_properties[
                                    'has_alternate_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    gu.addType(g, patient_id,
                               geno.genoparts['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id
                    genotype_id = '_geno'+catalog_id.strip()
                    if self.nobnodes:
                        genotype_id = ':'+genotype_id

                # add the gvc
                if gvc_id is not None:
                    gu.addIndividualToGraph(
                        g, gvc_id, gvc_label,
                        geno.genoparts[
                            'genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = \
                                geno.object_properties[
                                    'has_reference_part']
                        else:
                            rel = \
                                geno.object_properties[
                                    'has_alternate_part']
                        geno.addParts(gvc_id, genotype_id, rel)
                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = \
                                '; '.join((gvc_label, karyotype))
                        else:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(
                                karyotype_id, genotype_id,
                                geno.object_properties[
                                    'has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                    # NOTE(review): if gvc_label is None here this
                    # concatenation would raise — presumably gvc_id
                    # non-None implies gvc_label was set; confirm
                    genotype_label += ' ['+catalog_id.strip()+']'

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(
                        genotype_id, genotype_label,
                        geno.genoparts['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    gu.addTriple(
                        g, patient_id,
                        geno.properties['has_genotype'], genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)

                # ##########    DEAL WITH THE DISEASES   ##########

                # we associate the disease to the patient
                if affected == 'affected':
                    if omim_number != '':
                        for d in omim_number.split(';'):
                            if d is not None and d != '':
                                # if the omim number is in omim_map,
                                # then it is a gene not a pheno
                                if d not in omim_map:
                                    disease_id = 'OMIM:'+d.strip()
                                    # assume the label is taken care of
                                    gu.addClassToGraph(g, disease_id,
                                                       None)

                                    # add the association:
                                    # the patient has the disease
                                    assoc = G2PAssoc(
                                        self.name,
                                        patient_id, disease_id)
                                    assoc.add_association_to_graph(g)

                                    # this line is a model of this
                                    # disease
                                    # TODO abstract out model into
                                    # it's own association class?
                                    gu.addTriple(
                                        g, cell_line_id,
                                        gu.properties['model_of'],
                                        disease_id)
                                else:
                                    logger.info(
                                        'removing %s from disease '
                                        'list ' +
                                        'since it is a gene', d)

                # #############    ADD PUBLICATIONS   #############

                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:'+s.strip()
                        ref = Reference(pubmed_id)
                        ref.setType(
                            Reference.ref_types['journal_article'])
                        ref.addRefToGraph(g)
                        gu.addTriple(
                            g, pubmed_id, gu.properties['mentions'],
                            cell_line_id)

                if not self.testMode \
                        and (limit is not None
                             and line_counter > limit):
                    break

        Assoc(self.name).load_all_properties(g)

    return
class Assoc:
    """
    An abstract class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.
    """

    assoc_types = {
        'association': 'OBAN:association'
    }

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
    }

    object_properties = {
        'has_disposition': 'GENO:0000208',
        'has_phenotype': 'RO:0002200',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_object_property',
        # FIX: IAO ids are 7 digits; was the malformed 'IAO:00000136'
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004'
    }

    # merged view of all property curies, keyed by short name
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    OWLCLASS = OWL['Class']
    OWLIND = OWL['NamedIndividual']
    OBJECTPROP = OWL['ObjectProperty']
    ANNOTPROP = OWL['AnnotationProperty']
    DATAPROP = OWL['DatatypeProperty']

    SUBCLASS = RDFS['subClassOf']
    BASE = Namespace(curie_map.get()[''])

    def __init__(self, definedby):
        self.cu = CurieUtil(curie_map.get())
        self.gu = GraphUtils(curie_map.get())
        # core parts of the association
        self.definedby = definedby
        self.sub = self.obj = self.rel = None
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        # this is going to be used for
        # the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def get_properties(self):
        """Return the merged property-curie map for this class."""
        return self.properties

    def _is_valid(self):
        """
        Verify that subject, object, and relation are all set;
        raise ValueError otherwise.
        """
        # check if sub/obj/rel are none...throw error
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')
        return True

    def _add_basic_association_to_graph(self, g):
        """
        Add the direct subject-predicate-object triple plus the
        reified OBAN association node (with description, evidence,
        sources, provenance, and score, when present) to graph g.
        """
        if not self._is_valid():
            return

        # first, add the direct triple
        # anonymous (blank) nodes are indicated with underscore
        s = self.gu.getNode(self.sub)
        o = self.gu.getNode(self.obj)
        p = self.gu.getNode(self.rel)

        if s is None:
            logging.error(
                "Unable to retrieve graph node for Subject %s ",
                self.sub)
            return
        elif p is None:
            logging.error(
                "Unable to retrieve graph node for Predicate %s ",
                self.rel)
            return
        elif o is None:
            logging.error(
                "Unable to retrieve graph node for Object %s ",
                self.obj)
            return
        else:
            g.add((s, p, o))

        if self.assoc_id is None:
            self.set_association_id()

        node = self.gu.getNode(self.assoc_id)
        g.add((node, RDF['type'],
               self.gu.getNode(self.assoc_types['association'])))

        self.gu.addTriple(
            g, self.assoc_id, self.object_properties['has_subject'],
            self.sub)
        self.gu.addTriple(
            g, self.assoc_id, self.object_properties['has_object'],
            self.obj)
        self.gu.addTriple(
            g, self.assoc_id, self.object_properties['has_predicate'],
            self.rel)

        if self.description is not None:
            self.gu.addDescription(g, self.assoc_id, self.description)

        if self.evidence is not None and len(self.evidence) > 0:
            for e in self.evidence:
                self.gu.addTriple(
                    g, self.assoc_id,
                    self.object_properties['has_evidence'], e)

        if self.source is not None and len(self.source) > 0:
            for s in self.source:
                if re.match('http', s):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    self.gu.addTriple(
                        g, self.assoc_id,
                        self.object_properties['has_source'], s, True)
                else:
                    self.gu.addTriple(
                        g, self.assoc_id,
                        self.object_properties['has_source'], s)

        if self.provenance is not None and len(self.provenance) > 0:
            for p in self.provenance:
                self.gu.addTriple(
                    g, self.assoc_id,
                    self.object_properties['has_provenance'], p)

        if self.score is not None:
            self.gu.addTriple(
                g, self.assoc_id, self.properties['has_measurement'],
                Literal(self.score, datatype=XSD['float']), True)
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_association_to_graph(self, g):
        """Public entry point: add this association to graph g."""
        self._add_basic_association_to_graph(g)
        return

    def set_subject(self, identifier):
        self.sub = identifier
        return

    def set_object(self, identifier):
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
        of the association.
        To be used in cases where an external association identifier
        should be used.

        :param assoc_id: optional externally-supplied identifier
        :return: None
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return

    def get_association_id(self):
        return self.assoc_id

    def set_description(self, description):
        self.description = description
        return

    def set_score(self, score, unit=None, score_type=None):
        self.score = score
        self.score_unit = unit
        self.score_type = score_type
        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object
        (maintained as a list)

        :param identifier: evidence curie/id; blank values are ignored
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        TODO we need to greatly expand this function!

        :param identifier: source curie/url; blank values are ignored
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_provenance(self, identifier):
        # blank values are ignored, as with evidence/source
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    def load_all_properties(self, g):
        """Declare all known property curies, typed by kind, in g."""
        props = {
            self.OBJECTPROP: self.object_properties,
            self.ANNOTPROP: self.annotation_properties,
            self.DATAPROP: self.datatype_properties
        }

        for p in props:
            self.gu.loadProperties(g, props[p], p)

        return

    def _get_source_uri(self, pub_id):
        """
        Given some kind of pub_id (which might be a CURIE or url),
        convert it into a proper node.

        :param pub_id:
        :return: source: Well-formed URI for the given identifier
                 (or url)
        """
        source = None
        if re.compile('http').match(pub_id):
            source = URIRef(pub_id)
        else:
            u = self.gu.getNode(pub_id)
            if u is not None:
                source = URIRef(u)
            else:
                logger.error(
                    "An id we don't know how to deal with: %s", pub_id)

        return source

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style
        associations, based on all the parts of the association
        If any of the items is empty or None, it will convert it
        to blank. It effectively md5 hashes the (+)-joined string
        from the values. Subclasses of Assoc can submit an additional
        array of attributes that will be added to the ID.

        :param definedby: The (data) resource that provided
                          the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes:
        :return: a MONARCH-prefixed curie with an md5-hash local part
        """
        # note others available:
        # md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # TEC: at our scale, md5 is in danger of having collisions.

        # putting definedby first,
        # as this will usually be the datasource providing
        # the annotation this will end up making the first few parts
        # of the id be the same for all annotations in that resource
        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash += attributes

        for i, val in enumerate(items_to_hash):
            if val is None:
                items_to_hash[i] = ''

        byte_string = '+'.join(items_to_hash).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(
            ('MONARCH', hashlib.md5(byte_string).hexdigest()))
def _process_ortholog_classes(self, limit=None): """ This method add the KEGG orthology classes to the graph. If there's an embedded enzyme commission number, that is added as an xref. Triples created: <orthology_class_id> is a class <orthology_class_id> has label <orthology_symbols> <orthology_class_id> has description <orthology_description> :param limit: :return: """ logger.info("Processing ortholog classes") if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 gu = GraphUtils(curie_map.get()) raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (orthology_class_id, orthology_class_name) = row if self.testMode and \ orthology_class_id not in \ self.test_ids['orthology_classes']: continue # The orthology class is essentially a KEGG gene ID # that is species agnostic. # Add the ID and label as a gene family class other_labels = re.split(r'[;,]', orthology_class_name) # the first one is the label we'll use orthology_label = other_labels[0] orthology_class_id = 'KEGG-'+orthology_class_id.strip() orthology_type = OrthologyAssoc.terms['gene_family'] gu.addClassToGraph(g, orthology_class_id, orthology_label, orthology_type) if len(other_labels) > 1: # add the rest as synonyms # todo skip the first for s in other_labels: gu.addSynonym(g, orthology_class_id, s.strip()) # add the last one as the description d = other_labels[len(other_labels)-1] gu.addDescription(g, orthology_class_id, d) # add the enzyme commission number (EC:1.2.99.5)as an xref # sometimes there's two, like [EC:1.3.5.1 1.3.5.4] # can also have a dash, like EC:1.10.3.- ec_matches = re.findall(r'((?:\d+|\.|-){5,7})', d) if ec_matches is not None: for ecm in ec_matches: gu.addXref(g, orthology_class_id, 'EC:'+ecm) if not self.testMode and \ limit is not None and line_counter > limit: break logger.info("Done 
with ortholog classes") return
    def _process_data(self, raw, limit=None):
        """
        Process one IMPC statistical-results CSV (gzipped), building for
        each row: the marker/allele/sequence-alteration, the ES-cell colony,
        the sex-agnostic genotype with its VSLC and genomic background,
        the sex-qualified genotype, and finally a G2P association to the
        MP phenotype term with a free-text description.

        :param raw: path to the gzipped csv file to process
        :param limit: optional row cap (ignored in test mode)
        :return:
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        line_counter = 0
        gu.loadAllProperties(g)
        gu.loadObjectProperties(g, geno.object_properties)

        # Add the taxon as a class
        taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
        gu.addClassToGraph(g, taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                (marker_accession_id, marker_symbol, phenotyping_center,
                 colony, sex, zygosity, allele_accession_id, allele_symbol,
                 allele_name, strain_accession_id, strain_name, project_name,
                 project_fullname, pipeline_name, pipeline_stable_id,
                 procedure_stable_id, procedure_name, parameter_stable_id,
                 parameter_name, top_level_mp_term_id, top_level_mp_term_name,
                 mp_term_id, mp_term_name, p_value, percentage_change,
                 effect_size, statistical_method, resource_name) = row

                if self.testMode and marker_accession_id not in self.test_ids:
                    continue

                # #####cleanup some of the identifiers ######
                zygosity_id = self._map_zygosity(zygosity)

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_'+re.sub(r'\W+', '_', colony)
                if self.nobnodes:
                    colony_id = ':'+colony_id

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = \
                        '_IMPC-'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        allele_accession_id = ':'+allele_accession_id
                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    strain_accession_id = '_'+strain_accession_id
                    if self.nobnodes:
                        strain_accession_id = ':'+strain_accession_id
                elif not re.match(r'MGI', strain_accession_id):
                    logger.info(
                        "Found a strange strain accession...%s",
                        strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = \
                        re.match(r'.*<(.*)>', allele_symbol).group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and \
                        marker_accession_id == '':
                    logger.warning(
                        "Marker unspecified on row %d", line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = geno.genoparts['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 geno.genoparts['gene'])
                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id,
                                         marker_accession_id)

                    sequence_alteration_id = \
                        '_seqalt'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        sequence_alteration_id = ':'+sequence_alteration_id
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,
                # with unknown zygosity
                stem_cell_class = 'ERO:0002002'
                gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = \
                    '_'+allele_accession_id+geno.zygosity['indeterminate']
                vslc_colony = re.sub(r':', '', vslc_colony)
                if self.nobnodes:
                    vslc_colony = ':'+vslc_colony
                vslc_colony_label = allele_symbol+'/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'])
                gu.addTriple(
                    g, colony_id,
                    geno.object_properties['has_genotype'],
                    colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = geno.object_properties['has_alternate_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    logger.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '_' + '-'.join((marker_accession_id,
                                          allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                if self.nobnodes:
                    vslc_id = ':'+vslc_id
                gu.addIndividualToGraph(
                    g, vslc_id, vslc_name,
                    geno.genoparts['variant_single_locus_complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    geno.object_properties['has_alternate_part'],
                    allele2_rel)
                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                gu.addType(
                    g, vslc_id,
                    Genotype.genoparts['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        geno.genoparts['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = \
                        strain_name + '/' + phenotyping_center
                    pheno_center_strain_id = \
                        '-'.join((re.sub(r':', '', genomic_background_id),
                                  re.sub(r'\s', '_', phenotyping_center)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        pheno_center_strain_id = '_'+pheno_center_strain_id
                    if self.nobnodes:
                        pheno_center_strain_id = ':'+pheno_center_strain_id
                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     geno.genoparts['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name+' ['+pheno_center_strain_label+']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    # NOTE(review): argument order here is
                    # (strain_id, taxon_id), but the two calls further down
                    # pass (taxon_id, genopart_id).  One of these orderings
                    # is likely swapped -- confirm against the
                    # Genotype.addTaxon signature.
                    geno.addTaxon(pheno_center_strain_id, taxon_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                genotype_name += '['+colony+']'
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id+sex))
                sex_qualified_genotype_label = genotype_name+' ('+sex+')'
                if sex == 'male':
                    sq_type_id = geno.genoparts['male_genotype']
                elif sex == 'female':
                    sq_type_id = geno.genoparts['female_genotype']
                else:
                    sq_type_id = geno.genoparts['sex_qualified_genotype']

                geno.addGenotype(
                    sex_qualified_genotype_id,
                    sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    geno.object_properties['has_alternate_part'])

                if genomic_background_id is not None and \
                        genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender
                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    logger.warning(
                        "No phenotype id specified for row %d: %s",
                        line_counter, str(row))
                    continue
                # experimental_phenotypic_evidence This was used in ZFIN
                eco_id = "ECO:0000059"

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested
                assoc = G2PAssoc(self.name, sex_qualified_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id
                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()

                # add a free-text description
                description = \
                    ' '.join((mp_term_name, 'phenotype determined by',
                              phenotyping_center, 'in an',
                              procedure_name, 'assay where',
                              parameter_name.strip(),
                              'was measured with an effect_size of',
                              str(round(float(effect_size), 5)),
                              '(p =', "{:.4e}".format(float(p_value)), ').'))
                gu.addDescription(g, assoc_id, description)

                # TODO add provenance information
                # resource_id = resource_name
                # assoc.addSource(g, assoc_id, resource_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

        return
class OMIA(Source):
    """
    This is the parser for the
    [Online Mendelian Inheritance in Animals
    (OMIA)](http://www.http://omia.angis.org.au),
    from which we process inherited disorders, other (single-locus) traits,
    and genes in >200 animal species (other than human and mouse and rats).

    We generate the omia graph to include the following information:
    * genes
    * animal taxonomy, and breeds as instances of those taxa
    (breeds are akin to "strains" in other taxa)
    * animal diseases, along with species-specific subtypes of those diseases
    * publications (and their mapping to PMIDs, if available)
    * gene-to-phenotype associations (via an anonymous variant-locus
    * breed-to-phenotype associations

    We make links between OMIA and OMIM in two ways:
    1.  mappings between OMIA and OMIM are created as OMIA --> hasdbXref OMIM
    2.  mappings between a breed and OMIA disease are created to be a
    model for the mapped OMIM disease, IF AND ONLY IF it is a 1:1 mapping.
    there are some 1:many mappings, and these often happen if the OMIM
    item is a gene.

    Because many of these species are not covered in
    the PANTHER orthology datafiles, we also pull any orthology
    relationships from the gene_group files from NCBI.
    """

    files = {
        'data': {
            'file': 'omia.xml.gz',
            'url': 'http://omia.angis.org.au/dumps/omia.xml.gz'},
    }

    def __init__(self):
        Source.__init__(self, 'omia')

        self.load_bindings()

        self.dataset = Dataset(
            'omia', 'Online Mendelian Inheritance in Animals',
            'http://omia.angis.org.au', None, None,
            'http://sydney.edu.au/disclaimer.shtml')

        # internal-key -> external-id lookups, one dict per table
        self.id_hash = {
            'article': {},
            'phene': {},
            'breed': {},
            'taxon': {},
            'gene': {}
        }
        self.label_hash = {}
        self.gu = GraphUtils(curie_map.get())
        # used to store the omia to omim phene mappings
        self.omia_omim_map = {}

        # used to store the unique genes that have phenes
        # (for fetching orthology)
        self.annotated_genes = set()

        self.test_ids = {
            'disease': [
                'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201',
                'OMIA:000810', 'OMIA:001400'],
            'gene': [
                492297, 434, 492296, 3430235, 200685834, 394659996, 200685845,
                28713538, 291822383],
            'taxon': [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825],
            # to be filled in during parsing of breed table
            # for lookup by breed-associations
            'breed': []
        }
        # to store a map of omia ids and any molecular info
        # to write a report for curation
        self.stored_omia_mol_gen = {}
        self.g = self.graph
        self.geno = Genotype(self.g)
        return

    def fetch(self, is_dl_forced=False):
        """
        Fetch the OMIA dump plus the NCBI gene_group file
        (used later for orthology).
        :param is_dl_forced:
        :return:
        """
        self.get_files(is_dl_forced)

        ncbi = NCBIGene()
        # ncbi.fetch()
        gene_group = ncbi.files['gene_group']
        self.fetch_from_url(
            gene_group['url'], '/'.join((ncbi.rawdir, gene_group['file'])),
            False)

        return

    def parse(self, limit=None):
        # names of tables to iterate - probably don't need all these:
        # Article_Breed, Article_Keyword, Article_Gene, Article_Keyword,
        # Article_People, Article_Phene, Articles, Breed, Breed_Phene,
        # Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords,
        # Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People,
        # Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms

        self.scrub()

        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            self.g = self.testgraph
        else:
            self.g = self.graph
        self.geno = Genotype(self.g)

        # we do three passes through the file
        # first process species (two others reference this one)
        self.process_species(limit)

        # then, process the breeds, genes, articles, and other static stuff
        self.process_classes(limit)

        # next process the association data
        self.process_associations(limit)

        # process the vertebrate orthology for genes
        # that are annotated with phenotypes
        ncbi = NCBIGene()
        ncbi.add_orthologs_by_gene_group(self.g, self.annotated_genes)

        self.load_core_bindings()
        self.load_bindings()

        logger.info("Done parsing.")

        self.write_molgen_report()

        return

    def scrub(self):
        """
        The XML file seems to have mixed-encoding;
        we scrub out the control characters
        from the file for processing.
        :return:
        """
        logger.info(
            "Scrubbing out the nasty characters that break our parser.")

        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        tmpfile = '/'.join((self.rawdir, self.files['data']['file']+'.tmp.gz'))
        t = gzip.open(tmpfile, 'wb')
        du = DipperUtil()
        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")
            for l in filereader:
                l = du.remove_control_characters(l) + '\n'
                t.write(l.encode('utf-8'))
        t.close()
        # move the temp file to replace the original
        logger.info("Replacing the original data with the scrubbed file.")
        shutil.move(tmpfile, myfile)
        return

    # ###################### XML LOOPING FUNCTIONS ##################

    def process_species(self, limit):
        """
        Loop through the xml file and process the species.
        We add elements to the graph, and store the
        id-to-label in the label_hash dict.
        :param limit:
        :return:
        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        f = gzip.open(myfile, 'rb')
        filereader = io.TextIOWrapper(f, newline="")

        filereader.readline()  # remove the xml declaration line

        for event, elem in ET.iterparse(filereader):
            # Species ids are == genbank species ids!
            self.process_xml_table(
                elem, 'Species_gb', self._process_species_table_row, limit)

        f.close()

        return

    def process_classes(self, limit):
        """
        Loop through the xml file and process the articles,
        breed, genes, phenes, and phenotype-grouping classes.
        We add elements to the graph,
        and store the id-to-label in the label_hash dict,
        along with the internal key-to-external id in the id_hash dict.
        The latter are referenced in the association processing functions.

        :param limit:
        :return:
        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        f = gzip.open(myfile, 'rb')
        filereader = io.TextIOWrapper(f, newline="")

        filereader.readline()  # remove the xml declaration line

        parser = ET.XMLParser(encoding='utf-8')

        for event, elem in ET.iterparse(filereader, parser=parser):
            self.process_xml_table(
                elem, 'Articles', self._process_article_row, limit)
            self.process_xml_table(
                elem, 'Breed', self._process_breed_row, limit)
            self.process_xml_table(
                elem, 'Genes_gb', self._process_gene_row, limit)
            self.process_xml_table(
                elem, 'OMIA_Group', self._process_omia_group_row, limit)
            self.process_xml_table(
                elem, 'Phene', self._process_phene_row, limit)
            self.process_xml_table(
                elem, 'Omim_Xref', self._process_omia_omim_map, limit)

        f.close()

        # post-process the omia-omim associations to filter out the genes
        # (keep only phenotypes/diseases)
        self.clean_up_omim_genes()

        return

    def process_associations(self, limit):
        """
        Loop through the xml file and process the article-breed,
        article-phene, breed-phene, phene-gene associations,
        and the external links to LIDA.

        :param limit:
        :return:
        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        f = gzip.open(myfile, 'rb')
        filereader = io.TextIOWrapper(f, newline="")

        filereader.readline()  # remove the xml declaration line

        for event, elem in ET.iterparse(filereader):
            self.process_xml_table(
                elem, 'Article_Breed',
                self._process_article_breed_row, limit)
            self.process_xml_table(
                elem, 'Article_Phene',
                self._process_article_phene_row, limit)
            self.process_xml_table(
                elem, 'Breed_Phene', self._process_breed_phene_row, limit)
            self.process_xml_table(
                elem, 'Lida_Links', self._process_lida_links_row, limit)
            self.process_xml_table(
                elem, 'Phene_Gene', self._process_phene_gene_row, limit)
            self.process_xml_table(
                elem, 'Group_MPO', self._process_group_mpo_row, limit)

        f.close()

        return

    # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################

    def _process_species_table_row(self, row):
        # gb_species_id, sci_name, com_name, added_by, date_modified
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        sci_name = row['sci_name']
        com_name = row['com_name']

        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        self.gu.addClassToGraph(self.g, tax_id, sci_name)
        if com_name != '':
            self.gu.addSynonym(self.g, tax_id, com_name)
            self.label_hash[tax_id] = com_name  # for lookup later
        else:
            self.label_hash[tax_id] = sci_name

        return

    def _process_breed_row(self, row):

        # in test mode, keep all breeds of our test species
        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        # save the breed keys in the test_ids for later processing
        self.test_ids['breed'] += [int(row['breed_id'])]

        breed_id = self.make_breed_id(row['breed_id'])

        self.id_hash['breed'][row['breed_id']] = breed_id
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        breed_label = row['breed_name']
        species_label = self.label_hash.get(tax_id)
        if species_label is not None:
            breed_label = breed_label + ' ('+species_label+')'

        self.gu.addIndividualToGraph(self.g, breed_id,
                                     breed_label, tax_id)
        self.label_hash[breed_id] = breed_label

        return

    def _process_phene_row(self, row):

        phenotype_id = None
        sp_phene_label = row['phene_name']
        if sp_phene_label == '':
            sp_phene_label = None
        if 'omia_id' not in row:
            logger.info("omia_id not present for %s", row['phene_id'])
            # NOTE(review): phenotype_id is always None here, so this
            # internal id is built from 'None' -- looks like it should
            # be row['phene_id']; confirm intent.
            omia_id = self._make_internal_id('phene', phenotype_id)
        else:
            omia_id = 'OMIA:'+str(row['omia_id'])

        if self.testMode and not\
                (int(row['gb_species_id']) in self.test_ids['taxon'] and
                 omia_id in self.test_ids['disease']):
            return

        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = omia_id

        descr = row['summary']
        if descr == '':
            descr = None

        # omia label
        omia_label = self.label_hash.get(omia_id)

        # add the species-specific subclass (TODO please review this choice)
        gb_species_id = row['gb_species_id']

        if gb_species_id != '':
            sp_phene_id = '-'.join((omia_id, gb_species_id))
        else:
            logger.error(
                "No species supplied in species-specific phene table for %s",
                omia_id)
            return

        species_id = 'NCBITaxon:'+str(gb_species_id)
        # use this instead
        species_label = self.label_hash.get('NCBITaxon:'+gb_species_id)
        if sp_phene_label is None and \
                omia_label is not None and species_label is not None:
            sp_phene_label = ' '.join((omia_label, 'in', species_label))
        self.gu.addClassToGraph(
            self.g, sp_phene_id, sp_phene_label, omia_id, descr)
        # add to internal hash store for later lookup
        # (overwrites the omia_id stored above with the subclass id)
        self.id_hash['phene'][row['phene_id']] = sp_phene_id
        self.label_hash[sp_phene_id] = sp_phene_label
        # add each of the following descriptions,
        # if they are populated, with a tag at the end.
        for item in [
                'clin_feat', 'history', 'pathology', 'mol_gen', 'control']:
            if row[item] is not None and row[item] != '':
                self.gu.addDescription(
                    self.g, sp_phene_id, row[item] + ' ['+item+']')
        # if row['symbol'] is not None:  # species-specific
        # CHECK ME - sometimes spaces or gene labels
        #     gu.addSynonym(g, sp_phene, row['symbol'])

        self.gu.addOWLPropertyClassRestriction(
            self.g, sp_phene_id,
            self.gu.object_properties['in_taxon'],
            species_id)

        # add inheritance as an association
        inheritance_id = self._map_inheritance_term_id(row['inherit'])
        if inheritance_id is not None:
            assoc = DispositionAssoc(self.name, sp_phene_id, inheritance_id)
            assoc.add_association_to_graph(self.g)

        if row['characterised'] == 'Yes':
            self.stored_omia_mol_gen[omia_id] = {
                'mol_gen': row['mol_gen'],
                'map_info': row['map_info'],
                'species': row['gb_species_id']}

        return

    def write_molgen_report(self):
        import csv
        logger.info("Writing G2P report for OMIA")
        f = '/'.join((self.outdir, 'omia_molgen_report.txt'))

        with open(f, 'w', newline='\n') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            # write header
            h = ['omia_id', 'molecular_description', 'mapping_info',
                 'species']
            writer.writerow(h)
            for phene in self.stored_omia_mol_gen:
                writer.writerow((str(phene),
                                 self.stored_omia_mol_gen[phene]['mol_gen'],
                                 self.stored_omia_mol_gen[phene]['map_info'],
                                 self.stored_omia_mol_gen[phene]['species']))

        logger.info(
            "Wrote %d potential G2P descriptions for curation to %s",
            len(self.stored_omia_mol_gen), f)

        return

    def _process_article_row(self, row):

        # don't bother in test mode
        if self.testMode:
            return

        iarticle_id = self._make_internal_id('article', row['article_id'])
        self.id_hash['article'][row['article_id']] = iarticle_id
        rtype = None
        if row['journal'] != '':
            rtype = Reference.ref_types['journal_article']
        r = Reference(iarticle_id, rtype)

        if row['title'] is not None:
            r.setTitle(row['title'].strip())
        if row['year'] is not None:
            r.setYear(row['year'])
        r.addRefToGraph(self.g)

        if row['pubmed_id'] is not None:
            pmid = 'PMID:'+str(row['pubmed_id'])
            # prefer the pmid as the article's external id from here on
            self.id_hash['article'][row['article_id']] = pmid
            self.gu.addSameIndividual(self.g, iarticle_id, pmid)
            self.gu.addComment(self.g, pmid, iarticle_id)

        return

    def _process_omia_group_row(self, row):
        omia_id = 'OMIA:'+row['omia_id']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        group_name = row['group_name']
        group_summary = row['group_summary']

        disease_id = None
        group_category = row.get('group_category')
        disease_id = \
            self.map_omia_group_category_to_ontology_id(group_category)
        if disease_id is not None:
            self.gu.addClassToGraph(self.g, disease_id, None)
            if disease_id == 'MP:0008762':  # embryonic lethal
                # add this as a phenotype association
                # add embryonic onset
                assoc = D2PAssoc(self.name, omia_id, disease_id)
                assoc.add_association_to_graph(self.g)
                disease_id = None
        else:
            logger.info(
                "No disease superclass defined for %s:  %s",
                omia_id, group_name)
            # default to general disease  FIXME this may not be desired
            disease_id = 'DOID:4'

        if group_summary == '':
            group_summary = None
        if group_name == '':
            group_name = None

        self.gu.addClassToGraph(
            self.g, omia_id, group_name, disease_id, group_summary)

        self.label_hash[omia_id] = group_name

        return

    def _process_gene_row(self, row):
        if self.testMode and row['gene_id'] not in self.test_ids['gene']:
            return
        gene_id = 'NCBIGene:'+str(row['gene_id'])
        self.id_hash['gene'][row['gene_id']] = gene_id
        gene_label = row['symbol']
        self.label_hash[gene_id] = gene_label
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        gene_type_id = NCBIGene.map_type_of_gene(row['gene_type'])
        self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id)
        self.geno.addTaxon(tax_id, gene_id)

        return

    def _process_article_breed_row(self, row):
        # article_id, breed_id, added_by
        # don't bother putting these into the test... too many!

        # and int(row['breed_id']) not in self.test_ids['breed']:
        if self.testMode:
            return

        article_id = self.id_hash['article'].get(row['article_id'])
        breed_id = self.id_hash['breed'].get(row['breed_id'])

        # there's some missing data (article=6038).  in that case skip
        if article_id is not None:
            self.gu.addTriple(
                self.g, article_id,
                self.gu.object_properties['is_about'], breed_id)
        else:
            logger.warning("Missing article key %s", str(row['article_id']))

        return

    def _process_article_phene_row(self, row):
        """
        Linking articles to species-specific phenes.

        :param row:
        :return:
        """
        # article_id, phene_id, added_by
        # look up the article in the hashmap
        phenotype_id = self.id_hash['phene'].get(row['phene_id'])
        article_id = self.id_hash['article'].get(row['article_id'])

        omia_id = self._get_omia_id_from_phene_id(phenotype_id)
        # NOTE(review): `and` binds tighter than `or` here, so the
        # None-checks apply even outside test mode -- presumably
        # intentional, but worth a parenthesization for clarity.
        if self.testMode and omia_id not in self.test_ids['disease'] \
                or phenotype_id is None or article_id is None:
            return

        # make a triple, where the article is about the phenotype
        self.gu.addTriple(
            self.g, article_id,
            self.gu.object_properties['is_about'], phenotype_id)

        return

    def _process_breed_phene_row(self, row):
        # Linking disorders/characteristic to breeds
        # breed_id, phene_id, added_by
        breed_id = self.id_hash['breed'].get(row['breed_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        # get the omia id
        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if (self.testMode and not (
                omia_id in self.test_ids['disease'] and
                int(row['breed_id']) in self.test_ids['breed']) or
                breed_id is None or phene_id is None):
            return

        # FIXME we want a different relationship here
        assoc = G2PAssoc(
            self.name, breed_id, phene_id,
            self.gu.object_properties['has_phenotype'])
        assoc.add_association_to_graph(self.g)

        # add that the breed is a model of the human disease
        # use the omia-omim mappings for this
        # we assume that we have already scrubbed out the genes
        # from the omim list, so we can make the model associations here

        omim_ids = self.omia_omim_map.get(omia_id)
        eco_id = "ECO:0000214"  # biological aspect of descendant evidence
        if omim_ids is not None and len(omim_ids) > 0:
            if len(omim_ids) > 1:
                logger.info(
                    "There's 1:many omia:omim mapping: %s, %s",
                    omia_id, str(omim_ids))
            for i in omim_ids:
                assoc = G2PAssoc(
                    self.name, breed_id, i,
                    self.gu.object_properties['model_of'])
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph(self.g)
                aid = assoc.get_association_id()

                breed_label = self.label_hash.get(breed_id)
                if breed_label is None:
                    breed_label = "this breed"

                m = re.search(r'\((.*)\)', breed_label)
                if m:
                    sp_label = m.group(1)
                else:
                    sp_label = ''

                phene_label = self.label_hash.get(phene_id)
                if phene_label is None:
                    phene_label = "phenotype"
                elif phene_label.endswith(sp_label):
                    # some of the labels we made already include the species;
                    # remove it to make a cleaner desc
                    phene_label = re.sub(r' in '+sp_label, '', phene_label)
                desc = ' '.join(
                    ("High incidence of", phene_label, "in", breed_label,
                     "suggests it to be a model of disease", i + "."))
                self.gu.addDescription(self.g, aid, desc)
        return

    def _process_lida_links_row(self, row):
        # lidaurl, omia_id, added_by
        omia_id = 'OMIA:'+row['omia_id']
        lidaurl = row['lidaurl']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, lidaurl, True)

        return

    def _process_phene_gene_row(self, row):

        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if self.testMode and not (
                omia_id in self.test_ids['disease'] and
                row['gene_id'] in self.test_ids['gene']) or\
                gene_id is None or phene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        if phene_id is None:
            logger.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        vl = '_'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
        if self.nobnodes:
            vl = ':'+vl
        self.geno.addAllele(vl, 'some variant of ' + gene_label)
        self.geno.addAlleleOfGene(vl, gene_id)
        assoc = G2PAssoc(self.name, vl, phene_id)
        assoc.add_association_to_graph(self.g)

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)

        return

    def _process_omia_omim_map(self, row):
        """
        Links OMIA groups to OMIM equivalents.
        :param row:
        :return:
        """
        # omia_id, omim_id, added_by
        omia_id = 'OMIA:'+row['omia_id']
        omim_id = 'OMIM:'+row['omim_id']

        # also store this for use when we say that a given animal is
        # a model of a disease
        if omia_id not in self.omia_omim_map:
            self.omia_omim_map[omia_id] = set()
        self.omia_omim_map[omia_id].add(omim_id)

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, omim_id)

        return

    def map_omia_group_category_to_ontology_id(self, category_num):
        """
        Using the category number in the OMIA_groups table,
        map them to a disease id.
        This may be superceeded by other MONDO methods.

        Platelet disorders will be more specific once
        https://github.com/obophenotype/human-disease-ontology/issues/46
        is fulfilled.

        :param category_num:
        :return:
        """

        category_map = {
            1: 'DOID:0014667',      # Inborn error of metabolism
            2: 'MESH:D004392',      # Dwarfism
            3: 'DOID:1682',         # congenital heart disease
            4: 'DOID:74',           # blood system disease
            5: 'DOID:3211',         # lysosomal storage disease
            6: 'DOID:16',           # integumentary system disease
            # --> retinal degeneration ==> OMIA:000830
            7: 'DOID:8466',         # progressive retinal atrophy
            8: 'DOID:0050572',      # Cone–rod dystrophy
            9: 'MESH:C536122',      # stationary night blindness
            10: 'Orphanet:98553',   # developmental retinal disorder
            11: 'DOID:5679',        # retinal disorder
            12: 'Orphanet:90771',   # Disorder of Sex Development
            #  - what to do about this one?
            13: 'MP:0008762',       # embryonic lethal
            #  - not sure what to do with this
            14: None,               # blood group
            # FIXME make me more specific
            15: 'DOID:2218',        # intrinsic platelet disorder
            # FIXME make me more specific
            16: 'DOID:2218',        # extrinsic platelet disorder
            17: None                # transgenic ???
        }

        disease_id = None
        if category_num is not None and int(category_num) in category_map:
            disease_id = category_map.get(int(category_num))
            logger.info(
                "Found %s for category %s",
                str(disease_id), str(category_num))
        else:
            logger.info(
                "There's a group category I don't know anything about: %s",
                str(category_num))

        return disease_id

    def _process_group_mpo_row(self, row):
        """
        Make OMIA to MP associations
        :param row:
        :return:
        """
        omia_id = 'OMIA:'+row['omia_id']
        mpo_num = int(row['MPO_no'])
        mpo_id = 'MP:'+str(mpo_num).zfill(7)

        assoc = D2PAssoc(self.name, omia_id, mpo_id)
        assoc.add_association_to_graph(self.g)

        return

    def clean_up_omim_genes(self):
        omim = OMIM()
        # get all the omim ids
        allomimids = set()
        for omia in self.omia_omim_map:
            allomimids.update(self.omia_omim_map[omia])

        entries_that_are_phenotypes = omim.process_entries(
            list(allomimids), filter_keep_phenotype_entry_ids, None, None)
        logger.info(
            "Filtered out %d/%d entries that are genes or features",
            len(allomimids)-len(entries_that_are_phenotypes), len(allomimids))

        # now iterate again and remove those non-phenotype ids
        removed_count = 0
        for omia in self.omia_omim_map:
            ids = self.omia_omim_map[omia]
            cleanids = set()
            for i in ids:
                if i in entries_that_are_phenotypes:
                    cleanids.add(i)
                else:
                    removed_count += 1  # keep track of how many we've removed
            self.omia_omim_map[omia] = cleanids

        logger.info(
            "Removed %d omim ids from the omia-to-omim map", removed_count)

        return

    def _make_internal_id(self, prefix, key):
        iid = '_'+''.join(('omia', prefix, 'key', str(key)))
        if self.nobnodes:
            iid = ':'+iid

        return iid

    def make_breed_id(self, key):
        breed_id = 'OMIA-breed:'+str(key)

        return breed_id

    @staticmethod
    def _get_omia_id_from_phene_id(phene_id):
        omia_id = None
        if phene_id is not None:
            m = re.match(r'OMIA:\d+', str(phene_id))
            if m:
                omia_id = m.group(0)

        return omia_id

    @staticmethod
    def _map_inheritance_term_id(inheritance_symbol):

        inherit_map = {
            'A':  None,  # Autosomal
            'ACD': 'GENO:0000143',  # Autosomal co-dominant
            'ADV': None,  # autosomal dominant with variable expressivity
            'AID': 'GENO:0000259',  # autosomal incompletely dominant
            'ASD': 'GENO:0000145',  # autosomal semi-dominant
            # autosomal recessive, semi-lethal
            # using generic autosomal recessive
            'ASL': 'GENO:0000150',
            'D': 'GENO:0000147',  # autosomal dominant
            'M': None,  # multifactorial
            'MAT': None,  # Maternal
            # probably autosomal recessive
            # using generic autosomal recessive
            'PR':  'GENO:0000150',
            'R': 'GENO:0000150',  # Autosomal Recessive
            # Recessive Embryonic Lethal
            # using plain recessive
            'REL': 'GENO:0000148',
            # Autosomal Recessive Lethal
            # using plain autosomal recessive
            'RL': 'GENO:0000150',
            'S': 'GENO:0000146',  # Sex-linked   <--using allosomal dominant
            'SLi': None,  # Sex-limited
            'UD': 'GENO:0000144',  # Dominant
            'X': None,  # x-linked    # HP:0001417 ?
# X-linked Dominant <-- temp using allosomal dominant FIXME 'XLD': 'GENO:0000146', # X-linked Recessive <-- temp using allosomal recessive FIXME 'XLR': 'GENO:0000149', 'Y': None, # Y-linked 'Z': None, # Z-linked # Z-linked recessive <-- temp using allosomal recessive FIXME 'ZR': 'GENO:0000149', '999': None, # Z-linked incompletely dominant } inheritance_id = inherit_map.get(inheritance_symbol) if inheritance_id is None and inheritance_symbol is not None: logger.warning( "No inheritance id is mapped for %s", inheritance_symbol) return inheritance_id def getTestSuite(self): import unittest from tests.test_omia import OMIATestCase test_suite = unittest.TestLoader().loadTestsFromTestCase(OMIATestCase) return test_suite
def process_gaf(self, file, limit, id_map=None): if self.testMode: g = self.testgraph else: g = self.graph gu = GraphUtils(curie_map.get()) geno = Genotype(g) logger.info("Processing Gene Associations from %s", file) line_counter = 0 zfin = wbase = None if 7955 in self.tax_ids: zfin = ZFIN() elif 6239 in self.tax_ids: wbase = WormBase() with gzip.open(file, 'rb') as csvfile: filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 # comments start with exclamation if re.match(r'!', ''.join(row)): continue (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol, with_or_from, aspect, gene_name, gene_synonym, object_type, taxon, date, assigned_by, annotation_extension, gene_product_form_id) = row # test for required fields if (db == '' or gene_num == '' or gene_symbol == '' or go_id == '' or ref == '' or eco_symbol == '' or aspect == '' or object_type == '' or taxon == '' or date == '' or assigned_by == ''): logger.error( "Missing required part of annotation " + "on row %d:\n"+'\t'.join(row), line_counter) continue # deal with qualifier NOT, contributes_to, colocalizes_with if re.search(r'NOT', qualifier): continue db = self.clean_db_prefix(db) uniprotid = None gene_id = None if db == 'UniProtKB': mapped_ids = id_map.get(gene_num) if id_map is not None and mapped_ids is not None: if len(mapped_ids) == 1: gene_id = mapped_ids[0] uniprotid = ':'.join((db, gene_num)) gene_num = re.sub(r'\w+\:', '', gene_id) elif len(mapped_ids) > 1: # logger.warning( # "Skipping gene id mapped for >1 gene %s -> %s", # gene_num, str(mapped_ids)) continue else: continue elif db == 'MGI': gene_num = re.sub(r'MGI:', '', gene_num) gene_id = ':'.join((db, gene_num)) gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id) else: gene_id = ':'.join((db, gene_num)) if self.testMode \ and not( re.match(r'NCBIGene', gene_id) and int(gene_num) in self.test_ids): continue gu.addClassToGraph(g, gene_id, gene_symbol) if 
gene_name != '': gu.addDescription(g, gene_id, gene_name) if gene_synonym != '': for s in re.split(r'\|', gene_synonym): gu.addSynonym(g, gene_id, s.strip()) if re.search(r'\|', taxon): # TODO add annotations with >1 taxon logger.info(">1 taxon (%s) on line %d. skipping", taxon, line_counter) else: tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon) geno.addTaxon(tax_id, gene_id) assoc = Assoc(self.name) assoc.set_subject(gene_id) assoc.set_object(go_id) eco_id = self.map_go_evidence_code_to_eco(eco_symbol) if eco_id is not None: assoc.add_evidence(eco_id) refs = re.split(r'\|', ref) for r in refs: r = r.strip() if r != '': prefix = re.split(r':', r)[0] r = re.sub(prefix, self.clean_db_prefix(prefix), r) r = re.sub(r'MGI\:MGI\:', 'MGI:', r) ref = Reference(r) if re.match(r'PMID', r): ref_type = Reference.ref_types['journal_article'] ref.setType(ref_type) ref.addRefToGraph(g) assoc.add_source(r) # TODO add the source of the annotations from assigned by? aspect_rel_map = { 'P': gu.object_properties['involved_in'], # involved in 'F': gu.object_properties['enables'], # enables 'C': gu.object_properties['part_of'] # part of } if aspect not in aspect_rel_map: logger.error("Aspect not recognized: %s", aspect) rel = aspect_rel_map.get(aspect) if aspect == 'F' and re.search(r'contributes_to', qualifier): rel = gu.object_properties['contributes_to'] assoc.set_relationship(rel) if uniprotid is not None: assoc.set_description('Mapped from '+uniprotid) # object_type should be one of: # protein_complex; protein; transcript; ncRNA; rRNA; tRNA; # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology. # If the precise product type is unknown, # gene_product should be used assoc.add_association_to_graph(g) # Derive G2P Associations from IMP annotations # in version 2.1 Pipe will indicate 'OR' # and Comma will indicate 'AND'. 
# in version 2.0, multiple values are separated by pipes # where the pipe has been used to mean 'AND' if eco_symbol == 'IMP' and with_or_from != '': withitems = re.split(r'\|', with_or_from) phenotypeid = go_id+'PHENOTYPE' # create phenotype associations for i in withitems: if i == '' or \ re.match( r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i): logger.warning( "Don't know what having a uniprot id " + "in the 'with' column means of %s", uniprotid) continue i = re.sub(r'MGI\:MGI\:', 'MGI:', i) i = re.sub(r'WB:', 'WormBase:', i) # for worms and fish, they might give a RNAi or MORPH # in these cases make a reagent-targeted gene if re.search('MRPHLNO|CRISPR|TALEN', i): targeted_gene_id = zfin.make_targeted_gene_id( gene_id, i, self.nobnodes) geno.addReagentTargetedGene(i, gene_id, targeted_gene_id) # TODO PYLINT why is this: # Redefinition of assoc type from # dipper.models.assoc.Association.Assoc to # dipper.models.assoc.G2PAssoc.G2PAssoc assoc = G2PAssoc(self.name, targeted_gene_id, phenotypeid) elif re.search(r'WBRNAi', i): targeted_gene_id = \ wbase.make_reagent_targeted_gene_id( gene_id, i, self.nobnodes) geno.addReagentTargetedGene( i, gene_id, targeted_gene_id) assoc = G2PAssoc( self.name, targeted_gene_id, phenotypeid) else: assoc = G2PAssoc(self.name, i, phenotypeid) for r in refs: r = r.strip() if r != '': prefix = re.split(r':', r)[0] r = re.sub( prefix, self.clean_db_prefix(prefix), r) r = re.sub(r'MGI\:MGI\:', 'MGI:', r) assoc.add_source(r) # experimental phenotypic evidence assoc.add_evidence("ECO:0000059") assoc.add_association_to_graph(g, self.nobnodes) # TODO should the G2PAssoc be # the evidence for the GO assoc? if not self.testMode and \ limit is not None and line_counter > limit: break return
class Dataset:
    """
    this will produce the metadata about a dataset
    following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap)
    """

    # extra namespaces (beyond the rdflib-provided core bindings below)
    namespaces = {
        'dctypes': 'http://purl.org/dc/dcmitype/',
        'pav': 'http://purl.org/pav/',
        'dcat': 'http://www.w3.org/ns/dcat#'
    }

    core_bindings = {'rdf': RDF, 'foaf': FOAF, 'xsd': XSD, 'dct': DCTERMS}

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None):
        """
        Build the initial dataset-description graph: type, title, identifier
        and page triples, plus optional license/rights/description.

        :param identifier: local identifier (curie-style suffix)
        :param title: human-readable dataset title
        :param url: landing page for the dataset
        :param description: optional free-text description
        :param license_url: optional license URI
        :param data_rights: optional rights statement (plain literal)
        """
        DCTYPES = Namespace(self.namespaces['dctypes'])
        self.gu = GraphUtils(curie_map.get())
        self.identifier = URIRef(':'+identifier)
        self.version = None
        self.date_issued = None
        self.date_accessed = None
        self.citation = set()
        self.set_access_date()
        self.license = license_url
        self.graph = Graph()
        self.load_bindings()
        self.graph.add((self.identifier, RDF['type'], DCTYPES['Dataset']))
        self.graph.add((self.identifier, DCTERMS['title'], Literal(title)))
        self.graph.add(
            (self.identifier, DCTERMS['identifier'], Literal(identifier)))
        self.graph.add((self.identifier, FOAF['page'], URIRef(url)))
        self.dipperized_version = URIRef('monarch'+str(self.date_accessed))
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.add(
                (self.identifier, DCTERMS['license'], URIRef(license_url)))
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.add(
                (self.identifier, DCTERMS['rights'], Literal(data_rights)))
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.gu.addDescription(self.graph, self.identifier, description)
        return

    def load_bindings(self):
        # bind both the rdflib core namespaces and our extras on the graph,
        # so serializations use readable prefixes
        for k in self.core_bindings:
            v = self.core_bindings[k]
            self.graph.bind(k, v)

        for k in self.namespaces.keys():
            v = self.namespaces[k]
            self.graph.bind(k, Namespace(v))
        return

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
        should use the other set_* for version and date

        # TODO set as deprecated

        :param date_issued:
        :param version_id:
        :return:
        """
        if date_issued is not None:
            self.set_date_issued(date_issued)
        elif version_id is not None:
            # this shouldn't happen
            self.set_version_by_num(version_id)
        else:
            logger.error("No date or version set!")
            # TODO throw error
            return

        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)
        return

    def set_date_issued(self, date_issued):
        # record the issued date both on the instance and as a triple
        self.date_issued = date_issued
        self.graph.add(
            (self.identifier, DCTERMS['issued'], Literal(date_issued)))
        logger.info("setting date to %s", date_issued)
        return

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return:
        """

        if date_issued is not None:
            d = date_issued
        elif self.date_issued is not None:
            d = self.date_issued
        else:
            # fall back to the timestamp captured in set_access_date()
            d = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(d)

        return

    def set_version_by_num(self, version_num):
        # record the versioned resource and link it back to the dataset
        PAV = Namespace(self.namespaces['pav'])

        self.version = URIRef(self.identifier+version_num)
        self.graph.add(
            (self.version, DCTERMS['isVersionOf'], self.identifier))
        self.graph.add((self.version, PAV['version'], Literal(version_num)))

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            self.dipperized_version = URIRef(
                'monarch'+str(self.date_accessed))
            self.graph.add(
                (self.dipperized_version, DCTERMS['isVersionOf'],
                 self.version))
            self.graph.add(
                (self.dipperized_version, PAV['version'],
                 Literal(self.date_accessed)))
            self.graph.add(
                (self.dipperized_version, DCTERMS['issued'],
                 Literal(self.date_accessed, datatype=XSD.dateTime)))
        return

    def set_access_date(self):
        # stamp the download/access time (local clock, minute resolution)
        t = datetime.now()
        t_string = t.strftime("%Y-%m-%d-%H-%M")
        d = t_string
        self.date_accessed = d
        logger.info("Setting date of access to %s", self.date_accessed)
        return

    def setFileAccessUrl(self, url):
        # record where the raw file was fetched from
        DCAT = Namespace(self.namespaces['dcat'])
        self.graph.add((self.identifier, DCAT['accessURL'], URIRef(url)))
        return

    def getGraph(self):
        # the accumulated rdflib Graph of dataset metadata
        return self.graph

    def set_license(self, license):
        self.license = license
        return

    def get_license(self):
        return self.license

    def set_citation(self, citation_id):
        self.citation.add(citation_id)
        # TODO
        # gu.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
        return