def __init__(self):
    """
    Set up the 'mpd' source.

    Initialises the ``Source`` base, merges the curie map into
    ``self.namespaces``, creates the ``Dataset`` record together with its
    citation, and prepares the empty lookup tables and the graph helpers
    (``self.geno``, ``self.gu``) kept on the instance.
    """
    Source.__init__(self, 'mpd')

    # @N, not sure if this step is required
    self.namespaces.update(curie_map.get())

    # update the dataset object with details about this resource
    # @N: Note that there is no license as far as I can tell
    self.dataset = Dataset(
        'mpd',
        'MPD',
        'http://phenome.jax.org',
        None,
        None,
    )
    # TODO add a citation for mpd dataset as a whole
    self.dataset.set_citation('PMID:15619963')

    self.stdevthreshold = 2
    self.nobnodes = True  # FIXME

    # plain lookup tables, filled in while parsing
    self.assayhash = {}
    self.idlabel_hash = {}
    # to store the mean/zscore of each measure by strain+sex
    self.score_means_by_measure = {}
    # to store the mean value for each measure by strain+sex
    self.strain_scores_by_measure = {}

    self.geno = Genotype(self.graph)
    self.gu = GraphUtils(curie_map.get())
def load_bindings(self):
    """
    Bind namespace prefixes on both the main graph and the test graph.

    The core bindings are loaded first through ``load_core_bindings``.
    Then, for each graph, every prefix in ``self.namespaces`` is bound,
    followed by every prefix in the curie map.
    """
    self.load_core_bindings()

    # The curie map does not depend on the graph or on the prefix being
    # bound, so fetch it once instead of once per key per graph.
    curies = curie_map.get()

    for g in (self.graph, self.testgraph):
        for prefix, iri in self.namespaces.items():
            g.bind(prefix, Namespace(iri))
        for prefix, iri in curies.items():
            g.bind(prefix, Namespace(iri))
    return
def __init__(self):
    """
    Set up the 'ctd' source.

    Creates the ``Dataset`` record, reads the optional gene and disease
    test ids from the configuration (warning and falling back to an empty
    list when they are not configured), and creates the graph helpers.
    """
    Source.__init__(self, 'ctd')
    self.dataset = Dataset(
        'ctd',
        'CTD',
        'http://ctdbase.org',
        None,
        'http://ctdbase.org/about/legal.jsp',
    )

    # gene test ids
    if 'test_ids' in config.get_config() \
            and 'gene' in config.get_config()['test_ids']:
        self.test_geneids = config.get_config()['test_ids']['gene']
    else:
        logger.warning("not configured with gene test ids.")
        self.test_geneids = []

    # disease test ids
    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_diseaseids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.gu = GraphUtils(curie_map.get())
    # self.g is an alias of the main graph
    self.g = self.graph
    self.geno = Genotype(self.g)
def _parse_curated_chem_disease(self, limit):
    """
    Read the curated chemical-disease publications file and add one
    association per data row to ``self.g``.

    For every row the chemical (as ``MESH:<id>``) and the disease are added
    as classes, the publication (when present) is added as a journal
    article reference, and ``self._make_association`` is called with the
    chemical, the disease, the relationship derived from the evidence
    column, and the list of publication ids.

    :param limit: stop after this many data rows (ignored in test mode);
        None means no limit
    :return: None
    """
    line_counter = 0
    file_path = '/'.join(
        (self.rawdir, self.static_files['publications']['file']))
    gu = GraphUtils(curie_map.get())
    with open(file_path, 'r') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            # catch comment lines
            if re.match('^#', ' '.join(row)):
                continue
            line_counter += 1
            self._check_list_len(row, 10)
            (pub_id, disease_label, disease_id, disease_cat, evidence,
             chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

            rel_id = self._get_relationship_id(evidence)

            # Prefix the chemical exactly once; the same curie is used for
            # the class and for the association subject.
            chem_id = 'MESH:' + chem_id
            gu.addClassToGraph(self.g, chem_id, chem_label)
            gu.addClassToGraph(self.g, disease_id, None)

            # Collect the publication curie (prefixed exactly once). Rows
            # without a publication yield an empty list rather than
            # attempting 'PMID:' + None.
            # NOTE(review): assumes _make_association accepts an empty
            # list of publications -- confirm against its definition.
            pub_ids = []
            if pub_id != '':
                pub_id = 'PMID:' + pub_id
                r = Reference(
                    pub_id, Reference.ref_types['journal_article'])
                r.addRefToGraph(self.g)
                pub_ids.append(pub_id)

            self._make_association(chem_id, disease_id, rel_id, pub_ids)

            if not self.testMode and limit is not None \
                    and line_counter >= limit:
                break
    return
def _process_collection(self, collection_id, label, page):
    """
    Add the repository (collection) described by the arguments to both
    the main graph and the test graph.

    The repository id is ``CoriellCollection:<collection_id>``. On each
    graph it is added as an individual of type ``self.terms['collection']``
    with the given label, and the page is attached to it.

    :param collection_id: local id of the collection (string)
    :param label: label for the repository
    :param page: page for the repository
    :return: None
    """
    # ############# BUILD THE CELL LINE REPOSITORY #############
    # FIXME: How to devise a label for each repository?
    # The helper and the id are the same for both graphs, so build them
    # once rather than on every loop iteration.
    gu = GraphUtils(curie_map.get())
    repo_id = 'CoriellCollection:' + collection_id

    for g in (self.graph, self.testgraph):
        gu.addIndividualToGraph(
            g, repo_id, label, self.terms['collection'])
        gu.addPage(g, repo_id, page)
    return
def declareAsOntology(self, graph):
    """
    Declare the output file as an ontology on the given graph, including
    its version information.

    The ontology id is ``MonarchData:<name>.ttl``. The version string is
    ``<name>-<timestamp>`` (timestamp formatted year-month-day-hour-minute
    from ``datetime.now()``), and the version IRI is
    ``MonarchArchive:<version>.ttl``.

    :param graph: graph to add the declaration to
    :return: None
    """
    # <http://data.monarchinitiative.org/ttl/biogrid.ttl> a owl:Ontology ;
    # owl:versionInfo
    # <http://archive.monarchinitiative.org/ttl/biogrid-YYYY-MM-DD.ttl>
    gu = GraphUtils(curie_map.get())

    ontology_file_id = ''.join(('MonarchData:', self.name, ".ttl"))
    gu.addOntologyDeclaration(graph, ontology_file_id)

    # add timestamp as version info
    stamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
    ontology_version = '-'.join((self.name, stamp))
    archive_url = ''.join(('MonarchArchive:', ontology_version, '.ttl'))

    gu.addOWLVersionIRI(graph, ontology_file_id, archive_url)
    gu.addOWLVersionInfo(graph, ontology_file_id, ontology_version)
    # TODO make sure this is synced with the Dataset class
def _map_rel_id(orphanet_rel_id):
    """
    Translate an Orphanet disease-gene association type id into the
    relationship used for it.

    :param orphanet_rel_id: association type id, as a string
    :return: the mapped object property, or None when the id is mapped to
        nothing ("18273") or is not known (an error is logged in that case)
    """
    # TODO check if these ids are stable for mapping
    properties = GraphUtils(curie_map.get()).object_properties
    has_phenotype = properties["has_phenotype"]
    contributes_to = properties["contributes_to"]

    id_map = {
        "17949": has_phenotype,   # Disease-causing germline mutation(s) in
        "17955": has_phenotype,   # Disease-causing somatic mutation(s) in
        "17961": contributes_to,  # Major susceptibility factor in
        "17967": contributes_to,  # Modifying germline mutation in
        "17973": contributes_to,  # Modifying somatic mutation in
        "17979": contributes_to,  # Part of a fusion gene in
        "17985": contributes_to,  # Role in the phenotype of
        "18273": None,            # Candidate gene tested in  FIXME?
        # Disease-causing germline mutation(s) (loss of function) in
        "25972": has_phenotype,
        # Disease-causing germline mutation(s) (gain of function) in
        "25979": has_phenotype,
    }

    if orphanet_rel_id not in id_map:
        logger.error("Disease-gene association type (%s) not mapped.",
                     orphanet_rel_id)
        return None
    return id_map[orphanet_rel_id]
def _getNode(self, curie):
    """
    Create an RDFLib node from a curie or IRI given as a string.

    * An id starting with an underscore is a blank node: it is skolemized
      through ``self.skolemizeBlankNode`` when ``self.are_bnodes_skized``
      is True, otherwise it becomes a ``BNode`` whose id has the leading
      ``_:`` (or ``_``) removed.
    * An id starting with ``http`` or ``ftp`` is used as an IRI directly.
    * Anything else is expanded with ``RDFGraph.curie_util``; on success
      the curie's prefix is also bound on this graph when it is not bound
      yet.

    :param curie: str identifier formatted as curie or iri
    :return: node: RDFLib URIRef or BNode object, or None (after logging
        an error) when the curie cannot be expanded
    """
    node = None
    if re.match(r'^_', curie):
        if self.are_bnodes_skized is True:
            node = self.skolemizeBlankNode(curie)
        else:
            # replace the leading underscore to make it cleaner
            # (count is passed by keyword; the positional form is
            # deprecated in recent Python versions)
            node = BNode(re.sub(r'^_:|^_', '', curie, count=1))
    # Check if curie actually an IRI
    elif re.match(r'^http|^ftp', curie):
        node = URIRef(curie)
    else:
        iri = RDFGraph.curie_util.get_uri(curie)
        if iri is not None:
            # reuse the expansion instead of resolving the curie twice
            node = URIRef(iri)
            # Bind prefix map to graph
            prefix = curie.split(':')[0]
            # namespaces() yields (prefix, namespace) pairs, so the prefix
            # string has to be compared against the first element of each
            # pair; testing it against the pairs themselves never matches.
            bound_prefixes = {
                pfx for pfx, _ in self.namespace_manager.namespaces()}
            if prefix not in bound_prefixes:
                mapped_iri = curie_map.get()[prefix]
                self.bind(prefix, Namespace(mapped_iri))
        else:
            logger.error("couldn't make URI for %s", curie)
    return node
def _map_eom_terms(self, raw, limit=None):
    """
    Read the HP id mappings from the local tab-separated file.

    Triples:
        <eom id> owl:equivalentClass <hp id>

    The first line of the file is a header and is skipped. Rows whose HP
    column does not contain an ``HP:`` id (after underscores are turned
    into colons) only produce a warning.

    :param raw: path of the mapping file
    :param limit: stop once more than this many rows were read;
        None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    rows_seen = 0
    with open(raw, 'r') as mapping_file:
        mapping_file.readline()  # read the header row; skip
        for record in mapping_file:
            rows_seen += 1

            (morphology_term_id, morphology_term_label,
             hp_id, hp_label, notes) = record.split('\t')

            # Sub out the underscores for colons.
            hp_id = hp_id.replace('_', ':')

            if 'HP:' not in hp_id:
                logger.warning('No matching HP term for %s',
                               morphology_term_label)
            else:
                # add the HP term as a class
                gu.addClassToGraph(self.graph, hp_id, None)
                # Add the HP ID as an equivalent class
                gu.addEquivalentClass(
                    self.graph, morphology_term_id, hp_id)

            if limit is not None and rows_seen > limit:
                break
def _map_rel_id(orphanet_rel_id):
    """
    Look up the relationship for an Orphanet disease-gene association
    type id.

    :param orphanet_rel_id: association type id, as a string
    :return: the mapped object property; None for the id that is mapped to
        nothing ('18273') and, after logging an error, for unknown ids
    """
    # TODO check if these ids are stable for mapping
    gu = GraphUtils(curie_map.get())

    # ids grouped by the relationship they map to
    has_phenotype_ids = (
        '17949',  # Disease-causing germline mutation(s) in
        '17955',  # Disease-causing somatic mutation(s) in
        # NOTE(review): the two labels below are carried over as found;
        # verify against Orphanet which of 25979/25972 is the loss- and
        # which the gain-of-function type.
        '25979',  # Disease-causing germline mutation(s) (loss of function) in
        '25972',  # Disease-causing germline mutation(s) (gain of function) in
    )
    contributes_to_ids = (
        '17961',  # Major susceptibility factor in
        '17967',  # Modifying germline mutation in
        '17973',  # Modifying somatic mutation in
        '17979',  # Part of a fusion gene in
        '17985',  # Role in the phenotype of
    )

    id_map = dict.fromkeys(
        has_phenotype_ids, gu.object_properties['has_phenotype'])
    id_map.update(dict.fromkeys(
        contributes_to_ids, gu.object_properties['contributes_to']))
    id_map['18273'] = None  # Candidate gene tested in  FIXME?

    try:
        return id_map[orphanet_rel_id]
    except KeyError:
        logger.error(
            'Disease-gene association type (%s) not mapped.',
            orphanet_rel_id)
        return None
def setUp(self):
    """
    Prepare the fixture values: the association and evidence-code curies,
    one sample data row, the study/evidence blank-node curies, and the
    association IRI expanded from its curie.
    """
    self.assoc_curie = 'MONARCH:test_association'
    self.eco_id = 'ECO:0000015'

    self.test_set_1 = (
        'MGI:1920145',
        'Setd5',
        'WTSI',
        'MEFW',
        'male',
        'heterozygote',
        'MGI:4432631',
        'Setd5<tm1a(EUCOMM)Wtsi>',
        'targeted mutation 1a, Wellcome Trust Sanger Institute',
        'MGI:2159965',
        'C57BL/6N',
        'MGP',
        'Wellcome Trust Sanger Institute Mouse Genetics Project',
        'MGP Select Pipeline',
        'MGP_001',
        'MGP_XRY_001',
        'X-ray',
        'IMPC_XRY_008_001',
        'Number of ribs right',
        'MP:0005390',
        'skeleton phenotype',
        'MP:0000480',
        'increased rib number',
        '1.637023E-010',
        '',
        '8.885439E-007',
        'Wilcoxon rank sum test with continuity correction',
        'IMPC',
    )

    # Generate test curies, these are otherwise generated
    # within _add_evidence() and _add_study_provenance()
    self.study_curie = "_:study"
    self.evidence_curie = "_:evidence"

    # IRIs for testing sparql output
    expander = CurieUtil(curie_map.get())
    self.assoc_iri = URIRef(expander.get_uri(self.assoc_curie))
def parse(self, limit=None):
    """
    Parse all the raw files of this source into the graph.

    The steps run in a fixed order: work out the release number when it is
    not known yet, pick the target graph, reset the per-run lookup tables,
    run the individual file processors, then load the namespace bindings
    and the properties.

    :param limit: passed on to every file processor; None means no limit
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows of each file", limit)

    if self.version_num is None:
        import os
        logger.info("Figuring out version num for files")
        # probe the raw directory for the WSnumber on
        # the "letter.WS###" file.
        # this is the only one that we keep the version number on
        files = os.listdir(self.rawdir)
        # NOTE(review): next() raises StopIteration when no file name
        # starts with "letter", and vernum is None when the name carries
        # no WS number -- neither case is handled here.
        letter_file = next(f for f in files if re.match(r'letter', f))
        vernum = re.search(r'(WS\d+)', letter_file)
        self.update_wsnum_in_files(vernum.group(1))

    logger.info("Parsing files...")

    # testOnly forces test mode on
    if self.testOnly:
        self.testMode = True

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    self.nobnodes = True  # FIXME

    # to hold any label for a given id
    self.id_label_map = {}
    # to hold the mappings between genotype and background
    self.genotype_backgrounds = {}
    self.extrinsic_id_to_enviro_id_hash = {}
    # to hold the genes variant due to a seq alt
    self.variant_loci_genes = {}
    # to hold the parts of an environment
    self.environment_hash = {}
    self.wildtype_genotypes = []
    # stores the rnai_reagent to gene targets
    self.rnai_gene_map = {}

    self.process_gene_ids(limit)
    # disabled; TEC: input file is missing (as of 2016-Mar-03)
    # self.process_gene_desc(limit)
    self.process_allele_phenotype(limit)
    self.process_rnai_phenotypes(limit)
    self.process_pub_xrefs(limit)
    self.process_feature_loc(limit)
    self.process_disease_association(limit)
    # TODO add this when complete
    # self.process_gene_interaction(limit)

    logger.info("Finished parsing.")

    self.load_bindings()
    # properties are loaded on the graph selected above (g) only
    gu = GraphUtils(curie_map.get())
    gu.loadAllProperties(g)
    gu.loadObjectProperties(g, Genotype.object_properties)

    # the counts logged are len() of each graph
    logger.info("Found %d nodes in graph", len(self.graph))
    logger.info("Found %d nodes in testgraph", len(self.testgraph))
    return
def _get_phenotypicseries_parents(entry, g):
    """
    Extract the phenotypic series parent relationship out of the entry
    and add it to the graph.

    Series numbers are only collected when the entry has
    ``phenotypicSeriesExists`` set to True. They are read from the
    entry's own ``phenotypeMapList`` and from the ``phenotypeMapList``
    inside ``geneMap``; phenotype maps without a
    ``phenotypicSeriesNumber`` are skipped in both lists. Each series is
    added as a class and linked to the entry with ``addSubclass``.

    :param entry: dict describing one entry; must have ``mimNumber``
    :param g: graph to add to
    :return: None
    """
    omimid = 'OMIM:' + str(entry['mimNumber'])

    # the phenotypic series mappings
    serieslist = []
    if 'phenotypicSeriesExists' in entry \
            and entry['phenotypicSeriesExists'] is True:
        phenolists = []
        if 'phenotypeMapList' in entry:
            phenolists.append(entry['phenotypeMapList'])
        if 'geneMap' in entry and 'phenotypeMapList' in entry['geneMap']:
            phenolists.append(entry['geneMap']['phenotypeMapList'])
        for phenolist in phenolists:
            for p in phenolist:
                # Not every phenotype map carries a series number; guard
                # the lookup for both lists instead of only the second.
                if 'phenotypicSeriesNumber' in p['phenotypeMap']:
                    serieslist.append(
                        p['phenotypeMap']['phenotypicSeriesNumber'])

    if not serieslist:
        # nothing to add, so no graph helper is needed
        return

    gu = GraphUtils(curie_map.get())
    # add this entry as a subclass of the series entry
    for ser in serieslist:
        series_id = 'OMIM:' + ser
        gu.addClassToGraph(g, series_id, None)
        gu.addSubclass(g, series_id, omimid)
    return
def _process_phenotypicseries(self, limit):
    """
    Creates classes from the OMIM phenotypic series list.
    These are grouping classes to hook the more granular OMIM diseases.

    Everything up to and including the line that starts with
    "Phenotypic Series" is treated as header. After that every line with
    a tab-separated label and series number becomes a class
    ``OMIM:<number>`` with that label.

    :param limit: NOTE(review): accepted but not applied in this method
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    logger.info("getting phenotypic series titles")
    gu = GraphUtils(curie_map.get())
    line_counter = 0
    start = False
    series_file = '/'.join(
        (self.rawdir, self.files['phenotypicSeries']['file']))
    with open(series_file) as f:
        for line in f:
            # there's several lines of header in the file,
            # so need to skip several lines:
            if not start:
                if re.match('Phenotypic Series', line):
                    start = True
                continue

            line = line.strip()
            # Skip blank lines, including whitespace-only ones, and (as
            # before) lines made of a single word; none of them can be
            # split into a label and a number.
            if re.fullmatch(r'\w*', line):
                continue

            line_counter += 1
            (ps_label, ps_num) = line.split('\t')
            omim_id = 'OMIM:' + ps_num
            gu.addClassToGraph(g, omim_id, ps_label)
    return
def _get_gene_history(self, limit):
    """
    Walk the gzipped gene_history file and record discontinued gene ids.

    For every kept row the current gene and the discontinued gene are
    added as classes, the discontinued gene is marked as deprecated with
    the current gene as its replacement, and the discontinued symbol is
    added as a synonym of the current gene.

    Rows are dropped when either gene id is the '-' placeholder, when in
    test mode the gene is not one of ``self.gene_ids``, or when the taxon
    is not one of ``self.tax_ids``.

    :param limit: stop once more than this many rows were kept
        (ignored in test mode); None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing Gene records")
    kept = 0
    myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", myfile)

    with gzip.open(myfile, 'rb') as f:
        for raw_line in f:
            line = raw_line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            (tax_num, gene_num, discontinued_num,
             discontinued_symbol, discontinued_date) = line.split('\t')

            if '-' in (gene_num, discontinued_num):
                continue
            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            kept += 1
            gene_id = 'NCBIGene:' + gene_num
            discontinued_gene_id = 'NCBIGene:' + discontinued_num

            # add the two genes
            gu.addClassToGraph(g, gene_id, None)
            gu.addClassToGraph(
                g, discontinued_gene_id, discontinued_symbol)

            # add the new gene id to replace the old gene id
            gu.addDeprecatedClass(g, discontinued_gene_id, [gene_id])

            # also add the old symbol as a synonym of the new gene
            gu.addSynonym(g, gene_id, discontinued_symbol)

            if not self.testMode and limit is not None and kept > limit:
                break
def addRefToGraph(self, g):
    """
    Add this reference to the graph.

    The label is the short citation, or the title when there is no short
    citation. When ``ref_url`` is set, title, type and label triples are
    added directly on the URL node. Otherwise, when ``ref_id`` is set, the
    reference is added as an individual of ``ref_type`` (plus its title,
    if any). With neither identifier an error is logged.

    :param g: graph to add to
    :return: None
    """
    gu = GraphUtils(curie_map.get())

    label = self.short_citation
    if label is None:
        label = self.title

    if self.ref_url is None and self.ref_id is None:
        # should never be true
        logger.error("You are missing an identifier for a reference.")
    elif self.ref_url is not None:
        url_node = URIRef(self.ref_url)
        g.add((url_node, DC['title'], Literal(self.title)))
        g.add((url_node, RDF['type'], gu.getNode(self.ref_type)))
        g.add((url_node, RDFS['label'], Literal(label)))
    else:
        gu.addIndividualToGraph(g, self.ref_id, label, self.ref_type)
        if self.title is not None:
            gu.addTitle(g, self.ref_id, self.title)

    # TODO what is the property here to add the date?
    # if self.year is not None:
    #     gu.addTriple()

    # if self.author_list is not None:
    #     for a in self.author_list:
    #         gu.addTriple(
    #             g, self.ref_id, self.props['has_author'], a, True)
def _get_process_allelic_variants(self, entry, g):
    """
    Add the allelic variants of one entry to the graph.

    Each variant gets the id ``OMIM:<mimNumber>.<number zero-padded to 4>``.
    'live' variants are added as alleles of the entry's gene, linked to
    the publications mentioned in their text, to their dbSNP ids and
    ClinVar accessions, and given a page. Variants whose status contains
    'moved' (this covers 'removed' too) are added as deprecated
    individuals. Any other status is logged as an error.

    :param entry: dict describing one entry, or None (nothing is done)
    :param g: graph to add to
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(g)
    # NOTE(review): du is not used anywhere in this method
    du = DipperUtil()
    if entry is not None:
        # to hold the entry-specific publication mentions
        # for the allelic variants
        publist = {}
        entry_num = entry['mimNumber']

        # process the ref list just to get the pmids
        ref_to_pmid = self._get_pubs(entry, g)

        if 'allelicVariantList' in entry:
            allelicVariantList = entry['allelicVariantList']
            for al in allelicVariantList:
                al_num = al['allelicVariant']['number']
                al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
                al_label = None
                al_description = None
                if al['allelicVariant']['status'] == 'live':
                    publist[al_id] = set()
                    if 'mutations' in al['allelicVariant']:
                        al_label = al['allelicVariant']['mutations']
                    if 'text' in al['allelicVariant']:
                        al_description = al['allelicVariant']['text']
                        # collect the digits that follow an opening
                        # brace and precede a colon in the text
                        # NOTE(review): the pattern is not a raw string
                        m = re.findall('\{(\d+)\:', al_description)
                        publist[al_id] = set(m)
                    geno.addAllele(al_id, al_label, geno.genoparts['variant_locus'], al_description)
                    geno.addAlleleOfGene(al_id, 'OMIM:'+str(entry_num), geno.object_properties['is_sequence_variant_instance_of'])
                    for r in publist[al_id]:
                        # look up the pubmed id in the list of references
                        # NOTE(review): raises KeyError when the number
                        # is not a key of ref_to_pmid
                        pmid = ref_to_pmid[int(r)]
                        gu.addTriple(g, pmid, gu.object_properties['is_about'], al_id)
                    if 'dbSnps' in al['allelicVariant']:
                        # comma separated list of ids
                        dbsnp_ids = re.split(',', al['allelicVariant']['dbSnps'])
                        for dnum in dbsnp_ids:
                            did = 'dbSNP:'+dnum.strip()
                            gu.addIndividualToGraph(g, did, None)
                            gu.addEquivalentClass(g, al_id, did)
                    if 'clinvarAccessions' in al['allelicVariant']:
                        # clinvarAccessions are delimited by three
                        # semicolons, each item like RCV000020059;;1
                        rcv_ids = re.split(';;;', al['allelicVariant']['clinvarAccessions'])
                        # NOTE(review): an item that does not match the
                        # pattern makes re.match return None and the
                        # .group(1) call fail
                        rcv_ids = [(re.match('(RCV\d+)\;\;', r)).group(1) for r in rcv_ids]
                        for rnum in rcv_ids:
                            rid = 'ClinVar:'+rnum
                            gu.addXref(g, al_id, rid)
                    gu.addPage(g, al_id, "http://omim.org/entry/"+str(entry_num)+"#"+str(al_num).zfill(4))
                elif re.search('moved', al['allelicVariant']['status']):
                    # for both 'moved' and 'removed'
                    moved_ids = None
                    if 'movedTo' in al['allelicVariant']:
                        moved_id = 'OMIM:'+al['allelicVariant']['movedTo']
                        moved_ids = [moved_id]
                    gu.addDeprecatedIndividual(g, al_id, moved_ids)
                else:
                    logger.error('Uncaught alleleic variant status %s', al['allelicVariant']['status'])
            # end loop allelicVariantList
    return
def process_disease_association(self, limit):
    """
    Read the disease association file and add one disease-model
    association per kept row.

    For each row an unspecified variant of the gene is created, an
    experimental model carrying that variant is made, and the model is
    associated with the disease through the 'model_of' property. The
    reference (with ``WB_REF:`` rewritten to ``WormBase:``) is added as a
    source, and rows with evidence symbol 'IEA' get evidence
    'ECO:0000501'.

    Rows are skipped when they are header lines (starting with '!'), when
    in test mode the gene is not among the test gene ids, or when they
    are 'NOT' annotations.

    :param limit: stop once more than this many rows were read
        (ignored in test mode); None means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    gu = GraphUtils(curie_map.get())

    logger.info("Processing disease models")
    geno = Genotype(g, self.nobnodes)
    line_counter = 0
    worm_taxon = 'NCBITaxon:6239'
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1
            (db, gene_num, gene_symbol, is_not, disease_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            # WB WBGene00000001 aap-1 DOID:2583 PMID:19029536 IEA
            # ENSEMBL:ENSG00000145675|OMIM:615214 D Y110A7A.10 gene
            # taxon:6239 20150612 WB
            gene_id = 'WormBase:' + gene_num

            # make a variant of the gene
            vl = '_' + '-'.join((gene_num, 'unspecified'))
            if self.nobnodes:
                vl = ':' + vl
            vl_label = 'some variant of ' + gene_symbol
            geno.addAlleleOfGene(vl, gene_id)
            animal_id = geno.make_experimental_model_with_genotype(
                g, vl, vl_label, worm_taxon, 'worm')

            assoc = G2PAssoc(
                self.name, animal_id, disease_id,
                gu.object_properties['model_of'])
            ref = re.sub(r'WB_REF:', 'WormBase:', ref)
            if ref != '':
                assoc.add_source(ref)
            eco_id = None
            if eco_symbol == 'IEA':
                eco_id = 'ECO:0000501'  # IEA is this now
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            assoc.add_association_to_graph(g)

            # Honour the row limit the same way the other file
            # processors of this source do; previously the rows were
            # counted but the limit was never applied.
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def __init__(self, are_bnodes_skized=True):
    """
    Create the graph and remember the blank-node setting.

    :param are_bnodes_skized: stored on the instance as
        ``self.are_bnodes_skized``
    """
    super().__init__()
    self.are_bnodes_skized = are_bnodes_skized

    # The explicit OBO binding can be removed when this is resolved
    # https://github.com/RDFLib/rdflib/issues/632
    self.bind('OBO', Namespace(curie_map.get()['OBO']))
def _process_genes(self, taxid, limit=None):
    """
    Add the Ensembl genes of one taxon to the graph.

    Each row provides the Ensembl gene id, name, description, biotype and
    Entrez gene id; rows for taxon '9606' have the HGNC id as a sixth
    column. The gene is added as a class, made equivalent to its NCBI
    gene and HGNC ids when those are present, and tagged with the taxon.

    A row with too few columns is treated as a data error: it is logged
    and processing stops (the properties are not loaded in that case).

    :param taxid: taxon number as a string; also the key into self.files
    :param limit: stop once more than this many rows were kept
        (ignored in test mode); None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    line_counter = 0
    logger.info("Processing Ensembl genes for tax %s", taxid)

    # Five columns are unpacked from every row, and a sixth one is read
    # for human; checking for fewer than four let short rows through to
    # fail on the unpacking or on row[5].
    is_human = taxid == '9606'
    min_columns = 6 if is_human else 5

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < min_columns:
                logger.error("Data error for file %s", raw)
                return
            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene) = row[0:5]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col
            hgnc_id = row[5] if is_human else None

            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            if description == '':
                description = None
            gene_type_id = self._get_gene_type(gene_biotype)
            # NOTE(review): the looked-up type is discarded right away,
            # so genes are added without a type; kept as found.
            gene_type_id = None
            gu.addClassToGraph(
                g, gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                gu.addEquivalentClass(g, gene_id, 'NCBIGene:' + entrezgene)
            if hgnc_id is not None and hgnc_id != '':
                gu.addEquivalentClass(g, gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
def __init__(self, graph):
    """
    Keep a reference to the graph and load this class's object
    properties into it.

    :param graph: graph this helper writes to
    """
    helper = GraphUtils(curie_map.get())
    self.gu = helper
    self.graph = graph
    helper.loadProperties(graph, self.object_properties, helper.OBJPROP)
def __init__(self, are_bnodes_skized=True, identifier=None):
    """
    Create the graph on an 'IOMemory' store and remember the blank-node
    setting.

    :param are_bnodes_skized: stored on the instance as
        ``self.are_bnodes_skized``
    :param identifier: passed through to the parent constructor
    """
    # print("in RDFGraph with id: ", identifier)
    super().__init__('IOMemory', identifier)
    self.are_bnodes_skized = are_bnodes_skized

    # The explicit OBO binding can be removed when this is resolved
    # https://github.com/RDFLib/rdflib/issues/632
    self.bind('OBO', Namespace(curie_map.get()['OBO']))
def __init__(self, definedby):
    """
    Create an empty association.

    Subject, object, relationship, id, description and score fields
    start out as None; sources and evidence start out as empty lists.

    :param definedby: stored on the instance as ``self.definedby``
    """
    self.cu = CurieUtil(curie_map.get())
    self.gu = GraphUtils(curie_map.get())

    # core parts of the association
    self.definedby = definedby
    self.sub = None
    self.obj = None
    self.rel = None
    self.assoc_id = None

    self.description = None
    self.source = []
    self.evidence = []

    self.score = None
    self.score_type = None
    self.score_unit = None
def process_gene_desc(self, limit):
    """
    Read the gzipped gene description file and add the descriptions.

    The concise description becomes the gene's definition unless it is
    'none available'. The provisional, automated, detailed and gene class
    descriptions are each added as a description, tagged with their kind
    in square brackets, unless they equal the concise description, start
    with 'none', or are empty.

    Lines starting with '#' are skipped, and so is the first remaining
    line.

    :param limit: stop once more than this many rows were read
        (ignored in test mode); None means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())

    logger.info("Processing Gene descriptions")
    line_counter = 0
    # geno = Genotype(g)  # TODO unused
    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'\#', ''.join(row)):
                continue
            line_counter += 1
            if line_counter == 1:
                continue

            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            gene_id = 'WormBase:' + gene_num

            if concise_description != 'none available':
                gu.addDefinition(g, gene_id, concise_description)

            # remove the description if it's identical to the concise
            tagged_descriptions = (
                ('provisional', provisional_description),
                ('automated', automated_description),
                ('detailed', detailed_description),
                ('gene class', gene_class_description),
            )
            for kind, text in tagged_descriptions:
                if text == concise_description \
                        or re.match(r'none', text) or text == '':
                    continue  # don't use it
                gu.addDescription(
                    g, gene_id, ' '.join((text, '[' + kind + ']')))

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
def _get_gene2pubmed(self, limit):
    """
    Loops through the gene2pubmed file and adds a simple triple to say
    that a given publication is_about a gene.
    Publications are added as NamedIndividuals.

    Rows are dropped when the gene or publication id is the '-'
    placeholder, when in test mode the gene is not one of
    ``self.gene_ids``, or when the taxon is not one of ``self.tax_ids``.

    :param limit: stop once more than this many rows were kept
        (ignored in test mode); None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    is_about = gu.getNode(gu.object_properties['is_about'])
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
    logger.info("FILE: %s", myfile)
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if re.match('^#', line):
                continue
            (tax_num, gene_num, pubmed_num) = line.split('\t')

            # Check for the placeholder before any int() conversion, so
            # a '-' gene id cannot fail the conversion in test mode.
            if gene_num == '-' or pubmed_num == '-':
                continue

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue

            if int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            pubmed_id = ':'.join(('PMID', pubmed_num))

            # add the gene, in case it hasn't before
            gu.addClassToGraph(g, gene_id, None)
            # add the publication as a NamedIndividual
            # add type publication
            gu.addIndividualToGraph(g, pubmed_id, None, None)
            # Write to the selected graph: in test mode the triple must
            # go to the test graph, like the nodes added just above.
            g.add((gu.getNode(pubmed_id), is_about, gu.getNode(gene_id)))

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def __init__(self, id, label, type, description=None):
    """
    Store the identifying fields; start and stop are left unset (None).

    :param id: stored as ``self.id``
    :param label: stored as ``self.label``
    :param type: stored as ``self.type``
    :param description: stored as ``self.description``
    """
    self.id, self.label, self.type = id, label, type
    self.description = description

    self.gu = GraphUtils(curie_map.get())

    # no coordinates yet
    self.start = None
    self.stop = None

    self.nobnodes = True  # TODO remove this before official release
def _process_straininfo(self, limit):
    """
    Read the strain info file and add each strain to the graph.

    Every strain becomes an individual ``MPD-strain:<id>`` typed with the
    taxon 'NCBITaxon:10090', with its short name (when there is one) as a
    synonym; its name is also recorded in ``self.idlabel_hash``. When a
    stock number is given, vendor 'J' and vendor 'Rbrc' strains are made
    the same individual as the JAX / RBRC id, and other vendors get
    xrefs instead. A non-empty panel is added as a description.

    :param limit: not used in this method
    :return: None
    """
    # line_counter = 0  # TODO unused
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing measurements ...")
    raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

    tax_id = 'NCBITaxon:10090'

    gu = GraphUtils(curie_map.get())

    with open(raw, 'r') as f:
        reader = csv.reader(f, delimiter=',', quotechar='\"')
        f.readline()  # read the header row; skip
        for row in reader:
            (strain_name, vendor, stocknum, panel, mpd_strainid,
             straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
            # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html

            if self.testMode and \
                    'MPD:'+str(mpd_strainid) not in self.test_ids:
                continue

            # create the strain as an instance of the taxon
            strain_id = 'MPD-strain:'+str(mpd_strainid)
            gu.addIndividualToGraph(g, strain_id, strain_name, tax_id)

            short_name = mpdshortname.strip()
            if short_name != '':
                gu.addSynonym(g, strain_id, short_name)
            self.idlabel_hash[strain_id] = strain_name

            # make it equivalent to the vendor+stock
            if stocknum != '':
                if vendor == 'J':
                    gu.addSameIndividual(g, strain_id, 'JAX:'+stocknum)
                elif vendor == 'Rbrc':
                    # reiken
                    gu.addSameIndividual(
                        g, strain_id,
                        'RBRC:'+stocknum.replace('RBRC', ''))
                else:
                    if url != '':
                        gu.addXref(g, strain_id, url, True)
                    if vendor != '':
                        gu.addXref(
                            g, strain_id,
                            ':'.join((vendor, stocknum)), True)

            # add the panel information
            if panel != '':
                gu.addDescription(g, strain_id, panel+' [panel]')

            # TODO make the panels as a resource collection
def __init__(self, definedby):
    """
    Create an empty association.

    Subject, object, relationship, id, description and score fields
    start out as None; sources, evidence and provenance start out as
    empty lists.

    :param definedby: stored on the instance as ``self.definedby``
    """
    self.cu = CurieUtil(curie_map.get())
    self.gu = GraphUtils(curie_map.get())

    # core parts of the association
    self.definedby = definedby
    self.sub = None
    self.obj = None
    self.rel = None
    self.assoc_id = None

    self.description = None
    self.source = []
    self.evidence = []
    # this is going to be used for the refactored evidence/provenance
    self.provenance = []

    self.score = None
    self.score_type = None
    self.score_unit = None
def process_pub_xrefs(self, limit=None):
    """
    Read the publication xref file and link each WormBase paper to its
    PMID or DOI.

    Each row holds a WormBase reference id and one xref. ``pmid...``
    xrefs become ``PMID:`` journal article references and ``doi...``
    xrefs become ``DOI:`` references; the reference is added to the graph
    and made the same individual as ``WormBase:<ref id>``. Xrefs that
    contain brackets, parentheses, angle brackets or whitespace, ``cgc``
    xrefs, and any other kind are skipped.

    :param limit: stop once more than this many rows were read
        (ignored in test mode); None means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    gu = GraphUtils(curie_map.get())

    logger.info("Processing publication xrefs")
    line_counter = 0
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (wb_ref, xref) = row
            # WBPaper00000009 pmid8805<BR>
            # WBPaper00000011    doi10.1139/z78-244<BR>
            # WBPaper00000012    cgc12<BR>

            if self.testMode and wb_ref not in self.test_ids['pub']:
                continue

            ref_id = 'WormBase:' + wb_ref
            xref = re.sub(r'<BR>', '', xref)
            xref = xref.strip()
            if re.match(r'pmid', xref):
                # Strip only the leading marker; an unanchored
                # substitution would also remove later occurrences.
                xref_id = 'PMID:' + re.sub(r'^pmid\s*', '', xref)
                r = Reference(
                    xref_id, Reference.ref_types['journal_article'])
            elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                continue
            elif re.match(r'doi', xref):
                # Same here: a DOI that itself contains "doi" must keep
                # everything after the leading marker intact.
                xref_id = 'DOI:' + re.sub(r'^doi', '', xref)
                r = Reference(xref_id)
            else:
                # TODO not sure what to do here with cgc xrefs
                # logger.debug("Other xrefs like %s", xref)
                continue

            # Every branch that gets here has set both r and xref_id.
            r.addRefToGraph(g)
            gu.addSameIndividual(g, ref_id, xref_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _process_ortholog_classes(self, limit=None):
    """
    This method add the KEGG orthology classes to the graph.

    The class name column is split on ';'. The first part is the label;
    when there is more than one part, the remaining parts are added as
    synonyms and the last part is also added as the description.

    Triples created:
    <orthology_class_id> is a class
    <orthology_class_id> has label <orthology_symbols>
    <orthology_class_id> has description <orthology_description>

    :param limit: stop once more than this many rows were read
        (ignored in test mode); None means no limit
    :return: None
    """
    logger.info("Processing ortholog classes")
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (orthology_class_id, orthology_class_name) = row

            if self.testMode and orthology_class_id \
                    not in self.test_ids['ortholog_classes']:
                continue

            # FIXME: What's the proper route for this?
            # The orthology class is essentially a KEGG gene ID
            # that is species agnostic.
            # Add the ID and label as a class.
            # Would it be considered a gene as well?

            other_labels = re.split(';', orthology_class_name)
            # the first one is the label we'll use
            orthology_label = other_labels[0]

            orthology_class_id = 'KEGG-' + orthology_class_id.strip()

            orthology_type = OrthologyAssoc.terms['gene_family']
            gu.addClassToGraph(
                g, orthology_class_id, orthology_label, orthology_type)
            if len(other_labels) > 1:
                # add the rest as synonyms; the first part is already
                # the class label, so it is not repeated as a synonym
                for s in other_labels[1:]:
                    gu.addSynonym(g, orthology_class_id, s)

                # add the last one as the description
                gu.addDescription(
                    g, orthology_class_id, other_labels[-1])

            if (not self.testMode) \
                    and (limit is not None and line_counter > limit):
                break

    logger.info("Done with ortholog classes")
    return
class RDFGraph(DipperGraph, ConjunctiveGraph):
    """
    Extends RDFLibs ConjunctiveGraph
    The goal of this class is wrap the creation
    of triples and manage creation of URIRef,
    Bnodes, and literals from an input curie
    """

    curie_map = curie_map_class.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    with open(
            os.path.join(
                os.path.dirname(__file__),
                '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle)
        # inverse lookup: term id -> label
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        """
        :param are_bnodes_skized: when True, blank node ids are turned
            into IRIs (see skolemizeBlankNode) instead of rdflib BNodes
        :param identifier: passed through to ConjunctiveGraph
        """
        # print("in RDFGraph  with id: ", identifier)
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized
        # curie prefixes seen by _getnode(); bound at serialize() time
        self.prefixes = set()

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        # 2020 oct. possibly fixed
        # for pfx in ('OBO',):  # , 'ORPHA'):
        #     self.bind(pfx, Namespace(self.curie_map[pfx]))

    def _make_category_triple(
            self, subject, category, predicate=blv.terms['category']):
        """
        add a triple to capture subject or object category (in CURIE form)
        that was passed to addTriple()

        Failures are logged as warnings rather than raised.
        """
        try:
            self.add((
                self._getnode(subject),
                self._getnode(predicate),
                self._getnode(category)))
        except Exception:
            # Best effort: a bad category must not abort the ingest.
            # Exception (not a bare except) so that KeyboardInterrupt
            # and SystemExit still propagate.
            LOG.warning(
                "Problem adding triple in _makeCategoryTriple for "
                "subj: %s pred: %s obj(category): %s",
                subject, predicate, category)

    def _is_literal(self, thing):
        """
        make inference on type (literal or CURIE)

        Anything that is not a string (numbers, None, ...) cannot be a
        CURIE or an IRI and is therefore reported as a literal.

        return: logical
        """
        if not isinstance(thing, str):
            return True
        if self.curie_regexp.match(thing) is not None or \
                thing.split(':')[0].lower() in ('http', 'https', 'ftp'):
            object_is_literal = False
        else:
            object_is_literal = True

        return object_is_literal

    def addTriple(
            self, subject_id, predicate_id, obj, object_is_literal=None,
            literal_type=None, subject_category=None, object_category=None):
        """
        Add one triple, plus optional category triples, to the graph.

        :param subject_id: curie or iri of the subject
        :param predicate_id: curie or iri of the predicate
        :param obj: curie/iri of the object, or a literal value
        :param object_is_literal: True/False to force the interpretation
            of obj; None to infer it with _is_literal()
        :param literal_type: curie or iri of the literal datatype
        :param subject_category: category (curie) for the subject
        :param object_category: category (curie) for a non-literal object
        """
        if object_is_literal is None:
            object_is_literal = self._is_literal(obj)

        # add triples for subject category info
        if subject_category is not None:
            self._make_category_triple(subject_id, subject_category)

        # add triples for obj category info, if obj is not a literal
        if not object_is_literal:
            if object_category is not None:
                self._make_category_triple(obj, object_category)
        else:  # emit warning if object category is given for a literal
            if object_category is not None:
                LOG.warning(
                    "I was given a category %s for obj: %s, "
                    "which seems to be a literal!",
                    object_category, obj)

        if object_is_literal is True:
            if isinstance(obj, str):
                # reduce any ws to a space
                # (str is immutable: the result has to be assigned back)
                obj = re.sub(r'[\t\n\r\f\v]+', ' ', obj)

            if literal_type is not None and obj is not None and obj not in (
                    "", " "):
                literal_type_iri = self._getnode(literal_type)
                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                # could attempt to infer a type here but there is no use case
                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj)))
            else:
                LOG.warning(
                    "None as literal object for subj: %s and pred: %s",
                    subject_id, predicate_id)
                # get a sense of where the None is comming from
                # magic number here is "steps up the call stack"
                # TODO there may be easier/ideomatic ways to do this now
                for call in range(2, 0, -1):
                    LOG.warning(
                        '\t%sfrom: %s', '\t' * call,
                        sys._getframe(call).f_code.co_name)

        elif obj is not None and obj != '':  # object is a resource
            self.add((
                self._getnode(subject_id),
                self._getnode(predicate_id),
                self._getnode(obj)))
        else:
            LOG.warning(
                "None/empty object IRI for subj: %s and pred: %s",
                subject_id, predicate_id)

    def skolemizeBlankNode(self, curie):
        """
        Turn a blank node id into an IRI under the 'BNODE' prefix.

        :param curie: blank node id; a leading '_:' or '_' is dropped
        :return: RDFLib URIRef
        """
        stripped_id = re.sub(r'^_:|^_', '', curie, count=1)
        return URIRef(self.curie_map['BNODE'] + stripped_id)

    def _getnode(self, curie):
        """
        This is a wrapper for creating a URIRef or Bnode object
        with a given a curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode,
        otherwise it creates it with a standard URIRef.
        Alternatively, if self.are_bnodes_skized is True,
        it will skolemize the blank node

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object,
            or None when the curie prefix cannot be expanded
        """
        node = None
        if curie[0] == '_':
            if self.are_bnodes_skized:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, count=1))

        # Check if curie string is actually an IRI
        elif curie.startswith(('http', 'ftp', 'jdbc')):
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph
                prefix = curie.split(':')[0]
                self.prefixes.add(prefix)
            else:
                LOG.error("couldn't make URI for %s", curie)
                # get a sense of where the CURIE-ish? thing is comming from
                # magic number here is "steps up the call stack"
                for call in range(3, 0, -1):
                    LOG.warning(
                        '\t%sfrom: %s', '\t' * call,
                        sys._getframe(call).f_code.co_name)
        return node

    def bind_all_namespaces(self):
        """
        Results in the RDF @prefix directives for every ingest
        being added to this ingest.
        """
        for prefix in self.curie_map.keys():
            iri = self.curie_map[prefix]
            self.bind(prefix, Namespace(iri))

    # serialize() conflicts between rdflib & Graph.serialize abstractmethod
    # GraphUtils expects the former.  (too bad there is no multiple dispatch)
    # rdflib version
    def serialize(
            self, destination=None, format='turtle', base=None, encoding=None):
        """
        Bind every prefix used so far, then serialize with rdflib.

        NOTE: ``base`` and ``encoding`` are accepted for signature
        compatibility but are not forwarded to rdflib.

        :param destination: passed to ConjunctiveGraph.serialize
        :param format: rdflib serialization format name
        :return: whatever ConjunctiveGraph.serialize returns
        """
        for prefix in self.prefixes:
            mapped_iri = self.curie_map[prefix]
            self.bind(prefix, Namespace(mapped_iri))
        return ConjunctiveGraph.serialize(self, destination, format)
def _add_variant_trait_association(
        self, variant_id, mapped_trait_uri, efo_ontology, pubmed_id,
        description=None):
    """
    Associate a variant with each EFO trait it was mapped to.

    For every trait in the comma-separated ``mapped_trait_uri`` the trait
    is classed under 'DOID:4' and/or 'UPHENO:0001001' when the EFO
    ontology places it below EFO:0000408 / EFO:0000651 respectively,
    the publication is added as a reference, and a G2PAssoc
    ('contributes_to', evidence ECO:0000213) is written to the graph.

    :param variant_id: curie of the variant
    :param mapped_trait_uri: comma-separated trait IRIs; may be blank
    :param efo_ontology: graph-like object with a SPARQL ``query`` method
    :param pubmed_id: PubMed id without the 'PMID:' prefix
    :param description: optional description for the association
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    # nothing to associate when no trait was mapped
    if mapped_trait_uri.strip() == '':
        return

    # (SPARQL template, parent class given to a matching trait),
    # evaluated in this order for every trait
    ancestor_checks = (
        (""" SELECT ?trait WHERE {{ {0} rdfs:subClassOf+ EFO:0000408 . {0} rdfs:label ?trait . }} """,
         'DOID:4'),
        (""" SELECT ?trait WHERE {{ {0} rdfs:subClassOf+ EFO:0000651 . {0} rdfs:label ?trait . }} """,
         'UPHENO:0001001'),
    )

    # make associations to the EFO terms; there can be >1
    for trait in re.split(r',', mapped_trait_uri):
        trait = trait.strip()

        cu = CurieUtil(curie_map.get())
        trait_id = cu.get_curie(trait)

        for template, parent_id in ancestor_checks:
            rows = list(efo_ontology.query(template.format(trait_id)))
            if rows and re.match(r'^EFO', trait_id):
                model.addClassToGraph(trait_id, rows[0][0], parent_id)

        pubmed_curie = 'PMID:' + pubmed_id
        ref = Reference(
            g, pubmed_curie, Reference.ref_types['journal_article'])
        ref.addRefToGraph()

        assoc = G2PAssoc(
            g, self.name, variant_id, trait_id,
            model.object_properties['contributes_to'])
        assoc.add_source(pubmed_curie)

        # combinatorial evidence
        # used in automatic assertion
        assoc.add_evidence('ECO:0000213')

        if description is not None:
            assoc.set_description(description)

        # FIXME score should get added to provenance/study
        # assoc.set_score(pvalue)
        assoc.add_association_to_graph()
class StreamedGraph(DipperGraph):
    """
    Stream rdf triples to file or stdout
    Assumes a downstream process will sort then uniquify triples

    Theoretically could support both ntriple, rdfxml formats, for now
    just support nt
    """
    curie_map = curimap.get()
    curie_util = CurieUtil(curie_map)

    with open(
            os.path.join(
                os.path.dirname(__file__),
                '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle).copy()
        # inverse lookup: term id -> label
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(
            self, are_bnodes_skized=True, identifier=None, file_handle=None,
            fmt='nt'):
        """
        :param are_bnodes_skized: when True, '_:' ids are rewritten as IRIs
        :param identifier: stored on the instance, not otherwise used here
        :param file_handle: writable text handle; None prints to stdout
        :param fmt: output format name, stored on the instance
        """
        self.are_bnodes_skized = are_bnodes_skized
        self.fmt = fmt
        self.file_handle = file_handle
        self.identifier = identifier

    def addTriple(
            self, subject_id, predicate_id, obj, object_is_literal=None,
            literal_type=None):
        """
        Expand the ids and write one triple to the output stream.

        :param subject_id: curie or iri of the subject
        :param predicate_id: curie or iri of the predicate
        :param obj: curie/iri of the object, or a literal value
        :param object_is_literal: True/False to force the interpretation
            of obj; None to infer it
        :param literal_type: curie or iri of the literal datatype
        :raises TypeError: when an id cannot be processed by _getnode
        """
        # trying making infrence on type of object if none is supplied
        if object_is_literal is None:
            if not isinstance(obj, str):
                # numbers, None, ... can be neither a CURIE nor an IRI
                object_is_literal = True
            elif self.curie_regexp.match(obj) or \
                    obj.split(':')[0].lower() in ('http', 'https', 'ftp'):
                object_is_literal = False
            else:
                object_is_literal = True

        subject_iri = self._getnode(subject_id)
        predicate_iri = self._getnode(predicate_id)
        if not object_is_literal:
            obj = self._getnode(obj)

        if literal_type is not None:
            literal_type = self._getnode(literal_type)

        if obj is not None:
            self.serialize(
                subject_iri, predicate_iri, obj, object_is_literal,
                literal_type)
        else:
            LOG.warning("Null value passed as object")
        return

    def skolemizeBlankNode(self, curie):
        """
        Turn a '_:id' blank node curie into an IRI under the base IRI.

        :param curie: blank node id of the form '_:id'
        :return: str iri
        """
        # The class attribute curie_map is the mapping returned by
        # curimap.get(), which has no get_base(); ask the module itself.
        # NOTE(review): assumes the curimap module exposes get_base()
        # -- confirm.
        base_iri = curimap.get_base()
        curie_id = curie.split(':')[1]
        skolem_iri = "{0}.wellknown/genid/{1}".format(base_iri, curie_id)
        return skolem_iri

    def serialize(
            self, subject_iri, predicate_iri, obj, object_is_literal=False,
            literal_type=None):
        """
        Format one triple and write it to file_handle (or print it).

        :param subject_iri: expanded subject iri
        :param predicate_iri: expanded predicate iri
        :param obj: expanded object iri, or the literal value
        :param object_is_literal: whether obj is a literal
        :param literal_type: expanded datatype iri for the literal
        :raises TypeError: for an untyped, non-str literal whose datatype
            cannot be determined by _getLiteralXSDType
        """
        if not object_is_literal:
            triple = "<{}> <{}> <{}> .".format(
                subject_iri, predicate_iri, obj)
        elif literal_type is not None:
            triple = '<{}> <{}> {}^^<{}> .'.format(
                subject_iri, predicate_iri,
                self._quote_encode(str(obj)), literal_type)
        else:
            if isinstance(obj, str):
                triple = '<{}> <{}> {} .'.format(
                    subject_iri, predicate_iri, self._quote_encode(obj))
            else:
                lit_type = self._getLiteralXSDType(obj)
                # test the looked-up datatype, not the builtin ``type``
                if lit_type is not None:
                    triple = '<{}> <{}> "{}"^^<{}> .'.format(
                        subject_iri, predicate_iri, obj, lit_type)
                else:
                    raise TypeError(
                        "Cannot determine type of {}".format(obj))

        if self.file_handle is None:
            print(triple)
        else:
            self.file_handle.write("{}\n".format(triple))

    def _getnode(self, curie):
        """
        Returns IRI, or blank node curie/iri depending on
        self.are_bnodes_skized setting

        :param curie: str id as curie or iri
        :return: str iri (or the unchanged '_:' id when not skolemizing)
        :raises TypeError: when curie is none of '_:id', an http/ftp iri,
            or a single 'prefix:local' pair
        """
        if re.match(r'^_:', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:
                node = curie
        elif re.match(r'^http|^ftp', curie):
            node = curie
        elif len(curie.split(':')) == 2:
            node = StreamedGraph.curie_util.get_uri(curie)
        else:
            raise TypeError("Cannot process curie {}".format(curie))
        return node

    def _getLiteralXSDType(self, literal):
        """
        This could be much more nuanced, but for now
        if a literal is not a str, determine if it's
        a xsd int or double

        :param literal:
        :return: str - xsd full iri, or None for any other type
        """
        if isinstance(literal, int):
            return self._getnode("xsd:integer")
        if isinstance(literal, float):
            return self._getnode("xsd:double")
        return None

    @staticmethod
    def _quote_encode(literal):
        """
        Copy of code in rdflib here:
        https://github.com/RDFLib/rdflib/blob/776b90be/
        rdflib/plugins/serializers/nt.py#L76

        :param literal: str
        :return: the literal, escaped and wrapped in double quotes
        """
        return '"%s"' % literal.replace('\\', '\\\\')\
            .replace('\n', '\\n')\
            .replace('"', '\\"')\
            .replace('\r', '\\r')
def setUp(self):
    """
    Build the shared fixture: one 28-column input row, the curies used
    for the association, study and evidence nodes, and the association
    IRI used when checking sparql output.
    """
    self.test_util = TestUtils()
    self.assoc_curie = 'MONARCH:test_association'
    self.eco_id = 'ECO:0000015'

    # One input row; each value is annotated with its column name.
    self.test_set_1 = (
        'MGI:1920145',                # marker_accession_id
        'Setd5',                      # marker_symbol
        'WTSI',                       # phenotyping_center
        'MEFW',                       # colony_raw
        'male',                       # sex
        'heterozygote',               # zygosity
        'MGI:4432631',                # allele_accession_id
        'Setd5<tm1a(EUCOMM)Wtsi>',    # allele_symbol
        # allele_name
        'targeted mutation 1a, Wellcome Trust Sanger Institute',
        'MGI:2159965',                # strain_accession_id
        'C57BL/6N',                   # strain_name
        'MGP',                        # project_name
        # project_fullname
        'Wellcome Trust Sanger Institute Mouse Genetics Project',
        'MGP Select Pipeline',        # pipeline_name
        'MGP_001',                    # pipeline_stable_id
        'MGP_XRY_001',                # procedure_stable_id
        'X-ray',                      # procedure_name
        'IMPC_XRY_008_001',           # parameter_stable_id
        'Number of ribs right',       # parameter_name
        'MP:0005390',                 # top_level_mp_term_id
        'skeleton phenotype',         # top_level_mp_term_name
        'MP:0000480',                 # mp_term_id
        'increased rib number',       # mp_term_name
        '1.637023E-010',              # p_value
        '',                           # percentage_change
        '8.885439E-007',              # effect_size
        # statistical_method
        'Wilcoxon rank sum test with continuity correction',
        'IMPC',                       # resource_name
    )

    # Test curies that are otherwise generated within
    # _add_evidence() and _add_study_provenance();
    # these blank nodes are hardcoded as NOT Skolemized
    self.study_curie = "_:study"
    self.evidence_curie = "_:evidence"

    # IRI for testing sparql output
    self.assoc_iri = URIRef(
        CurieUtil(curie_map.get()).get_uri(self.assoc_curie))
def setUp(self):
    """Give each test a fresh graph, the curie map, and a Genotype on that graph."""
    graph = RDFGraph()
    self.graph = graph
    self.curie_map = curie_map.get()
    self.genotype = Genotype(graph)
def setUp(self):
    """Give each test a fresh graph and the curie map."""
    self.graph, self.curie_map = RDFGraph(), curie_map.get()