def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    Ingest the Elements of Morphology table that was screen-scraped
    into DISCO.

    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions, the objective and
    subjective definitions are concatenated into a single string.

    Triples:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    :param raw: path to the tab-separated raw file
    :param limit: stop after roughly this many data rows (None = all)
    :return: None
    """
    def _with_period(text):
        # ensure a non-empty definition fragment ends with a period
        if text != '' and not re.match(r'.+\.$', text):
            return text.strip() + '.'
        return text

    model = Model(self.graph)
    with open(raw, 'r') as tsvfile:
        tsvfile.readline()  # toss the header row
        reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
        for row_num, row in enumerate(reader, 1):
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
             v_status, v_lastmodified_epoch) = row

            # note:
            # e_uid v_uuid v_last_modified terminology_category_url
            # subcategory v_uid morphology_term_num
            # terminology_category_label hp_label notes
            # are currently unused.

            # the morphology term becomes a class with label + definition
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text from both definition fields
            subjective_definition = _with_period(subjective_definition)
            objective_definition = _with_period(objective_definition)
            definition = ' '.join(
                (objective_definition, subjective_definition)).strip()
            model.addDefinition(morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction
            # do we want both images?
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            if comments != '':
                model.addComment(morphology_term_id, comments.strip())

            if synonyms != '':
                for syn in synonyms.split(';'):
                    model.addSynonym(
                        morphology_term_id, syn.strip(),
                        model.annotation_properties['hasExactSynonym'])

            # 'replaces' entries become related synonyms (; delimited)
            if replaces != '' and replaces != synonyms:
                for syn in replaces.split(';'):
                    model.addSynonym(
                        morphology_term_id, syn.strip(),
                        model.annotation_properties['hasRelatedSynonym'])

            # morphology_term_id has page morphology_term_url
            reference = Reference(self.graph)
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and row_num > limit:
                break
    return
def process_omia_phenotypes(self, limit):
    """
    Walk the OMIA-disease-phenotype raw directory and load every
    tab-delimited annotation file, creating disease-to-phenotype
    associations.

    Each row pairs an OMIA disease (optionally species-qualified) with a
    phenotype id; associations are sourced from pubmed ids when present,
    otherwise from the OMIA web page for the disease. Rows lacking a
    phenotype id are counted and reported per file.

    :param limit: unused here; kept for parser API consistency
    :return: None
    """
    # process the whole directory
    # TODO get the file listing
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    logger.info(
        "Processing Monarch OMIA Animal disease-phenotype associations")

    # get file listing
    mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
    file_list = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and re.search(r'.txt$', f)]

    for f in file_list:
        # (removed leftover debug print(f); the log line covers it)
        logger.info("Processing %s", f)
        line_counter = 0
        count_missing = 0
        bad_rows = list()
        fname = '/'.join((mypath, f))
        with open(fname, 'r') as csvfile:
            filereader = csv.reader(
                csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter <= 1:
                    continue  # skip header
                if len(row) != 22:
                    logger.info(
                        "Not enough cols (%d) in %s - please fix",
                        len(row), f)
                    continue
                (disease_num, species_id, breed_name, variant,
                 inheritance, phenotype_id, phenotype_name, entity_id,
                 entity_name, quality_id, quality_name,
                 related_entity_id, related_entity_name, abnormal_id,
                 abnormal_name, phenotype_description, assay, frequency,
                 pubmed_id, pub_description, curator_notes,
                 date_created) = row
                if phenotype_id == '':
                    # logger.warning('Missing phenotype in row:\n%s', row)
                    count_missing += 1
                    bad_rows.append(row)
                    continue
                # OMIA disease numbers are zero-padded to six digits
                if len(str(disease_num)) < 6:
                    disease_num = str(disease_num).zfill(6)
                disease_id = 'OMIA:'+disease_num.strip()
                species_id = species_id.strip()
                if species_id != '':
                    disease_id = '-'.join((disease_id, species_id))
                assoc = D2PAssoc(g, self.name, disease_id, phenotype_id)
                if pubmed_id != '':
                    for p in re.split(r'[,;]', pubmed_id):
                        pmid = 'PMID:'+p.strip()
                        assoc.add_source(pmid)
                else:
                    # no publication; cite the OMIA page itself
                    assoc.add_source(
                        '/'.join(('http://omia.angis.org.au/OMIA' +
                                  disease_num.strip(),
                                  species_id.strip())))
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()
                if phenotype_description != '':
                    model.addDescription(aid, phenotype_description)
                if breed_name != '':
                    model.addDescription(
                        aid, breed_name.strip()+' [observed in]')
                if assay != '':
                    model.addDescription(aid, assay.strip()+' [assay]')
                if curator_notes != '':
                    model.addComment(aid, curator_notes.strip())

                if entity_id != '' or quality_id != '':
                    logger.info(
                        "EQ not empty for %s: %s + %s",
                        disease_id, entity_name, quality_name)
        if count_missing > 0:
            logger.warning(
                "You are missing %d/%d D2P annotations from id %s",
                count_missing, line_counter-1, f)
            # lazy %-formatting + comprehension (resolves the old
            # PYLINT TODO about using builtin map here)
            logger.warning(
                "Bad rows:\n%s", "\n".join([str(r) for r in bad_rows]))
    # finish loop through all files
    return
def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes, comment):
    """
    Create an association between a sex-specific strain id
    and each of the phenotypes.
    Here, we create a genotype from the strain,
    and a sex-specific genotype.
    Each of those genotypes are created as anonymous nodes.

    The evidence code is hardcoded to be:
        ECO:experimental_phenotypic_evidence.

    :param g: graph to write into
    :param strain_id:
    :param sex: 'm', 'f', or other; selects the sex-qualified genotype type
    :param assay_id: added as evidence on each association
    :param phenotypes: a list of phenotypes to associate with the strain
    :param comment: free text attached to each association
    :return: None
    """
    geno = Genotype(g)
    model = Model(g)
    eco_id = "ECO:0000059"  # experimental_phenotypic_evidence

    strain_label = self.idlabel_hash.get(strain_id)
    # strain genotype
    genotype_id = '_'+'-'.join((re.sub(r':', '', strain_id), 'genotype'))
    sex_specific_genotype_id = '_'+'-'.join((re.sub(r':', '', strain_id),
                                             sex, 'genotype'))
    # BUG FIX: genotype_label used to be built unconditionally, which
    # raised TypeError ('[' + None) whenever the strain id had no entry
    # in idlabel_hash; fall back to the id, as the sex-specific label
    # already did.
    if strain_label is not None:
        genotype_label = '[' + strain_label + ']'
        sex_specific_genotype_label = strain_label + ' (' + sex + ')'
    else:
        genotype_label = '[' + strain_id + ']'
        sex_specific_genotype_label = strain_id + '(' + sex + ')'

    genotype_type = Genotype.genoparts['sex_qualified_genotype']
    if sex == 'm':
        genotype_type = Genotype.genoparts['male_genotype']
    elif sex == 'f':
        genotype_type = Genotype.genoparts['female_genotype']

    # add the genotype to strain connection
    geno.addGenotype(
        genotype_id, genotype_label,
        Genotype.genoparts['genomic_background'])
    g.addTriple(
        strain_id, Genotype.object_properties['has_genotype'], genotype_id)

    geno.addGenotype(
        sex_specific_genotype_id, sex_specific_genotype_label,
        genotype_type)

    # add the strain as the background for the genotype
    g.addTriple(
        sex_specific_genotype_id,
        Genotype.object_properties['has_sex_agnostic_genotype_part'],
        genotype_id)

    # ############# BUILD THE G2P ASSOC #############
    # TODO add more provenance info when that model is completed
    if phenotypes is not None:
        for phenotype_id in phenotypes:
            assoc = G2PAssoc(
                g, self.name, sex_specific_genotype_id, phenotype_id)
            assoc.add_evidence(assay_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()
            model.addComment(assoc_id, comment)
    return
def process_omia_phenotypes(self, limit):
    """
    Load the Monarch-curated OMIA disease-phenotype files (one
    tab-separated file per batch under rawdir/OMIA-disease-phenotype)
    and emit disease-to-phenotype associations.

    Every file must carry exactly the column header declared in
    self.files['omia_d2p']['columns']; otherwise ingest halts, because a
    silent column shift would mis-assign every field.

    :param limit: unused here; kept for parser API consistency
    :return: None
    :raises AssertionError: when a file's header differs from the
        expected column list
    """
    # process the whole directory
    # TODO get the file listing
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    LOG.info("Processing Monarch OMIA Animal disease-phenotype associations")
    src_key = 'omia_d2p'

    # get file listing
    mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
    file_list = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and re.search(r'.txt$', f)]
    col = self.files[src_key]['columns']
    # hoist name -> position lookups out of the per-row loop;
    # col.index() is a linear scan and was called ~12 times per row
    cix = {name: pos for pos, name in enumerate(col)}
    # unused columns: Variant, Inheritance, Phenotype Name,
    # Related Entity ID/Name, Abnormal ID/Name, Phenotype Desc,
    # Frequency, Date Created
    for filename in file_list:
        LOG.info("Processing %s", filename)
        count_missing = 0
        bad_rows = list()
        fname = '/'.join((mypath, filename))
        with open(fname, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            fileheader = next(filereader)
            if fileheader != col:
                LOG.error('Expected %s to have columns: %s', fname, col)
                LOG.error(
                    'But Found %s to have columns: %s', fname, fileheader)
                # typo fix: message previously read 'Incomming'
                raise AssertionError('Incoming data headers have changed.')
            for row in filereader:
                if len(row) != len(col):
                    LOG.info(
                        "Not enough cols %d in %s - please fix",
                        len(row), filename)
                    continue

                disease_num = row[cix['Disease ID']].strip()
                species_id = row[cix['Species ID']].strip()
                breed_name = row[cix['Breed Name']].strip()
                phenotype_id = row[cix['Phenotype ID']].strip()
                entity_id = row[cix['Entity ID']].strip()
                entity_name = row[cix['Entity Name']]
                quality_id = row[cix['Quality ID']].strip()
                quality_name = row[cix['Quality Name']]
                assay = row[cix['Assay']].strip()
                pubmed_id = row[cix['Pubmed ID']].strip()
                # note: the association description comes from 'Pub Desc'
                phenotype_description = row[cix['Pub Desc']].strip()
                curator_notes = row[cix['Curator Notes']].strip()

                if phenotype_id == '':
                    # LOG.warning('Missing phenotype in row:\n%s', row)
                    count_missing += 1
                    bad_rows.append(row)
                    continue
                # OMIA disease numbers are zero-padded to six digits
                if len(str(disease_num)) < 6:
                    disease_num = str(disease_num).zfill(6)
                disease_id = 'OMIA:' + disease_num
                if species_id != '':
                    disease_id = '-'.join((disease_id, species_id))
                assoc = D2PAssoc(graph, self.name, disease_id, phenotype_id)
                if pubmed_id != '':
                    for pnum in re.split(r'[,;]', pubmed_id):
                        pnum = re.sub(r'[^0-9]', '', pnum)
                        pmid = 'PMID:' + pnum
                        assoc.add_source(pmid)
                else:
                    # no publication; cite the OMIA page itself
                    assoc.add_source(
                        '/'.join((
                            self.curie_map['OMIA'] + disease_num,
                            species_id)))
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()
                if phenotype_description != '':
                    model.addDescription(aid, phenotype_description)
                if breed_name != '':
                    model.addDescription(aid, breed_name + ' [observed in]')
                if assay != '':
                    model.addDescription(aid, assay + ' [assay]')
                if curator_notes != '':
                    model.addComment(aid, curator_notes)
                if entity_id != '' or quality_id != '':
                    LOG.info(
                        "EQ not empty for %s: %s + %s",
                        disease_id, entity_name, quality_name)
        if count_missing > 0:
            LOG.warning(
                "We are missing %d of %d D2P annotations from id %s",
                count_missing, filereader.line_num-1, filename)
            LOG.warning(
                "Bad rows:\n%s", '\n'.join([str(x) for x in bad_rows]))
    # finish loop through all files
    return
class ModelTestCase(unittest.TestCase):
    """
    Unit tests for Model: label/type/category assignment on individuals,
    and comment triples (with optional subject category).
    """

    def setUp(self):
        graph = RDFGraph()
        self.model = Model(graph)
        this_curie_map = curie_map.get()
        self.cutil = CurieUtil(this_curie_map)

        # stuff to make test triples
        self.test_cat_subj_curie = "MGI:1234"
        self.test_cat_subj = self.cutil.get_uri("MGI:1234")
        self.test_cat_default_pred = self.cutil.get_uri("biolink:category")
        self.test_named_indiv = self.cutil.get_uri("owl:NamedIndividual")
        self.test_label_pred = self.cutil.get_uri("rdfs:label")
        self.test_label = "some label"

        self.test_comment_IRI = self.cutil.get_uri("dcterms:comment")
        self.test_comment = 'bonus eruptus'

    def tearDown(self):
        # bug fix: this previously nulled self.graph, an attribute setUp
        # never created; release the fixture that actually holds the graph
        self.model = None

    def test_addIndividualToGraph_assign_label(self):
        # use the shared fixture value instead of repeating the literal,
        # so the assertion below cannot drift from the input
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label)
        label_triple = list(
            self.model.graph.triples((URIRef(self.test_cat_subj),
                                      URIRef(self.test_label_pred), None)))
        self.assertEqual(len(label_triple), 1, "method didn't assign label")
        self.assertEqual(str(label_triple[0][2]), self.test_label,
                         "method didn't assign correct label")

    def test_addIndividualToGraph_assign_type_named_individual(self):
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label)
        triples = list(
            self.model.graph.triples((URIRef(self.test_cat_subj), None,
                                      URIRef(self.test_named_indiv))))
        self.assertEqual(len(triples), 1,
                         "method didn't assign type as named individual")

    def test_addIndividualToGraph_assign_category(self):
        self.model.addIndividualToGraph(
            self.test_cat_subj_curie, self.test_label,
            ind_category=blv.terms['Genotype'])
        triples = list(
            self.model.graph.triples(
                (URIRef(self.test_cat_subj),
                 URIRef(self.test_cat_default_pred), None)))
        self.assertEqual(len(triples), 1, "method didn't assign category")

    def test_add_comment(self):
        self.model.addComment(self.test_cat_subj, self.test_comment)
        triples = list(
            self.model.graph.triples(
                (URIRef(self.test_cat_subj),
                 URIRef(self.test_comment_IRI),
                 Literal(self.test_comment))))
        self.assertEqual(len(triples), 1, "method didn't assign comment")

    def test_add_comment_assign_subject_category(self):
        self.model.addComment(self.test_cat_subj, self.test_comment,
                              subject_category=blv.terms['Genotype'])
        triples = list(
            self.model.graph.triples(
                (URIRef(self.test_cat_subj),
                 URIRef(self.test_cat_default_pred), None)))
        self.assertEqual(len(triples), 1, "method didn't assign category")
def process_omia_phenotypes(self, limit):
    """
    Parse every OMIA disease-phenotype file under
    rawdir/OMIA-disease-phenotype and emit disease-to-phenotype
    associations into the (test) graph.

    Rows must have exactly 22 columns matching the file header; rows
    missing a phenotype id are tallied and reported per file. Sources
    are the listed pubmed ids when present, else the OMIA page URL.

    :param limit: unused here; kept for parser API consistency
    :return: None
    """
    # process the whole directory
    # TODO get the file listing
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    LOG.info(
        "Processing Monarch OMIA Animal disease-phenotype associations")

    # get file listing
    mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
    file_list = [
        entry for entry in listdir(mypath)
        if isfile(join(mypath, entry)) and re.search(r'.txt$', entry)
    ]

    for filename in file_list:
        LOG.info("Processing %s", filename)
        count_missing = 0
        bad_rows = []
        fname = '/'.join((mypath, filename))
        with open(fname, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = next(filereader)
            for row in filereader:
                # skip any row that mismatches either the fixed width
                # or the header width
                if not (len(row) == 22 and len(row) == len(header)):
                    LOG.info("Not enough cols %d in %s - please fix",
                             len(row), filename)
                    continue
                (disease_num, species_id, breed_name, variant,
                 inheritance, phenotype_id, phenotype_name, entity_id,
                 entity_name, quality_id, quality_name,
                 related_entity_id, related_entity_name, abnormal_id,
                 abnormal_name, phenotype_description, assay, frequency,
                 pubmed_id, pub_description, curator_notes,
                 date_created) = row

                if not phenotype_id:
                    # LOG.warning('Missing phenotype in row:\n%s', row)
                    count_missing += 1
                    bad_rows.append(row)
                    continue

                # OMIA disease numbers are zero-padded to six digits
                disease_num = str(disease_num)
                if len(disease_num) < 6:
                    disease_num = disease_num.zfill(6)
                disease_id = 'OMIA:' + disease_num.strip()
                species_id = species_id.strip()
                if species_id:
                    disease_id = '-'.join((disease_id, species_id))

                assoc = D2PAssoc(graph, self.name, disease_id, phenotype_id)
                if pubmed_id:
                    for pnum in re.split(r'[,;]', pubmed_id):
                        assoc.add_source('PMID:' + pnum.strip())
                else:
                    # no publication; cite the OMIA page itself
                    assoc.add_source('/'.join(
                        ('http://omia.angis.org.au/OMIA' +
                         disease_num.strip(),
                         species_id.strip())))
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()

                if phenotype_description:
                    model.addDescription(aid, phenotype_description)
                if breed_name:
                    model.addDescription(
                        aid, breed_name.strip() + ' [observed in]')
                if assay:
                    model.addDescription(aid, assay.strip() + ' [assay]')
                if curator_notes:
                    model.addComment(aid, curator_notes.strip())
                if entity_id or quality_id:
                    LOG.info("EQ not empty for %s: %s + %s",
                             disease_id, entity_name, quality_name)

        if count_missing > 0:
            LOG.warning(
                "We are missing %d of %d D2P annotations from id %s",
                count_missing, filereader.line_num - 1, filename)
            LOG.warning("Bad rows:\n%s",
                        '\n'.join([str(x) for x in bad_rows]))
    # finish loop through all files
    return