def _process_group_mpo_row(self, row):
    """
    Make OMIA to MP associations.

    Builds the OMIA group curie and the zero-padded MP curie from the
    row, then emits a disease-to-phenotype association between them.

    :param row: dict with 'omia_id' and 'MPO_no' keys
    :return:
    """
    group_curie = 'OMIA:' + row['omia_id']
    # MP local ids are zero-padded to seven digits
    phenotype_curie = 'MP:' + str(row['MPO_no']).zfill(7)
    association = D2PAssoc(self.graph, self.name, group_curie, phenotype_curie)
    association.add_association_to_graph()
def _process_omia_group_row(self, row):
    """
    Add an OMIA group as a class, typed by the superclass resolved from
    its group_category.  Groups resolving to embryonic lethality are also
    recorded as a disease-to-phenotype association.

    Fix: removed the dead assignment
    ``disease_id = self.globaltt['disease or disorder']`` which was
    unconditionally overwritten by the ``self.resolve(...)`` call below.

    Side effect: populates self.label_hash[omia_id].

    :param row: dict with 'omia_id', 'group_name', 'group_summary'
        and 'group_category' keys
    :return:
    """
    model = Model(self.graph)
    omia_id = 'OMIA:' + row['omia_id']
    if self.test_mode and omia_id not in self.test_ids['disease']:
        return
    group_name = row['group_name']
    group_summary = row['group_summary']
    group_category = 'group_category:' + str(row['group_category'])
    # resolve(..., False) returns its input unchanged when unmapped
    disease_id = self.resolve(group_category, False)
    # default to general disease seems the only reasonable choice
    if disease_id == 'group_category:None':
        # no category supplied at all for this group
        disease_id = self.globaltt['disease']
    elif disease_id == group_category:
        # a category was supplied but has no superclass mapping
        LOG.info(
            "No disease superclass defined for %s: %s with parent %s",
            omia_id, group_name, group_category)
        disease_id = self.globaltt['disease']
    else:
        if disease_id == self.globaltt['embryonic lethality']:
            # add this as a phenotype association
            # add embryonic onset
            assoc = D2PAssoc(self.graph, self.name, omia_id, disease_id)
            assoc.add_association_to_graph()
            # disease_id = None
        model.addClassToGraph(
            disease_id, None, class_category=blv.terms['Disease'])
    # normalize empty strings to None so no empty literals are emitted
    if group_summary == '':
        group_summary = None
    if group_name == '':
        group_name = None
    model.addClassToGraph(
        omia_id, group_name, description=group_summary, class_type=disease_id)
    self.label_hash[omia_id] = group_name
def _process_omia_group_row(self, row):
    """
    Add an OMIA group as a class, mapping its category to an ontology
    superclass when possible; otherwise default to DOID:4 (disease).
    Embryonic-lethal groups (MP:0008762) are recorded as a
    disease-to-phenotype association instead of a superclass.

    Side effect: populates self.label_hash[omia_id].

    :param row: dict of one omia_group table row
    :return:
    """
    model = Model(self.g)
    omia_id = 'OMIA:'+row['omia_id']
    if self.testMode and omia_id not in self.test_ids['disease']:
        return
    group_name = row['group_name']
    group_summary = row['group_summary']
    disease_id = self.map_omia_group_category_to_ontology_id(
        row.get('group_category'))
    if disease_id is None:
        logger.info(
            "No disease superclass defined for %s: %s",
            omia_id, group_name)
        # default to general disease FIXME this may not be desired
        disease_id = 'DOID:4'
    else:
        model.addClassToGraph(disease_id, None)
        if disease_id == 'MP:0008762':  # embryonic lethal
            # record the lethality as a phenotype association
            # with embryonic onset, not as a superclass
            assoc = D2PAssoc(self.g, self.name, omia_id, disease_id)
            assoc.add_association_to_graph()
            disease_id = None
    # collapse empty strings to None so no empty literals are emitted
    if group_summary == '':
        group_summary = None
    if group_name == '':
        group_name = None
    model.addClassToGraph(omia_id, group_name, disease_id, group_summary)
    self.label_hash[omia_id] = group_name
    return
def _process_phene_row(self, row):
    """
    Add a species-specific phene class (an OMIA group scoped to one
    NCBI taxon), attach its descriptive text fields, restrict it to the
    taxon, and add inheritance as a disease-to-disposition association.

    Side effects: populates self.id_hash['phene'], self.label_hash and,
    for characterised rows, self.stored_omia_mol_gen.

    :param row: dict of one row of the species-specific phene table
    :return:
    """
    model = Model(self.graph)
    phenotype_id = None
    sp_phene_label = row['phene_name']
    if sp_phene_label == '':
        sp_phene_label = None
    if 'omia_id' not in row:
        LOG.info("omia_id not present for %s", row['phene_id'])
        omia_id = self._make_internal_id('phene', phenotype_id)
    else:
        omia_id = 'OMIA:' + str(row['omia_id'])
    # in test mode, only keep rows whose taxon AND disease are test ids
    if self.test_mode and not (  # demorgan this
            row['gb_species_id'] in self.test_ids['taxon']
            and omia_id in self.test_ids['disease']):
        return
    # add to internal hash store for later lookup
    self.id_hash['phene'][row['phene_id']] = omia_id
    descr = row['summary']
    if descr == '':
        descr = None
    # omia label
    omia_label = self.label_hash.get(omia_id)
    # add the species-specific subclass (TODO please review this choice)
    gb_species_id = row['gb_species_id']
    if gb_species_id != '':
        sp_phene_id = '-'.join((omia_id, gb_species_id))
    else:
        LOG.error(
            "No species supplied in species-specific phene table for %s",
            omia_id)
        return
    species_id = 'NCBITaxon:' + str(gb_species_id)
    # use this instead
    species_label = self.label_hash.get('NCBITaxon:' + gb_species_id)
    if sp_phene_label is None and omia_label is not None \
            and species_label is not None:
        # fallback label of the form "<group label> in <species label>"
        sp_phene_label = ' '.join((omia_label, 'in', species_label))
    model.addClassToGraph(sp_phene_id, sp_phene_label, omia_id, descr)
    # add to internal hash store for later lookup
    # NOTE(review): overwrites the omia_id stored above for the same key
    self.id_hash['phene'][row['phene_id']] = sp_phene_id
    self.label_hash[sp_phene_id] = sp_phene_label
    # add each of the following descriptions,
    # if they are populated, with a tag at the end.
    for item in [
            'clin_feat', 'history', 'pathology', 'mol_gen', 'control']:
        if row[item] is not None and row[item] != '':
            model.addDescription(sp_phene_id, row[item] + ' [' + item + ']')
    # if row['symbol'] is not None:  # species-specific
    # CHECK ME - sometimes spaces or gene labels
    # gu.addSynonym(g, sp_phene, row['symbol'])
    model.addOWLPropertyClassRestriction(
        sp_phene_id, self.globaltt['in taxon'], species_id)
    # add inheritance as an association
    inheritance_id = None
    if row['inherit'] is not None and row['inherit'] in self.localtt:
        inheritance_id = self.resolve(row['inherit'])
    elif row['inherit'] is not None and row['inherit'] != '':
        LOG.info('Unhandled inheritance type:\t%s', row['inherit'])
    if inheritance_id is not None:
        # observable related to genetic disposition
        assoc = D2PAssoc(
            self.graph, self.name, sp_phene_id, inheritance_id,
            rel=self.globaltt['has disposition'])
        assoc.add_association_to_graph()
    if row['characterised'] == 'Yes':
        # stash molecular-genetics text for later processing
        self.stored_omia_mol_gen[omia_id] = {
            'mol_gen': row['mol_gen'],
            'map_info': row['map_info'],
            'species': row['gb_species_id']}
def process_omia_phenotypes(self, limit):
    """
    Walk every .txt file in rawdir/OMIA-disease-phenotype and emit
    disease-to-phenotype associations, with pubmed (or OMIA web page)
    provenance and free-text descriptions attached to each association.

    :param limit: unused here; rows are processed in full per file
    :return:
    """
    # process the whole directory
    # TODO get the file listing
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info(
        "Processing Monarch OMIA Animal disease-phenotype associations")
    # get file listing
    mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
    file_list = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and re.search(r'.txt$', f)]
    for f in file_list:
        logger.info("Processing %s", f)
        print(f)
        line_counter = 0
        count_missing = 0
        bad_rows = list()
        fname = '/'.join((mypath, f))
        with open(fname, 'r') as csvfile:
            filereader = csv.reader(
                csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter <= 1:
                    continue  # skip header
                if len(row) != 22:
                    logger.info(
                        "Not enough cols (%d) in %s - please fix",
                        len(row), f)
                    continue
                # fixed 22-column layout; unused fields still unpacked
                (disease_num, species_id, breed_name, variant, inheritance,
                 phenotype_id, phenotype_name, entity_id, entity_name,
                 quality_id, quality_name, related_entity_id,
                 related_entity_name, abnormal_id, abnormal_name,
                 phenotype_description, assay, frequency, pubmed_id,
                 pub_description, curator_notes, date_created) = row
                if phenotype_id == '':
                    # logger.warning('Missing phenotype in row:\n%s', row)
                    count_missing += 1
                    bad_rows.append(row)
                    continue
                # OMIA local ids are zero-padded to at least six digits
                if len(str(disease_num)) < 6:
                    disease_num = str(disease_num).zfill(6)
                disease_id = 'OMIA:'+disease_num.strip()
                species_id = species_id.strip()
                if species_id != '':
                    # species-specific disease id
                    disease_id = '-'.join((disease_id, species_id))
                assoc = D2PAssoc(g, self.name, disease_id, phenotype_id)
                if pubmed_id != '':
                    # pubmed ids may be a comma/semicolon separated list
                    for p in re.split(r'[,;]', pubmed_id):
                        pmid = 'PMID:'+p.strip()
                        assoc.add_source(pmid)
                else:
                    # no pubmed; cite the OMIA web page instead
                    assoc.add_source(
                        '/'.join(('http://omia.angis.org.au/OMIA'
                                  + disease_num.strip(),
                                  species_id.strip())))
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()
                if phenotype_description != '':
                    model.addDescription(aid, phenotype_description)
                if breed_name != '':
                    model.addDescription(
                        aid, breed_name.strip()+' [observed in]')
                if assay != '':
                    model.addDescription(aid, assay.strip()+' [assay]')
                if curator_notes != '':
                    model.addComment(aid, curator_notes.strip())
                if entity_id != '' or quality_id != '':
                    # EQ fields are not yet modeled; just log them
                    logger.info(
                        "EQ not empty for %s: %s + %s",
                        disease_id, entity_name, quality_name)
        if count_missing > 0:
            logger.warning(
                "You are missing %d/%d D2P annotations from id %s",
                count_missing, line_counter-1, f)
            # TODO PYLINT  Used builtin function 'map'.
            # Using a list comprehension can be clearer.
            logger.warning("Bad rows:\n"+"\n".join(map(str, bad_rows)))
    # finish loop through all files
    return
def process_omia_phenotypes(self, limit):
    """
    Walk every .txt file in rawdir/OMIA-disease-phenotype and emit
    disease-to-phenotype associations.  Columns are resolved by name via
    the 'omia_d2p' entry in self.files; provenance is PMIDs when present,
    otherwise the OMIA web page for the disease/species.

    :param limit: unused here; rows are processed in full per file
    :return:
    """
    # process the whole directory
    # TODO get the file listing
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    LOG.info(
        "Processing Monarch OMIA Animal disease-phenotype associations")
    src_key = 'omia_d2p'
    # get file listing
    mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
    file_list = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and re.search(r'.txt$', f)
    ]
    col = self.files[src_key]['columns']
    # reusable initial code generator
    # for c in col:
    #     print(
    #         '# '+str.lower(c.replace(" ",""))+" = row[col.index('"+c+"')].strip()")
    for filename in file_list:
        LOG.info("Processing %s", filename)
        count_missing = 0
        bad_rows = list()
        fname = '/'.join((mypath, filename))
        with open(fname, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # first row is the header; validate it against expected columns
            row = next(filereader)
            if self.check_fileheader(col, row):
                pass
            for row in filereader:
                if len(row) != len(col):
                    LOG.info(
                        "Not enough cols %d in %s - please fix",
                        len(row), filename)
                    continue
                # columns looked up by name; unused ones left commented
                disease_num = row[col.index('Disease ID')].strip()
                species_id = row[col.index('Species ID')].strip()
                breed_name = row[col.index('Breed Name')].strip()
                # variant = row[col.index('Variant')]
                # inheritance = row[col.index('Inheritance')]
                phenotype_id = row[col.index('Phenotype ID')].strip()
                # phenotype_name = row[col.index('Phenotype Name')]
                entity_id = row[col.index('Entity ID')].strip()
                entity_name = row[col.index('Entity Name')]
                quality_id = row[col.index('Quality ID')].strip()
                quality_name = row[col.index('Quality Name')]
                # related_entity_id = row[col.index('Related Entity ID')]
                # related_entity_name = row[col.index('Related Entity Name')]
                # abnormal_id = row[col.index('Abnormal ID')]
                # abnormal_name = row[col.index('Abnormal Name')]
                # phenotype_desc = row[col.index('Phenotype Desc')]
                assay = row[col.index('Assay')].strip()
                # frequency = row[col.index('Frequency')]
                pubmed_id = row[col.index('Pubmed ID')].strip()
                phenotype_description = row[col.index('Pub Desc')].strip()
                curator_notes = row[col.index('Curator Notes')].strip()
                # date_created = row[col.index('Date Created')]
                if phenotype_id == '':
                    # LOG.warning('Missing phenotype in row:\n%s', row)
                    count_missing += 1
                    bad_rows.append(row)
                    continue
                # OMIA local ids are zero-padded to at least six digits
                if len(str(disease_num)) < 6:
                    disease_num = str(disease_num).zfill(6)
                disease_id = 'OMIA:' + disease_num
                if species_id != '':
                    # species-specific disease id
                    disease_id = '-'.join((disease_id, species_id))
                assoc = D2PAssoc(graph, self.name, disease_id, phenotype_id)
                if pubmed_id != '':
                    # pubmed ids may be a comma/semicolon separated list
                    for pnum in re.split(r'[,;]', pubmed_id):
                        # strip any non-digit noise from the id
                        pnum = re.sub(r'[^0-9]', '', pnum)
                        pmid = 'PMID:' + pnum
                        assoc.add_source(pmid)
                else:
                    # no pubmed; cite the OMIA web page instead
                    assoc.add_source('/'.join(
                        (self.curie_map['OMIA'] + disease_num, species_id)))
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()
                if phenotype_description != '':
                    model.addDescription(aid, phenotype_description)
                if breed_name != '':
                    model.addDescription(aid, breed_name + ' [observed in]')
                if assay != '':
                    model.addDescription(aid, assay + ' [assay]')
                if curator_notes != '':
                    model.addComment(aid, curator_notes)
                if entity_id != '' or quality_id != '':
                    # EQ fields are not yet modeled; just log them
                    LOG.info(
                        "EQ not empty for %s: %s + %s",
                        disease_id, entity_name, quality_name)
        if count_missing > 0:
            LOG.warning(
                "We are missing %d of %d D2P annotations from id %s",
                count_missing, filereader.line_num - 1, filename)
            LOG.warning(
                "Bad rows:\n%s", '\n'.join([str(x) for x in bad_rows]))
    # finish loop through all files
    return
def process_common_disease_file(self, raw, unpadded_doids, limit=None):
    """
    Make disaese-phenotype associations.
    Some identifiers need clean up:
    * DOIDs are listed as DOID-DOID: --> DOID:
    * DOIDs may be unnecessarily zero-padded.
    these are remapped to their non-padded equivalent.

    Fix: the header check previously compared the raw header *string*
    against the expected column *list* (always unequal) and then raised
    ``ValueError(col - header)``, which is itself a TypeError.  The header
    is now split into fields before comparing, and a printable message is
    raised on mismatch.

    :param raw: path to a single-disease TSV file
    :param unpadded_doids: collection of DOID local ids known unpadded
    :param limit: optional max number of rows to process (by line number)
    :return: number of associations added (0 if skipped in test mode)
    :raises ValueError: when the file header does not match expectations
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    assoc_count = 0
    replace_id_flag = False
    col = self.small_files['columns']
    with open(raw, 'r', encoding="utf8") as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
        # split the header line into fields before comparing with col
        header = tsvfile.readline().rstrip('\n').split('\t')
        if header != col:
            LOG.error("HEADER: has changed in %s.", raw)
            raise ValueError(
                "Unexpected header in {}: expected {}, got {}".format(
                    raw, col, header))
        disease_id = None
        for row in reader:
            row = [str(x).strip() for x in row]
            did = row[col.index('Disease ID')]
            # genotype = row[col.index('Genotype')]
            phenotype_id = row[col.index('Phenotype ID')]
            age_of_onset_id = row[col.index('Age of Onset ID')]
            eid = row[col.index('Evidence ID')]
            frequency = row[col.index('Frequency')]
            negation_id = row[col.index('Negation ID')]
            description = row[col.index('Description')]
            pub_ids = row[col.index('Pub')]
            # normalize DOID-DOID: / DO- prefixes and MESH- to curies
            disease_id = re.sub(r'DO(ID)?[-\:](DOID:)?', 'DOID:', did)
            disease_id = re.sub(r'MESH-', 'MESH:', disease_id)
            if not re.search(r'(DOID\:|MESH\:\w)\d+', disease_id):
                LOG.warning("Invalid id format: %s", disease_id)
            # figure out if the doid should be unpadded,
            # then use the unpadded version instead
            if re.match(r'DOID', disease_id):
                unpadded_num = re.sub(r'DOID:', '', disease_id)
                unpadded_num = unpadded_num.lstrip('0')
                if unpadded_num in unpadded_doids:
                    fixed_id = 'DOID:' + unpadded_num
                    replace_id_flag = True
                    disease_id = fixed_id.strip()
            if self.test_mode and disease_id not in self.test_ids:
                # since these are broken up into disease-by-disease,
                # just skip the whole file
                return 0
            if negation_id != '':
                continue  # TODO add negative associations
            if disease_id != '' and phenotype_id != '':
                assoc = D2PAssoc(
                    graph, self.name, disease_id, phenotype_id.strip())
                if age_of_onset_id != '':
                    assoc.onset = age_of_onset_id
                if frequency != '':
                    assoc.frequency = frequency
                eco_id = self.localtt[eid]
                if eco_id is None:
                    # fall back to "inferred from text mining"
                    eco_id = self.localtt['ITM']
                assoc.add_evidence(eco_id)
                # TODO add sex? - not in dataset yet
                if description != '':
                    assoc.set_description(description)
                if pub_ids != '':
                    for pub in pub_ids.split(';'):
                        pub = re.sub(r' *', '', pub)
                        # fixed now but just in case
                        # there have been several malformed PMIDs curies
                        if pub[:4] != 'http' and \
                                graph.curie_regexp.fullmatch(pub) is None:
                            LOG.warning(
                                'Record %s has a malformed Pub %s', did, pub)
                            continue
                        if re.search(r'(DOID|MESH)', pub) or re.search(
                                r'Disease name contained', description):
                            # skip "pubs" that are derived from
                            # the classes themselves
                            continue
                        assoc.add_source(pub.strip())
                # TODO assigned by?
                assoc.add_association_to_graph()
                assoc_count += 1
            if not self.test_mode and limit is not None\
                    and reader.line_num > limit:
                break
    if replace_id_flag:
        LOG.info("replaced DOID with unpadded version")
        self.replaced_id_count += 1
    LOG.info(
        "Added %d associations for %s.", assoc_count, disease_id)
    return assoc_count
def _process_phenotype_hpoa(self, file_info, limit=None):
    """
    see info on format here:
    http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

    Parses phenotype.hpoa: skips the four metadata/header lines (taking
    the ingest date from the second), then emits one disease-to-phenotype
    (or disease-to-disposition) association per row, with evidence, sex
    specificity, and publication provenance.

    Fix: rows with an unknown Aspect previously logged an error and then
    fell through to ``assoc.add_evidence(...)`` with ``assoc`` unbound
    (first row) or left over from the previous row; they are now skipped.
    Also renamed the row-normalizing comprehension variable which
    shadowed the ``col`` column list.

    :param file_info: dict with at least a 'file' key (and optional 'url')
    :param limit: optional max number of rows to process (by line number)
    :return:
    """
    src_key = 'hpoa'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    raw = '/'.join((self.rawdir, file_info['file']))
    # this will cause two dates to be attached to the dataset
    # (one from the filedate, and the other from here)
    # TODO when #112 is implemented,
    # this will result in only the whole dataset being versioned
    col = self.files[src_key]['columns']
    with open(raw, 'r', encoding="utf8") as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
        row = next(reader)  # drop Description
        # second metadata line carries the date at a fixed offset
        row = str(next(reader))[9:19]
        LOG.info("Ingest from %s", row)
        date = datetime.strptime(
            row.strip(), '%Y-%m-%d').strftime("%Y-%m-%d-%H-%M")
        if file_info.get("url") is not None:
            self.dataset.set_ingest_source_file_version_date(
                file_info.get("url"), date)
        row = next(reader)  # drop tracker url
        row = next(reader)  # drop release url
        row = next(reader)  # headers
        # row[0] = row[0][1:]  # uncomment; but not allways needed ?!
        if not self.check_fileheader(col, row):
            pass
        for row in reader:
            # renamed from `col` to avoid shadowing the column list
            row = [str(cell).strip() for cell in row]
            disease_id = row[col.index('#DatabaseID')]
            # 98246 OMIM
            # 68646 ORPHA
            # 297 DECIPHER
            if self.test_mode:
                try:
                    id_list = self.test_ids
                    if id_list is None or disease_id not in id_list:
                        continue
                except AttributeError:
                    continue
            # row[col.index('DiseaseName')] unused
            if row[col.index('Qualifier')] == 'NOT':
                continue
            hpo_id = row[col.index('HPO_ID')]
            publist = row[col.index('Reference')]
            eco_id = self.resolve(row[col.index('Evidence')])
            onset = row[col.index('Onset')]
            freq = row[col.index('Frequency')]
            sex = row[col.index('Sex')].lower()
            # row[col.index('Modifier')] unused
            asp = row[col.index('Aspect')]
            # row[col.index('Biocuration')] unused
            # LOG.info(
            #     'adding <%s>-to-<%s> because <%s>', disease_id, hpo_id, eco_id)
            model.addClassToGraph(disease_id)
            model.addClassToGraph(eco_id)
            if onset is not None and onset != '':
                model.addClassToGraph(onset)
            if asp in ('P', 'M'):  # phenotype? abnormality or mortality
                model.addClassToGraph(hpo_id)
                assoc = D2PAssoc(  # default rel=self.globaltt['has phenotype']
                    graph, self.name, disease_id, hpo_id, onset, freq)
            elif asp in ('I', 'C'):  # inheritance pattern or clinical course/onset
                model.addClassToGraph(hpo_id)
                assoc = D2PAssoc(
                    graph, self.name, disease_id, hpo_id,
                    rel=self.globaltt['has disposition'])
            else:
                LOG.error(
                    "Unknown aspect : %s at line %i", asp, reader.line_num)
                # skip: no association can be built for this row
                continue
            assoc.add_evidence(eco_id)
            if sex is not None and sex != '':
                self.graph.addTriple(
                    assoc.get_association_id(),
                    self.globaltt['has_sex_specificty'],
                    self.globaltt[sex],
                    object_category=blv.terms['BiologicalSex'])
            # Publication
            # cut -f 5 phenotype.hpoa | grep ";" | tr ';' '\n' | cut -f1 -d ':' |\
            #   sort | uniq -c | sort -nr
            # 629 PMID
            # 63 OMIM
            # 42 ISBN-13
            # 36 http
            for pub in publist.split(';'):
                pub = pub.strip()
                # there have been several malformed PMIDs
                if pub[:4] != 'http' and \
                        graph.curie_regexp.fullmatch(pub) is None:
                    LOG.warning(
                        'Record %s has a malformed Reference %s',
                        disease_id, pub)
                    continue
                pubtype = None
                if pub[:5] == 'PMID:':
                    pubtype = self.globaltt['journal article']
                elif pub[:4] == 'ISBN':
                    pubtype = self.globaltt['publication']
                elif pub[:5] == 'OMIM:':
                    pub = 'http://omim.org/entry/' + pub[5:]
                    pubtype = self.globaltt['web page']
                elif pub[:9] == 'DECIPHER:':
                    pubtype = self.globaltt['web page']
                elif pub[:6] == 'ORPHA:':
                    pubtype = self.globaltt['web page']
                elif pub[:4] == 'http':
                    pubtype = self.globaltt['web page']
                else:
                    LOG.error(
                        'Unknown pub type for disease %s from "%s"',
                        disease_id, pub)
                    continue
                if pub is not None:
                    assoc.add_source(pub)
                if pubtype is not None:
                    ref = Reference(graph, pub, pubtype)
                    # ref.setTitle(''); ref.setYear()
                    ref.addRefToGraph()
            # TODO add curator
            # pprint.pprint(assoc)
            assoc.add_association_to_graph()
            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break
    return
def process_common_disease_file(self, raw, unpadded_doids, limit=None):
    """
    Make disaese-phenotype associations.
    Some identifiers need clean up:
    * DOIDs are listed as DOID-DOID: --> DOID:
    * DOIDs may be unnecessarily zero-padded.
    these are remapped to their non-padded equivalent.

    :param raw: path to a single-disease 21-column TSV file
    :param unpadded_doids: collection of DOID local ids known unpadded
    :param limit: optional max number of data rows to process
    :return: number of associations added (0 if skipped in test mode)
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    assoc_count = 0
    replace_id_flag = False
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        header = csvfile.readline()  # skip the header row
        logger.info("HEADER: %s", header)
        disease_id = None
        for row in filereader:
            if 21 == len(row):
                # fixed 21-column layout; unused fields still unpacked
                (did, dname, gid, gene_name, genotype, gene_symbols,
                 phenotype_id, phenotype_name, age_of_onset_id,
                 age_of_onset_name, eid, evidence_name, frequency, sex_id,
                 sex_name, negation_id, negation_name, description,
                 pub_ids, assigned_by,
                 date_created) = [str(col).strip() for col in row]
            else:
                logger.warning(
                    "Wrong number of columns! expected 21, got: %s in: %s",
                    len(row), raw)
                logger.warning("%s", row)
                continue
            # b/c "PMID: 17223397"
            pub_ids = re.sub(r' *', '', pub_ids)
            # normalize DOID-DOID: / DO- prefixes and MESH- to curies
            disease_id = re.sub(r'DO(ID)?[-\:](DOID:)?', 'DOID:', did)
            disease_id = re.sub(r'MESH-', 'MESH:', disease_id)
            if not re.search(r'(DOID\:|MESH\:\w)\d+', disease_id):
                logger.warning("Invalid id format: %s", disease_id)
            # figure out if the doid should be unpadded,
            # then use the unpadded version instead
            if re.match(r'DOID', disease_id):
                unpadded_num = re.sub(r'DOID:', '', disease_id)
                unpadded_num = unpadded_num.lstrip('0')
                if unpadded_num in unpadded_doids:
                    fixed_id = 'DOID:' + unpadded_num
                    replace_id_flag = True
                    disease_id = fixed_id.strip()
            if self.testMode and disease_id not in self.test_ids:
                # since these are broken up into disease-by-disease,
                # just skip the whole file
                return 0
            else:
                line_counter += 1
            if negation_id != '':
                continue  # TODO add negative associations
            if disease_id != '' and phenotype_id != '':
                assoc = D2PAssoc(g, self.name, disease_id,
                                 phenotype_id.strip())
                if age_of_onset_id != '':
                    assoc.onset = age_of_onset_id
                if frequency != '':
                    assoc.frequency = frequency
                eco_id = self._map_evidence_to_codes(eid)
                if eco_id is None:
                    # fall back to "inferred from text mining"
                    eco_id = self._map_evidence_to_codes('ITM')
                assoc.add_evidence(eco_id)
                # TODO add sex? - not in dataset yet
                if description != '':
                    assoc.set_description(description)
                if pub_ids != '':
                    for p in pub_ids.split(';'):
                        p = re.sub(r' *', '', p)
                        if re.search(r'(DOID|MESH)', p) \
                                or re.search(
                                    r'Disease name contained', description):
                            # skip "pubs" that are derived from
                            # the classes themselves
                            continue
                        assoc.add_source(p.strip())
                # TODO assigned by?
                assoc.add_association_to_graph()
                assoc_count += 1
            if not self.testMode and limit is not None\
                    and line_counter > limit:
                break
        if replace_id_flag:
            logger.info("replaced DOID with unpadded version")
            self.replaced_id_count += 1
        logger.info(
            "Added %d associations for %s.", assoc_count, disease_id)
    return assoc_count
def _process_phenotype_tab(self, raw, limit):
    """
    see info on format here:
    http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

    Emits one association per row: D2PAssoc for organ abnormality or
    mortality aspects, DispositionAssoc for inheritance and clinical
    course, with evidence and publication provenance.

    Fix: rows with an unknown aspect previously logged an error and then
    fell through to ``assoc.add_evidence(...)`` with ``assoc`` unbound
    (first row) or left over from the previous row; they are now skipped.

    :param raw: path to the phenotype annotation tab file
    :param limit: optional max number of rows to process
    :return:
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    line_counter = 0
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            row = [str(col).strip() for col in row]
            # fixed 14-column layout; several fields are unused
            (db, num, name, qual, pheno_id, publist, eco, onset, freq,
             w, asp, syn, date, curator) = row
            disease_id = db + ":" + num
            if self.testMode:
                try:
                    id_list = self.test_ids
                    if id_list is None \
                            or disease_id not in id_list:
                        continue
                except AttributeError:
                    continue
            # logger.info('adding %s', disease_id)
            model.addClassToGraph(disease_id, None)
            model.addClassToGraph(pheno_id, None)
            eco_id = self._map_evidence_to_codes(eco)
            model.addClassToGraph(eco_id, None)
            if onset is not None and onset != '':
                model.addClassToGraph(onset, None)
            # we want to do things differently depending on
            # the aspect of the annotation
            # TODO PYLINT Redefinition of assoc type from
            # dipper.models.assoc.D2PAssoc.D2PAssoc to
            # dipper.models.assoc.DispositionAssoc.DispositionAssoc
            if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                assoc = D2PAssoc(
                    g, self.name, disease_id, pheno_id, onset, freq)
            elif asp == 'I':  # inheritance patterns for the whole disease
                assoc = DispositionAssoc(g, self.name, disease_id, pheno_id)
            elif asp == 'C':  # clinical course / onset
                assoc = DispositionAssoc(g, self.name, disease_id, pheno_id)
            else:
                logger.error("I don't know what this aspect is: %s", asp)
                # skip: no association can be built for this row
                continue
            assoc.add_evidence(eco_id)
            publist = re.split(r'[,;]', publist)
            # blow these apart if there is a list of pubs
            for pub in publist:
                pub = pub.strip()
                pubtype = None
                if pub != '':
                    # if re.match(
                    #       r'http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene',
                    #       pub):
                    #     #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                    #     m = re.search(r'part\=(\w+)', pub)
                    #     pub_id = 'GeneReviews:'+m.group(1)
                    # elif re.search(
                    #       r'http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=',
                    #       pub):
                    #     m = re.search(r'Expert=(\d+)', pub)
                    #     pub_id = 'Orphanet:'+m.group(1)
                    if re.match(r'(PMID|ISBN-13|ISBN-10|ISBN|HPO)', pub):
                        if re.match(r'PMID', pub):
                            pubtype = \
                                Reference.ref_types['journal_article']
                        elif re.match(r'HPO', pub):
                            pubtype = Reference.ref_types['person']
                        else:
                            pubtype = Reference.ref_types['publication']
                        r = Reference(g, pub, pubtype)
                        r.addRefToGraph()
                    elif re.match(r'(OMIM|Orphanet|DECIPHER)', pub):
                        # make the pubs a reference to the website,
                        # instead of the curie
                        if re.match(r'OMIM', pub):
                            omimnum = re.sub(r'OMIM:', '', pub)
                            omimurl = '/'.join(('http://omim.org/entry',
                                                str(omimnum).strip()))
                            pub = omimurl
                        elif re.match(r'Orphanet:', pub):
                            orphanetnum = re.sub(r'Orphanet:', '', pub)
                            orphaneturl = \
                                ''.join((
                                    'http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=',
                                    str(orphanetnum)))
                            pub = orphaneturl
                        elif re.match(r'DECIPHER:', pub):
                            deciphernum = re.sub(r'DECIPHER:', '', pub)
                            decipherurl = '/'.join(
                                ('https://decipher.sanger.ac.uk/syndrome',
                                 deciphernum))
                            pub = decipherurl
                        pubtype = Reference.ref_types['webpage']
                    elif re.match(r'http', pub):
                        pass
                    else:
                        logger.error('Unknown pub type for %s: %s',
                                     disease_id, pub)
                        print(disease_id, 'pubs:', str(publist))
                        continue
                    if pub is not None:
                        assoc.add_source(pub)
                    # TODO add curator
            assoc.add_association_to_graph()
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
def _process_phenotype_hpoa(self, raw, limit):
    """
    see info on format here:
    http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

    Sets the dataset version from the file ctime plus the date embedded
    in the second metadata line, then emits one association per row
    (D2PAssoc with 'has phenotype' for P/M aspects, with
    'has disposition' for I/C), plus sex specificity and publication
    provenance.

    Fix: rows with an unknown Aspect previously logged an error and then
    fell through to ``assoc.add_evidence(...)`` with ``assoc`` unbound
    (first row) or left over from the previous row; they are now skipped.
    Also renamed the row-normalizing comprehension variable which
    shadowed the ``col`` column list.

    :param raw: path to the phenotype.hpoa file
    :param limit: optional max number of rows to process (by line number)
    :return:
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    filedate = datetime.utcfromtimestamp(
        os.stat(raw)[ST_CTIME]).strftime("%Y-%m-%d")
    # this will cause two dates to be attached to the dataset
    # (one from the filedate, and the other from here)
    # TODO when #112 is implemented,
    # this will result in only the whole dataset being versioned
    col = self.files['hpoa']['columns']
    with open(raw, 'r', encoding="utf8") as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
        vers = next(reader)  # drop
        # second metadata line carries the date at a fixed offset
        vers = str(next(reader))[9:19]
        print(vers)
        date = datetime.strptime(
            vers.strip(), '%Y-%m-%d').strftime("%Y-%m-%d-%H-%M")
        self.dataset.setVersion(filedate, date)
        for row in reader:
            if row[0][0] == '#' or row[0] == 'DatabaseID':  # headers
                continue
            # renamed from `col` to avoid shadowing the column list
            row = [str(cell).strip() for cell in row]
            disease_id = row[col.index('DatabaseID')]
            # 98246 OMIM
            # 68646 ORPHA
            # 297 DECIPHER
            if self.test_mode:
                try:
                    id_list = self.test_ids
                    if id_list is None or disease_id not in id_list:
                        continue
                except AttributeError:
                    continue
            pheno_id = row[col.index('HPO_ID')]
            eco_id = self.resolve(row[col.index('Evidence')])
            onset = row[col.index('Onset')]
            asp = row[col.index('Aspect')]
            freq = row[col.index('Frequency')]
            publist = row[col.index('Reference')]
            sex = row[col.index('Sex')].lower()
            # LOG.info(
            #     'adding <%s>-to-<%s> because <%s>',
            #     disease_id, pheno_id, eco_id)
            model.addClassToGraph(disease_id)
            model.addClassToGraph(pheno_id)
            model.addClassToGraph(eco_id)
            if onset is not None and onset != '':
                model.addClassToGraph(onset)
            if asp in ('P', 'M'):  # phenotype? abnormality or mortality
                assoc = D2PAssoc(  # default rel=self.globaltt['has phenotype']
                    graph, self.name, disease_id, pheno_id, onset, freq)
            elif asp in ('I', 'C'):
                # inheritance pattern or clinical course/onset
                assoc = D2PAssoc(
                    graph, self.name, disease_id, pheno_id,
                    rel=self.globaltt['has disposition'])
            else:
                LOG.error(
                    "Unknown aspect : %s at line %i", asp, reader.line_num)
                # skip: no association can be built for this row
                continue
            assoc.add_evidence(eco_id)
            if sex is not None and sex != '':
                self.graph.addTriple(
                    assoc.get_association_id(),
                    self.globaltt['has_sex_specificty'],
                    self.globaltt[sex])
            # Publication
            # cut -f 5 phenotype.hpoa | grep ";" | tr ';' '\n' | cut -f1 -d ':' |\
            #   sort | uniq -c | sort -nr
            # 629 PMID
            # 63 OMIM
            # 42 ISBN-13
            # 36 http
            for pub in publist.split(';'):
                pub = pub.strip()
                pubtype = None
                if pub[:5] == 'PMID:':
                    pubtype = self.globaltt['journal article']
                elif pub[:4] == 'ISBN':
                    pubtype = self.globaltt['publication']
                elif pub[:5] == 'OMIM:':
                    pub = 'http://omim.org/entry/' + pub[5:]
                    pubtype = self.globaltt['web page']
                elif pub[:9] == 'DECIPHER:':
                    pubtype = self.globaltt['web page']
                elif pub[:6] == 'ORPHA:':
                    pubtype = self.globaltt['web page']
                elif pub[:4] == 'http':
                    pubtype = self.globaltt['web page']
                else:
                    LOG.error(
                        'Unknown pub type for disease %s from "%s"',
                        disease_id, pub)
                    continue
                if pub is not None:
                    assoc.add_source(pub)
                if pubtype is not None:
                    ref = Reference(graph, pub, pubtype)
                    # ref.setTitle(''); ref.setYear()
                    ref.addRefToGraph()
            # TODO add curator
            # pprint.pprint(assoc)
            assoc.add_association_to_graph()
            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break
    return