def addRefToGraph(self, g): gu = GraphUtils(curie_map.get()) n = self.short_citation if n is None: n = self.title if self.ref_url is not None: ref_uri = URIRef(self.ref_url) g.add((ref_uri, DC['title'], Literal(self.title))) g.add((ref_uri, RDF['type'], gu.getNode(self.ref_type))) g.add((ref_uri, RDFS['label'], Literal(n))) elif self.ref_id is not None: gu.addIndividualToGraph(g, self.ref_id, n, self.ref_type) if self.title is not None: gu.addTitle(g, self.ref_id, self.title) else: # should never be true logger.error("You are missing an identifier for a reference.") # TODO what is the property here to add the date? # if self.year is not None: # gu.addTriple() # if self.author_list is not None: # for a in self.author_list: # gu.addTriple( # g, self.ref_id, self.props['has_author'], a, True) return
def _process_collection(self, collection_id, label, page): """ This function will process the data supplied internally about the repository from Coriell. Triples: Repository a ERO:collection rdf:label Literal(label) foaf:page Literal(page) :param collection_id: :param label: :param page: :return: """ # ############# BUILD THE CELL LINE REPOSITORY ############# for g in [self.graph, self.testgraph]: # FIXME: How to devise a label for each repository? gu = GraphUtils(curie_map.get()) repo_id = 'CoriellCollection:'+collection_id repo_label = label repo_page = page gu.addIndividualToGraph( g, repo_id, repo_label, self.terms['collection']) gu.addPage(g, repo_id, repo_page) return
def _get_process_allelic_variants(self, entry, g): gu = GraphUtils(curie_map.get()) geno = Genotype(g) du = DipperUtil() if entry is not None: publist = {} # to hold the entry-specific publication mentions for the allelic variants entry_num = entry['mimNumber'] # process the ref list just to get the pmids ref_to_pmid = self._get_pubs(entry, g) if 'allelicVariantList' in entry: allelicVariantList = entry['allelicVariantList'] for al in allelicVariantList: al_num = al['allelicVariant']['number'] al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4) al_label = None al_description = None if al['allelicVariant']['status'] == 'live': publist[al_id] = set() if 'mutations' in al['allelicVariant']: al_label = al['allelicVariant']['mutations'] if 'text' in al['allelicVariant']: al_description = al['allelicVariant']['text'] m = re.findall('\{(\d+)\:', al_description) publist[al_id] = set(m) geno.addAllele(al_id, al_label, geno.genoparts['variant_locus'], al_description) geno.addAlleleOfGene(al_id, 'OMIM:'+str(entry_num), geno.object_properties['is_sequence_variant_instance_of']) for r in publist[al_id]: pmid = ref_to_pmid[int(r)] gu.addTriple(g, pmid, gu.object_properties['is_about'], al_id) # look up the pubmed id in the list of references if 'dbSnps' in al['allelicVariant']: dbsnp_ids = re.split(',', al['allelicVariant']['dbSnps']) for dnum in dbsnp_ids: did = 'dbSNP:'+dnum.strip() gu.addIndividualToGraph(g, did, None) gu.addEquivalentClass(g, al_id, did) if 'clinvarAccessions' in al['allelicVariant']: # clinvarAccessions triple semicolon delimited, each lik eRCV000020059;;1 rcv_ids = re.split(';;;', al['allelicVariant']['clinvarAccessions']) rcv_ids = [(re.match('(RCV\d+)\;\;', r)).group(1) for r in rcv_ids] for rnum in rcv_ids: rid = 'ClinVar:'+rnum gu.addXref(g, al_id, rid) gu.addPage(g, al_id, "http://omim.org/entry/"+str(entry_num)+"#"+str(al_num).zfill(4)) elif re.search('moved', al['allelicVariant']['status']): # for both 'moved' and 'removed' moved_ids = None if 'movedTo' in al['allelicVariant']: moved_id = 'OMIM:'+al['allelicVariant']['movedTo'] moved_ids = [moved_id] gu.addDeprecatedIndividual(g, al_id, moved_ids) else: logger.error('Uncaught alleleic variant status %s', al['allelicVariant']['status']) # end loop allelicVariantList return
def _get_gene2pubmed(self, limit): """ Loops through the gene2pubmed file and adds a simple triple to say that a given publication is_about a gene. Publications are added as NamedIndividuals. :param limit: :return: """ gu = GraphUtils(curie_map.get()) if self.testMode: g = self.testgraph else: g = self.graph is_about = gu.getNode(gu.object_properties['is_about']) logger.info("Processing Gene records") line_counter = 0 myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file'])) logger.info("FILE: %s", myfile) with gzip.open(myfile, 'rb') as f: for line in f: # skip comments line = line.decode().strip() if re.match('^#', line): continue (tax_num, gene_num, pubmed_num) = line.split('\t') ##### set filter=None in init if you don't want to have a filter #if self.filter is not None: # if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids)) # or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))): # continue ##### end filter if self.testMode and int(gene_num) not in self.gene_ids: continue if int(tax_num) not in self.tax_ids: continue if gene_num == '-' or pubmed_num == '-': continue line_counter += 1 gene_id = ':'.join(('NCBIGene', gene_num)) pubmed_id = ':'.join(('PMID', pubmed_num)) # add the gene, in case it hasn't before gu.addClassToGraph(g, gene_id, None) # add the publication as a NamedIndividual gu.addIndividualToGraph(g, pubmed_id, None, None) # add type publication self.graph.add((gu.getNode(pubmed_id), is_about, gu.getNode(gene_id))) if not self.testMode and limit is not None and line_counter > limit: break return
def _process_straininfo(self, limit): # line_counter = 0 # TODO unused if self.testMode: g = self.testgraph else: g = self.graph logger.info("Processing measurements ...") raw = '/'.join((self.rawdir, self.files['straininfo']['file'])) tax_id = 'NCBITaxon:10090' gu = GraphUtils(curie_map.get()) with open(raw, 'r') as f: reader = csv.reader(f, delimiter=',', quotechar='\"') f.readline() # read the header row; skip for row in reader: (strain_name, vendor, stocknum, panel, mpd_strainid, straintype, n_proj, n_snp_datasets, mpdshortname, url) = row # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html # create the strain as an instance of the taxon if self.testMode and \ 'MPD:'+str(mpd_strainid) not in self.test_ids: continue strain_id = 'MPD-strain:'+str(mpd_strainid) gu.addIndividualToGraph(g, strain_id, strain_name, tax_id) if mpdshortname.strip() != '': gu.addSynonym(g, strain_id, mpdshortname.strip()) self.idlabel_hash[strain_id] = strain_name # make it equivalent to the vendor+stock if stocknum != '': if vendor == 'J': jax_id = 'JAX:'+stocknum gu.addSameIndividual(g, strain_id, jax_id) elif vendor == 'Rbrc': # reiken reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum) gu.addSameIndividual(g, strain_id, reiken_id) else: if url != '': gu.addXref(g, strain_id, url, True) if vendor != '': gu.addXref( g, strain_id, ':'.join((vendor, stocknum)), True) # add the panel information if panel != '': desc = panel+' [panel]' gu.addDescription(g, strain_id, desc) # TODO make the panels as a resource collection return
def addRefToGraph(self, g): gu = GraphUtils(curie_map.get()) n = self.short_citation if n is None: n = self.title gu.addIndividualToGraph(g, self.ref_id, n, self.ref_type) if self.title is not None: gu.addTitle(g, self.ref_id, self.title) # todo what is the property here to add the date? #if self.year is not None: # gu.addTriple() #if self.author_list is not None: # for a in self.author_list: # gu.addTriple(g, self.ref_id, self.props['has_author'], a, True) return
class Feature(): """ Dealing with genomic features here. By default they are all faldo:Regions. We use SO for typing genomic features. At the moment, RO:has_subsequence is the default relationship between the regions, but this should be tested/verified. TODO: the graph additions are in the addXToFeature functions, but should be separated. TODO: this will need to be extended to properly deal with fuzzy positions in faldo. """ object_properties = { 'location': 'faldo:location', 'begin': 'faldo:begin', 'end': 'faldo:end', 'reference': 'faldo:reference', 'gene_product_of': 'RO:0002204', 'has_gene_product': 'RO:0002205', 'is_about': 'IAO:00000136', 'has_subsequence': 'RO:0002524', 'is_subsequence_of': 'RO:0002525', 'has_staining_intensity': 'GENO:0000207', # was GENO:0000626 (staining_intensity), # but changing to has_sequence_attribute 'upstream_of_sequence_of': 'RO:0002528', 'downstream_of_sequence_of': 'RO:0002529' } data_properties = { 'position': 'faldo:position', } annotation_properties = {} properties = object_properties.copy() properties.update(data_properties) properties.update(annotation_properties) types = { 'region': 'faldo:Region', 'Position': 'faldo:Position', # big P for Position type. little p for position property 'FuzzyPosition': 'faldo:FuzzyPosition', 'chromosome': 'SO:0000340', 'chromosome_arm': 'SO:0000105', 'chromosome_band': 'SO:0000341', 'chromosome_part': 'SO:0000830', 'long_chromosome_arm': 'GENO:0000629', 'short_chromosome_arm': 'GENO:0000628', 'chromosome_region': 'GENO:0000614', 'chromosome_subband': 'GENO:0000616', 'centromere': 'SO:0000577', 'plus_strand': 'faldo:PlusStrandPosition', 'minus_strand': 'faldo:MinusStrandPosition', 'both_strand': 'faldo:BothStrandPosition', 'score': 'SO:0001685', # FIXME - score is not a good solution, too generic 'reference_genome': 'SO:0001505', 'genome': 'SO:0001026', 'assembly_component': 'SO:0000143', 'SNP': 'SO:0000694', # the following are sequence attributes: 'band_intensity': 'GENO:0000618', 'gneg': 'GENO:0000620', 'gpos': 'GENO:0000619', 'gpos100': 'GENO:0000622', 'gpos75': 'GENO:0000623', 'gpos50': 'GENO:0000624', 'gpos25': 'GENO:0000625', 'gvar': 'GENO:0000621', 'gpos33': 'GENO:0000633', 'gpos66': 'GENO:0000632' } def __init__(self, id, label, type, description=None): self.id = id self.label = label self.type = type self.description = description self.gu = GraphUtils(curie_map.get()) self.start = None self.stop = None self.nobnodes = True # TODO remove this before official release return def addFeatureStartLocation( self, coordinate, reference_id, strand=None, position_types=None): """ Adds coordinate details for the start of this feature. :param coordinate: :param reference_id: :param strand: :param position_types: :return: """ # make an object for the start, which has: # {coordinate : integer, reference : reference_id, types = []} self.start = self._getLocation(coordinate, reference_id, strand, position_types) return def addFeatureEndLocation( self, coordinate, reference_id, strand=None, position_types=None): """ Adds the coordinate details for the end of this feature :param coordinate: :param reference_id: :param strand: :return: """ self.stop = self._getLocation(coordinate, reference_id, strand, position_types) return def _getLocation(self, coordinate, reference_id, strand, position_types): """ Make an object for the location, which has: {coordinate : integer, reference : reference_id, types = []} where the strand is indicated in the type array :param coordinate: :param reference_id: :param strand: :param position_types: :return: """ loc = {} loc['coordinate'] = coordinate loc['reference'] = reference_id loc['type'] = [] strand_id = self._getStrandType(strand) if strand_id is not None: loc['type'].append(strand_id) if position_types is not None: loc['type'] += position_types if position_types == []: loc['type'].append(self.types['Position']) return loc def _getStrandType(self, strand): """ :param strand: :return: """ # TODO make this a dictionary/enum: PLUS, MINUS, BOTH, UNKNOWN strand_id = None if strand == '+': strand_id = self.types['plus_strand'] elif strand == '-': strand_id = self.types['minus_strand'] elif strand == '.': strand_id = self.types['both_strand'] elif strand is None: # assume this is Unknown pass else: logger.warning("strand type could not be mapped: %s", str(strand)) return strand_id def addFeatureToGraph( self, graph, add_region=True, region_id=None, feature_as_class=False): """ We make the assumption here that all features are instances. The features are located on a region, which begins and ends with faldo:Position The feature locations leverage the Faldo model, which has a general structure like: Triples: feature_id a feature_type (individual) faldo:location region_id region_id a faldo:region faldo:begin start_position faldo:end end_position start_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position) faldo:position Integer(numeric position) faldo:reference reference_id end_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position) faldo:position Integer(numeric position) faldo:reference reference_id :param graph: :return: """ if feature_as_class: self.gu.addClassToGraph(graph, self.id, self.label, self.type, self.description) else: self.gu.addIndividualToGraph(graph, self.id, self.label, self.type, self.description) if self.start is None and self.stop is None: add_region = False if add_region: # create a region that has the begin/end positions regionchr = re.sub(r'\w+\:_?', '', self.start['reference']) if region_id is None: # in case the values are undefined # if we know only one of the coordinates, # then we'll add an "unknown" other. st = sp = 'UN' strand = None if self.start is not None and \ self.start['coordinate'] is not None: st = str(self.start['coordinate']) strand = self._getStrandStringFromPositionTypes( self.start['type']) if self.stop is not None and\ self.stop['coordinate'] is not None: sp = str(self.stop['coordinate']) if strand is not None: strand = self._getStrandStringFromPositionTypes( self.stop['type']) # assume that the strand is the same for both start and stop. # this will need to be fixed in the future region_items = [regionchr, st, sp] if strand is not None: region_items += [strand] region_id = '-'.join(region_items) rid = region_id rid = re.sub(r'\w+\:', '', rid, 1) # replace the id prefix rid = '_'+rid+"-Region" region_id = rid if self.nobnodes: region_id = ':'+region_id self.gu.addTriple(graph, self.id, self.properties['location'], region_id) self.gu.addIndividualToGraph( graph, region_id, None, 'faldo:Region') else: region_id = self.id self.gu.addType(graph, region_id, 'faldo:Region') # add the start/end positions to the region beginp = endp = None if self.start is not None: beginp = self._makePositionId(self.start['reference'], self.start['coordinate'], self.start['type']) self.addPositionToGraph(graph, self.start['reference'], self.start['coordinate'], self.start['type']) if self.stop is not None: endp = self._makePositionId(self.stop['reference'], self.stop['coordinate'], self.stop['type']) self.addPositionToGraph(graph, self.stop['reference'], self.stop['coordinate'], self.stop['type']) self.addRegionPositionToGraph(graph, region_id, beginp, endp) # {coordinate : integer, reference : reference_id, types = []} return def _getStrandStringFromPositionTypes(self, tylist): strand = None if self.types['plus_strand'] in tylist: strand = 'plus' elif self.types['minus_strand'] in tylist: strand = 'minus' elif self.types['both_strand'] in tylist: strand = 'both' else: strand = None # it is stranded, but we don't know what it is return strand def _makePositionId(self, reference, coordinate, types=None): """ Note that positions should have a reference (we will enforce). Only exact positions need a coordinate. :param reference: :param coordinate: :param types: :return: """ if reference is None: logger.error("Trying to make position with no reference.") return None i = '_' if self.nobnodes: i = ':'+i reference = re.sub(r'\w+\:', '', reference, 1) if re.match(r'^_', reference): # this is in the case if the reference is a bnode reference = re.sub(r'^_', '', reference) i += reference if coordinate is not None: # just in case it isn't a string already i = '-'.join((i, str(coordinate))) if types is not None: tstring = self._getStrandStringFromPositionTypes(types) if tstring is not None: i = '-'.join((i, tstring)) return i def addRegionPositionToGraph( self, graph, region_id, begin_position_id, end_position_id): if begin_position_id is None: pass # logger.warn( # "No begin position specified for region %s", region_id) else: self.gu.addTriple(graph, region_id, self.properties['begin'], begin_position_id) if end_position_id is None: pass # logger.warn("No end position specified for region %s", region_id) else: self.gu.addTriple(graph, region_id, self.properties['end'], end_position_id) return def addPositionToGraph( self, graph, reference_id, position, position_types=None, strand=None): """ Add the positional information to the graph, following the faldo model. We assume that if the strand is None, we give it a generic "Position" only. Triples: my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position) faldo:position Integer(numeric position) faldo:reference reference_id :param graph: :param reference_id: :param position: :param position_types: :param strand: :return: Identifier of the position created """ iid = self._makePositionId(reference_id, position, position_types) n = self.gu.getNode(iid) pos = self.gu.getNode(self.properties['position']) ref = self.gu.getNode(self.properties['reference']) if position is not None: graph.add((n, pos, Literal(position, datatype=XSD['integer']))) graph.add((n, ref, self.gu.getNode(reference_id))) if position_types is not None: for t in position_types: graph.add((n, RDF['type'], self.gu.getNode(t))) s = None if strand is not None: s = strand if not re.match(r'faldo', strand): # not already mapped to faldo, so expect we need to map it s = self._getStrandType(strand) # else: # s = self.types['both_strand'] if s is None and (position_types is None or position_types == []): s = self.types['Position'] if s is not None: graph.add((n, RDF['type'], self.gu.getNode(s))) return iid def addSubsequenceOfFeature(self, graph, parentid): """ This will add reciprocal triples like: feature is_subsequence_of parent parent has_subsequence feature :param graph: :param parentid: :return: """ self.gu.addTriple( graph, self.id, self.properties['is_subsequence_of'], parentid) self.gu.addTriple( graph, parentid, self.properties['has_subsequence'], self.id) return def addTaxonToFeature(self, graph, taxonid): """ Given the taxon id, this will add the following triple: feature in_taxon taxonid :param graph: :param taxonid: :return: """ # TEC: should taxon be set in __init__()? self.taxon = taxonid self.gu.addTriple( graph, self.id, Assoc.properties['in_taxon'], self.taxon) return def loadAllProperties(self, graph): prop_dict = { Assoc(None).ANNOTPROP: self.annotation_properties, Assoc(None).OBJECTPROP: self.object_properties, Assoc(None).DATAPROP: self.data_properties } for p in prop_dict: self.gu.loadProperties(graph, prop_dict.get(p), p) return def addFeatureProperty(self, graph, property_type, property): self.gu.addTriple(graph, self.id, property_type, property) return def setNoBNodes(self, nobnodes): self.nobnodes = nobnodes return
class Genotype(): """ These methods provide convenient methods to add items related to a genotype and it's parts to a supplied graph. They follow the patterns set out in GENO https://github.com/monarch-initiative/GENO-ontology. For specific sequence features, we use the GenomicFeature class to create them. """ # special genotype parts mapped to their GENO and SO classes that we explicitly reference here genoparts = { 'intrinsic_genotype': 'GENO:0000000', 'extrinsic_genotype': 'GENO:0000524', 'effective_genotype': 'GENO:0000525', 'genomic_background': 'GENO:0000611', 'genomic_variation_complement': 'GENO:0000009', 'karyotype_variation_complement': 'GENO:0000644', 'variant_single_locus_complement': 'GENO:0000030', 'variant_locus': 'GENO:0000002', 'reference_locus': 'GENO:0000036', 'allele': 'GENO:0000008', 'gene': 'SO:0000704', 'QTL': 'SO:0000771', 'transgene': 'SO:0000902', 'pseudogene': 'SO:0000336', 'cytogenetic marker': 'SO:0000341', 'sequence_feature': 'SO:0000110', 'sequence_alteration': 'SO:0001059', 'insertion': 'SO:0000667', 'deletion': 'SO:0000159', 'substitution': 'SO:1000002', 'duplication': 'SO:1000035', 'translocation': 'SO:0000199', 'inversion': 'SO:1000036', 'tandem_duplication': 'SO:1000173', 'point_mutation': 'SO:1000008', 'population': 'PCO:0000001', # population 'family': 'PCO:0000020', # family 'wildtype': 'GENO:0000511', 'reagent_targeted_gene': 'GENO:0000504', 'targeted_gene_subregion' : 'GENO:0000534', 'targeted_gene_complement' : 'GENO:0000527', 'biological_region' : 'SO:0001411', 'missense_variant': 'SO:0001583', 'transcript': 'SO:0000233', 'polypeptide': 'SO:0000104', 'cDNA': 'SO:0000756', 'sequence_variant_causing_loss_of_function_of_polypeptide': 'SO:1000118', 'sequence_variant_causing_gain_of_function_of_polypeptide': 'SO:1000125', 'sequence_variant_causing_inactive_catalytic_site': 'SO:1000120', 'sequence_variant_affecting_polypeptide_function': 'SO:1000117', 'regulatory_transgene_feature': 'GENO:0000638', 'coding_transgene_feature': 'GENO:0000637', 'protein_coding_gene': 'SO:0001217', 'ncRNA_gene': 'SO:0001263' } object_properties = { 'is_mutant_of': 'GENO:0000440', 'derives_from': 'RO:0001000', 'has_alternate_part': 'GENO:0000382', 'has_reference_part': 'GENO:0000385', 'in_taxon': 'RO:0002162', 'has_zygosity': 'GENO:0000608', 'is_sequence_variant_instance_of': 'GENO:0000408', # links a alternate locus (instance) to a gene (class) 'targets_instance_of': 'GENO:0000414', 'is_reference_instance_of': 'GENO:0000610', 'has_part': 'BFO:0000051', 'has_member_with_allelotype': 'GENO:0000225', # use this when relating populations 'is_allelotype_of': 'GENO:0000206', 'has_genotype': 'GENO:0000222', 'has_phenotype': 'RO:0002200', 'transcribed_to': 'RO:0002205', 'translates_to': 'RO:0002513', 'is_targeted_expression_variant_of' : 'GENO:0000443', 'is_transgene_variant_of': 'GENO:0000444', 'has_expression-variant_part' : 'GENO:0000532', 'targeted_by' : 'GENO:0000634', # between a (reagent-targeted gene) and a morpholino 'derives_sequence_from_gene': 'GENO:0000639', # FIXME should this just be subsequence of? 'feature_to_gene_relation': 'GENO:0000418' } annotation_properties = { # TODO change properties with https://github.com/monarch-initiative/GENO-ontology/issues/21 'reference_nucleotide': 'GENO:reference_nucleotide', # Made up term 'reference_amino_acid': 'GENO:reference_amino_acid', # Made up term 'altered_nucleotide': 'GENO:altered_nucleotide', # Made up term 'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change' # Made up term } zygosity = { 'homoplasmic': 'GENO:0000602', 'heterozygous': 'GENO:0000135', 'indeterminate': 'GENO:0000137', 'heteroplasmic': 'GENO:0000603', 'hemizygous-y': 'GENO:0000604', 'hemizygous-x': 'GENO:0000605', 'homozygous': 'GENO:0000136', 'hemizygous': 'GENO:0000606', 'complex_heterozygous': 'GENO:0000402', 'simple_heterozygous': 'GENO:0000458' } properties = object_properties.copy() properties.update(annotation_properties) def __init__(self, graph): self.gu = GraphUtils(curie_map.get()) self.graph = graph self.gu.loadProperties(self.graph, self.object_properties, self.gu.OBJPROP) return def addGenotype(self, genotype_id, genotype_label, genotype_type=None, genotype_description=None): """ If a genotype_type is not supplied, we will default to 'intrinsic_genotype' :param genotype_id: :param genotype_label: :param genotype_type: :param genotype_description: :return: """ if genotype_type is None: genotype_type = self.genoparts['intrinsic_genotype'] self.gu.addIndividualToGraph(self.graph, genotype_id, genotype_label, genotype_type, genotype_description) return def addAllele(self, allele_id, allele_label, allele_type=None, allele_description=None): """ Make an allele object. If no allele_type is added, it will default to a geno:allele :param allele_id: curie for allele (required) :param allele_label: label for allele (required) :param allele_type: id for an allele type (optional, recommended SO or GENO class) :param allele_description: a free-text description of the allele :return: """ # TODO should we accept a list of allele types? if (allele_type is None): allele_type = self.genoparts['allele'] #TODO is this a good idea? self.gu.addIndividualToGraph(self.graph, allele_id, allele_label, allele_type, allele_description) return def addGene(self, gene_id, gene_label, gene_type=None, gene_description=None): if gene_type is None: gene_type = self.genoparts['gene'] # genes are classes self.gu.addClassToGraph(self.graph, gene_id, gene_label, gene_type, gene_description) return def addConstruct(self, construct_id, construct_label, construct_type=None, construct_description=None): # TODO add base type for construct # if (constrcut_type is None): # constrcut_type=self.construct_base_type self.gu.addIndividualToGraph(self.graph, construct_id, construct_label, construct_type, construct_description) return def addDerivesFrom(self, child_id, parent_id): """ We add a derives_from relationship between the child and parent id. Examples of uses include between: an allele and a construct or strain here, a cell line and it's parent genotype. Adding the parent and child to the graph should happen outside of this function call to ensure graph integrity. :param child_id: :param parent_id: :return: """ self.gu.addTriple(self.graph, child_id, self.properties['derives_from'], parent_id) return def addSequenceDerivesFrom(self, child_id, parent_id): self.gu.addTriple(self.graph, child_id, self.properties['derives_sequence_from_gene'], parent_id) return def addAlleleOfGene(self, allele_id, gene_id, rel_id=None): """ We make the assumption here that if the relationship is not provided, it is a GENO:is_sequence_variant_instance_of. Here, the allele should be a variant_locus, not a sequence alteration. :param allele_id: :param gene_id: :param rel_id: :return: """ if (rel_id is None): rel_id = self.properties['is_sequence_variant_instance_of'] self.gu.addTriple(self.graph, allele_id, rel_id, gene_id) return def addTranscript(self, variant_id, transcript_id, transcript_label=None, transcript_type=None): """ Add gene/variant/allele transcribes_to relationship :param variant_id: :param transcript_id: :param transcript_label: :param transcript_type: :return: """ self.gu.addIndividualToGraph(self.graph, transcript_id, transcript_label, transcript_type) self.gu.addTriple(self.graph, variant_id, self.properties['transcribed_to'], transcript_id) return def addPolypeptide(self, polypeptide_id, polypeptide_label=None, transcript_id=None, polypeptide_type=None, ): """ :param polypeptide_id: :param polypeptide_label: :param polypeptide_type: :param transcript_id: :return: """ if polypeptide_type is None: polypeptide_type = self.genoparts['polypeptide'] self.gu.addIndividualToGraph(self.graph, polypeptide_id, polypeptide_label, polypeptide_type) if transcript_id is not None: self.gu.addTriple(self.graph, transcript_id, self.properties['translates_to'], polypeptide_id) return def addPartsToVSLC(self, vslc_id, allele1_id, allele2_id, zygosity_id=None, allele1_rel=None, allele2_rel=None): """ Here we add the parts to the VSLC. While traditionally alleles (reference or variant loci) are traditionally added, you can add any node (such as sequence_alterations for unlocated variations) to a vslc if they are known to be paired. However, if a sequence_alteration's loci is unknown, it probably should be added directly to the GVC. :param vslc_id: :param allele1_id: :param allele2_id: :param zygosity_id: :param allele1_rel: :param allele2_rel: :return: """ # vslc has parts allele1/allele2 gu = self.gu vslc = gu.getNode(vslc_id) if allele1_id is not None: self.addParts(allele1_id, vslc_id, allele1_rel) if allele2_id is not None and allele2_id.strip() != '': self.addParts(allele2_id, vslc_id, allele2_rel) # figure out zygosity if it's not supplied if zygosity_id is None: if allele1_id == allele2_id: zygosity_id = self.zygosity['homozygous'] else: zygosity_id = self.zygosity['heterozygous'] if zygosity_id is not None: gu.addTriple(self.graph, vslc_id, self.properties['has_zygosity'], zygosity_id) return def addVSLCtoParent(self, vslc_id, parent_id): """ The VSLC can either be added to a genotype or to a GVC. The vslc is added as a part of the parent. :param vslc_id: :param parent_id: :return: """ self.addParts(vslc_id, parent_id, self.properties['has_alternate_part']) return def addParts(self, part_id, parent_id, part_relationship=None): """ This will add a has_part (or subproperty) relationship between a parent_id and the supplied part. By default the relationship will be BFO:has_part, but any relationship could be given here. :param part_id: :param parent_id: :param part_relationship: :return: """ if part_relationship is None: part_relationship = self.properties['has_part'] self.gu.addTriple(self.graph, parent_id, part_relationship, part_id) return def addSequenceAlteration(self, sa_id, sa_label, sa_type=None, sa_description=None): if sa_type is None: sa_type = self.genoparts['sequence_alteration'] self.gu.addIndividualToGraph(self.graph, sa_id, sa_label, sa_type, sa_description) return def addSequenceAlterationToVariantLocus(self, sa_id, vl_id): self.addParts(sa_id, vl_id, self.properties['has_alternate_part']) return def addGenomicBackground(self, background_id, background_label, background_type=None, background_description=None): if background_type is None: background_type = self.genoparts['genomic_background'] self.gu.addIndividualToGraph(self.graph, background_id, background_label, background_type, background_description) return def addGenomicBackgroundToGenotype(self, background_id, genotype_id): self.gu.addType(self.graph, background_id, self.genoparts['genomic_background']) self.addParts(background_id, genotype_id, self.object_properties['has_reference_part']) return def addTaxon(self, taxon_id, genopart_id): """ The supplied geno part will have the specified taxon added with RO:in_taxon relation. Generally the taxon is associated with a genomic_background, but could be added to any genotype part (including a gene, regulatory element, or sequence alteration). :param taxon_id: :param genopart_id: :return: """ in_taxon = self.gu.getNode(self.properties['in_taxon']) s = self.gu.getNode(genopart_id) self.graph.add((s, in_taxon, self.gu.getNode(taxon_id))) return def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id): # for example, add a morphant reagent thingy to the genotype, assuming it's a extrinsic_genotype p = self.object_properties['has_expression-variant_part'] self.gu.addTriple(self.graph, genotype_id, p, reagent_id) return def addGeneTargetingReagent(self, reagent_id, reagent_label, reagent_type, gene_id, description=None): """ Here, a gene-targeting reagent is added. The actual targets of this reagent should be added separately. :param reagent_id: :param reagent_label: :param reagent_type: :return: """ # TODO add default type to reagent_type self.gu.addIndividualToGraph(self.graph, reagent_id, reagent_label, reagent_type, description) self.gu.addTriple(self.graph, reagent_id, self.object_properties['targets_instance_of'], gene_id) return def addReagentTargetedGene(self, reagent_id, gene_id, targeted_gene_id=None, targeted_gene_label=None, description=None): """ This will create the instance of a gene that is targeted by a molecular reagent (such as a morpholino or rnai). If an instance id is not supplied, we will create it as an anonymous individual which is of the type GENO:reagent_targeted_gene. We will also add the targets relationship between the reagent and gene class. <targeted_gene_id> a GENO:reagent_targeted_gene rdf:label targeted_gene_label dc:description description <reagent_id> GENO:targets_instance_of <gene_id> :param reagent_id: :param gene_id: :param targeted_gene_id: :return: """ # akin to a variant locus if (targeted_gene_id is None): targeted_gene_id = '_' + gene_id + '-' + reagent_id self.gu.addIndividualToGraph(self.graph, targeted_gene_id, targeted_gene_label, self.genoparts['reagent_targeted_gene'], description) self.gu.addTriple(self.graph, targeted_gene_id, self.object_properties['is_targeted_expression_variant_of'], gene_id) self.gu.addTriple(self.graph, targeted_gene_id, self.object_properties['targeted_by'], reagent_id) return def addTargetedGeneSubregion(self, tgs_id, tgs_label, tgs_type=None, tgs_description=None): if tgs_type is None: tgs_type = self.genoparts['targeted_gene_subregion'] self.gu.addIndividualToGraph(self.graph, tgs_id, tgs_label, tgs_type, tgs_description) def addMemberOfPopulation(self, member_id, population_id): self.gu.addTriple(self.graph, population_id, self.properties['has_member_with_allelotype'], member_id) return def addTargetedGeneComplement(self, tgc_id, tgc_label, tgc_type=None, tgc_description=None): if tgc_type is None: tgc_type = self.genoparts['targeted_gene_complement'] self.gu.addIndividualToGraph(self.graph, tgc_id, tgc_label, tgc_type, tgc_description) return def addGenome(self, taxon_id, taxon_label=None): if taxon_label is None: taxon_label = taxon_id genome_label = taxon_label+' genome' genome_id = self.makeGenomeID(taxon_id) self.gu.addClassToGraph(self.graph, genome_id, genome_label, Feature.types['genome']) return def addReferenceGenome(self, build_id, build_label, taxon_id): genome_id = self.makeGenomeID(taxon_id) self.gu.addIndividualToGraph(self.graph, build_id, build_label, Feature.types['reference_genome']) self.gu.addType(self.graph, build_id, genome_id) self.addTaxon(taxon_id, build_id) return def makeGenomeID(self, taxon_id): # scrub off the taxon prefix. put it in base space genome_id = re.sub('.*\:', ':', taxon_id) + 'genome' return genome_id def addChromosome(self, chr, tax_id, tax_label=None, build_id=None, build_label=None): # if it's just the chromosome, add it as an instance of a SO:chromosome, and add it to the genome. # if a build is included, punn the chromosome as a subclass of SO:chromsome, and # make the build-specific chromosome an instance of the supplied chr. The chr then becomes part of the # build or genome. # first, make the chromosome class, at the taxon level chr_id = makeChromID(str(chr), tax_id) if tax_label is not None: chr_label = makeChromLabel(chr, tax_label) else: chr_label = makeChromLabel(chr) genome_id = self.makeGenomeID(tax_id) self.gu.addClassToGraph(self.graph, chr_id, chr_label, Feature.types['chromosome']) self.addTaxon(tax_id, genome_id) # add the taxon to the genome if build_id is not None: chrinbuild_id = makeChromID(chr, build_id) # the build-specific chromosome if build_label is None: build_label = build_id chrinbuild_label = makeChromLabel(chr, build_label) # add the build-specific chromosome as an instance of the chr class self.gu.addIndividualToGraph(self.graph, chrinbuild_id, chrinbuild_label, chr_id) # add the build-specific chromosome as a member of the build (both ways) self.gu.addMember(self.graph, build_id, chrinbuild_id) self.gu.addMemberOf(self.graph, chrinbuild_id, build_id) return def addChromosomeClass(self, chrom_num, taxon_id, taxon_label): taxon = re.sub('NCBITaxon:', '', taxon_id) chrom_class_id = makeChromID(chrom_num, taxon, 'CHR') # the chrom class (generic) id chrom_class_label = makeChromLabel(chrom_num, taxon_label) self.gu.addClassToGraph(self.graph, chrom_class_id, chrom_class_label, Feature.types['chromosome']) return def addChromosomeInstance(self, chr_num, reference_id, reference_label, chr_type=None): """ Add the supplied chromosome as an instance within the given reference :param chr: :param reference_id: for example, a build id like UCSC:hg19 :param reference_label: :param chr_type: this is the class that this is an instance of. typically a genome-specific chr :return: """ chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH') chr_label = makeChromLabel(str(chr_num), reference_label) self.gu.addIndividualToGraph(self.graph, chr_id, chr_label, Feature.types['chromosome']) self.gu.addType(self.graph, chr_id, chr_type) # add the build-specific chromosome as a member of the build (both ways) self.gu.addMember(self.graph, reference_id, chr_id) self.gu.addMemberOf(self.graph, chr_id, reference_id) return def make_variant_locus_label(self, gene_label, allele_label): if gene_label is None: gene_label = '' label = gene_label.strip()+'<' + allele_label.strip() + '>' return label def make_vslc_label(self, gene_label, allele1_label, allele2_label): """ Make a Variant Single Locus Complement (VSLC) in monarch-style. :param gene_label: :param allele1_label: :param allele2_label: :return: """ vslc_label = '' if (gene_label is None and allele1_label is None and allele2_label is None): logger.error("Not enough info to make vslc label") return None top = self.make_variant_locus_label(gene_label, allele1_label) bottom = '' if allele2_label is not None: bottom = self.make_variant_locus_label(gene_label, allele2_label) vslc_label = '/'.join((top, bottom)) return vslc_label
def _process_diseasegene(self, limit): """ :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 geno = Genotype(g) gu = GraphUtils(curie_map.get()) myfile = '/'.join((self.rawdir, self.files['disease-gene']['file'])) # PYLINT complains iterparse deprecated, # but as of py 3.4 only the optional & unsupplied parse arg is. for event, elem in ET.iterparse(myfile): if elem.tag == 'Disorder': # get the element name and id, ignoreS element name # id = elem.get('id') # some internal identifier disorder_num = elem.find('OrphaNumber').text disorder_id = 'Orphanet:'+str(disorder_num) if self.testMode and \ disorder_id not in \ config.get_config()['test_ids']['disease']: continue disorder_label = elem.find('Name').text # make a hash of internal gene id to type for later lookup gene_iid_to_type = {} gene_list = elem.find('GeneList') for gene in gene_list.findall('Gene'): gene_iid = gene.get('id') gene_type = gene.find('GeneType').get('id') gene_iid_to_type[gene_iid] = gene_type # assuming that these are in the ontology gu.addClassToGraph(g, disorder_id, disorder_label) assoc_list = elem.find('DisorderGeneAssociationList') for a in assoc_list.findall('DisorderGeneAssociation'): gene_iid = a.find('.//Gene').get('id') gene_name = a.find('.//Gene/Name').text gene_symbol = a.find('.//Gene/Symbol').text gene_num = a.find('./Gene/OrphaNumber').text gene_id = 'Orphanet:'+str(gene_num) gene_type_id = \ self._map_gene_type_id(gene_iid_to_type[gene_iid]) gu.addClassToGraph( g, gene_id, gene_symbol, gene_type_id, gene_name) syn_list = a.find('./Gene/SynonymList') if int(syn_list.get('count')) > 0: for s in syn_list.findall('./Synonym'): gu.addSynonym(g, gene_id, s.text) dgtype = a.find('DisorderGeneAssociationType').get('id') rel_id = self._map_rel_id(dgtype) dg_label = \ a.find('./DisorderGeneAssociationType/Name').text if rel_id is None: logger.warning( "Cannot map association type (%s) to RO " + "for association (%s | %s). Skipping.", dg_label, disorder_label, gene_symbol) continue alt_locus_id = '_'+gene_num+'-'+disorder_num+'VL' alt_label = \ ' '.join(('some variant of', gene_symbol.strip(), 'that is a', dg_label.lower(), disorder_label)) if self.nobnodes: alt_locus_id = ':'+alt_locus_id gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts['variant_locus']) geno.addAlleleOfGene(alt_locus_id, gene_id) # consider typing the gain/loss-of-function variants like: # http://sequenceontology.org/browser/current_svn/term/SO:0002054 # http://sequenceontology.org/browser/current_svn/term/SO:0002053 # use "assessed" status to issue an evidence code # FIXME I think that these codes are sub-optimal status_code = \ a.find('DisorderGeneAssociationStatus').get('id') # imported automatically asserted information # used in automatic assertion eco_id = 'ECO:0000323' # Assessed # TODO are these internal ids stable between releases? if status_code == '17991': # imported manually asserted information # used in automatic assertion eco_id = 'ECO:0000322' # Non-traceable author statement ECO_0000034 # imported information in automatic assertion ECO_0000313 assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id) assoc.add_evidence(eco_id) assoc.add_association_to_graph(g) rlist = a.find('./Gene/ExternalReferenceList') eqid = None for r in rlist.findall('ExternalReference'): if r.find('Source').text == 'Ensembl': eqid = 'ENSEMBL:'+r.find('Reference').text elif r.find('Source').text == 'HGNC': eqid = 'HGNC:'+r.find('Reference').text elif r.find('Source').text == 'OMIM': eqid = 'OMIM:'+r.find('Reference').text else: pass # skip the others for now if eqid is not None: gu.addClassToGraph(g, eqid, None) gu.addEquivalentClass(g, gene_id, eqid) elem.clear() # discard the element if self.testMode and limit is not None and line_counter > limit: return gu.loadProperties( g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP) gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP) gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP) gu.loadAllProperties(g) return
def _process_omim2gene(self, limit=None): """ This method maps the OMIM IDs and KEGG gene ID. Currently split based on the link_type field. Equivalent link types are mapped as gene XRefs. Reverse link types are mapped as disease to gene associations. Original link types are currently skipped. Triples created: <kegg_gene_id> is a Gene <omim_gene_id> is a Gene <kegg_gene_id>> hasXref <omim_gene_id> <assoc_id> has subject <omim_disease_id> <assoc_id> has object <kegg_gene_id> :param limit: :return: """ logger.info("Processing OMIM to KEGG gene") if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 geno = Genotype(g) gu = GraphUtils(curie_map.get()) raw = '/'.join((self.rawdir, self.files['omim2gene']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (kegg_gene_id, omim_id, link_type) = row if self.testMode and kegg_gene_id not in self.test_ids['genes']: continue kegg_gene_id = 'KEGG-'+kegg_gene_id.strip() omim_id = re.sub('omim', 'OMIM', omim_id) if link_type == 'equivalent': # these are genes! so add them as a class then make equivalence gu.addClassToGraph(g, omim_id, None) geno.addGene(kegg_gene_id, None) gu.addEquivalentClass(g, kegg_gene_id, omim_id) elif link_type == 'reverse': # make an association between an OMIM ID and the KEGG gene ID # we do this with omim ids because they are more atomic than KEGG ids alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id) alt_label = self.label_hash[alt_locus_id] gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts['variant_locus']) geno.addAlleleOfGene(alt_locus_id, kegg_gene_id) # Add the disease to gene relationship. rel = gu.object_properties['is_marker_for'] assoc = G2PAssoc(self.name, alt_locus_id, omim_id, rel) assoc.add_association_to_graph(g) elif link_type == 'original': # these are sometimes a gene, and sometimes a disease logger.info('Unable to handle original link for %s-%s', kegg_gene_id, omim_id) else: # don't know what these are logger.warn('Unhandled link type for %s-%s: %s', kegg_gene_id, omim_id, link_type) if (not self.testMode) and (limit is not None and line_counter > limit): break logger.info("Done with OMIM to KEGG gene") gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP) gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP) gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP) return
class Decipher(Source): """ The Decipher group curates and assembles the Development Disorder Genotype Phenotype Database (DDG2P) which is a curated list of genes reported to be associated with developmental disorders, compiled by clinicians as part of the DDD study to facilitate clinical feedback of likely causal variants. Beware that the redistribution of this data is a bit unclear from the [license](https://decipher.sanger.ac.uk/legal). If you intend to distribute this data, be sure to have the appropriate licenses in place. """ files = { 'annot': { 'file': 'ddg2p.zip', 'url': 'https://decipher.sanger.ac.uk/files/ddd/ddg2p.zip'} } def __init__(self): Source.__init__(self, 'decipher') self.load_bindings() self.dataset = Dataset( 'decipher', 'Development Disorder Genotype – Phenotype Database', 'https://decipher.sanger.ac.uk/', None, 'https://decipher.sanger.ac.uk/legal') if 'test_ids' not in config.get_config() \ or 'disease' not in config.get_config()['test_ids']: logger.warning("not configured with disease test ids.") self.test_ids = [] else: self.test_ids = config.get_config()['test_ids']['disease'] self.gu = GraphUtils(curie_map.get()) self.g = self.graph self.geno = Genotype(self.g) return def fetch(self, is_dl_forced=False): self.get_files(is_dl_forced) # since there's a dependency on HGNC files; fetch those too hgnc = HGNC() hgnc.fetch(is_dl_forced) return def parse(self, limit=None): if limit is not None: logger.info("Only parsing first %s rows", limit) logger.info("Parsing files...") if self.testOnly: self.testMode = True self.g = self.testgraph else: self.g = self.graph self.geno = Genotype(self.g) # rare disease-phenotype associations self._process_ddg2p_annotations(limit) logger.info("Finished parsing.") return def _process_ddg2p_annotations(self, limit): """ The ddg2p annotations associate a gene symbol to an omim disease, along with some HPO ids and pubs. The gene symbols come from gencode, which in turn come from HGNC official gene symbols. Therefore, we use the HGNC source class to get the id/symbol mapping for use in our annotations here. According to http://www.gencodegenes.org/faq.html, "Gene names are usually HGNC or MGI-approved gene symbols mapped to the GENCODE genes by the Ensembl xref pipeline. Sometimes, when there is no official gene symbol, the Havana clone-based name is used." The kind of variation that is linked to a disease is indicated (LOF, GOF, CNV, etc) in the source data. Here, we create an anonymous variant of the specified gene of the indicated type (mapped to the sequence ontology (SO)). :param limit: :return: """ line_counter = 0 if self.g is not None: g = self.g else: g = self.graph gu = GraphUtils(curie_map.get()) # in order for this to work, we need to map the HGNC id-symbol; hgnc = HGNC() hgnc_symbol_id_map = hgnc.get_symbol_id_map() myzip = ZipFile( '/'.join((self.rawdir, self.files['annot']['file'])), 'r') # use the ddg2p.txt file fname = 'ddg2p.txt' unmapped_omim_counter = 0 unmapped_gene_count = 0 with myzip.open(fname, 'r') as f: f = io.TextIOWrapper(f) reader = csv.reader(f, delimiter='\t', quotechar='\"') # score_means_by_measure = {} # strain_scores_by_measure = {} # TODO theseare unused for row in reader: line_counter += 1 if re.match(r'#', row[0]): # skip comments continue (gencode_gene_name, mode, category, consequence, disease, omim, ddg2p_id, pubmed_ids, hpo_codes) = row hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip()) if hgnc_id is None: logger.error( "Couldn't map the gene symbol %s to HGNC.", gencode_gene_name) unmapped_gene_count += 1 continue # add the gene gu.addClassToGraph(g, hgnc_id, gencode_gene_name) # TODO make VSLC with the variation # to associate with the disorder # TODO use the Inheritance and Mutation consequence # to classify the VSLCs allele_id = self.make_allele_by_consequence( consequence, hgnc_id, gencode_gene_name) if omim.strip() != '': omim_id = 'OMIM:'+str(omim.strip()) # assume this is declared elsewhere in ontology gu.addClassToGraph(g, omim_id, None) if category.strip() == 'Confirmed DD gene': rel = gu.object_properties['has_phenotype'] elif category.strip() == 'Probable DD gene': rel = gu.object_properties['has_phenotype'] elif category.strip() == 'Possible DD gene': rel = gu.object_properties['contributes_to'] elif category.strip() == 'Not DD gene': # TODO negative annotation continue assoc = G2PAssoc(self.name, allele_id, omim_id) # TODO 'rel' is assigned to but never used for p in re.split(r';', pubmed_ids): p = p.strip() if p != '': pmid = 'PMID:'+str(p) r = Reference( pmid, Reference.ref_types['journal_article']) r.addRefToGraph(g) assoc.add_source(pmid) assoc.add_association_to_graph(g) else: # these are unmapped to a disease id. # note that some match OMIM disease labels # but the identifiers are just not included. # TODO consider mapping to OMIM or DOIDs in other ways logger.warning( "No omim id on line %d\n%s", line_counter, str(row)) unmapped_omim_counter += 1 # TODO hpo phenotypes # since the DDG2P file is not documented, # I don't know what the HPO annotations are actually about # are they about the gene? the omim disease? something else? # So, we wont create associations until this is clarified if not self.testMode and limit is not None \ and line_counter > limit: break myzip.close() logger.warning( "gene-disorder associations with no omim id: %d", unmapped_omim_counter) logger.warning("unmapped gene count: %d", unmapped_gene_count) gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP) gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP) gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP) return def make_allele_by_consequence(self, consequence, gene_id, gene_symbol): """ Given a "consequence" label that describes a variation type, create an anonymous variant of the specified gene as an instance of that consequence type. :param consequence: :param gene_id: :param gene_symbol: :return: allele_id """ allele_id = None # Loss of function : Nonsense, frame-shifting indel, # essential splice site mutation, whole gene deletion or any other # mutation where functional analysis demonstrates clear reduction # or loss of function # All missense/in frame : Where all the mutations described in the data # source are either missense or in frame deletions and there is no # evidence favoring either loss-of-function, activating or # dominant negative effect # Dominant negative : Mutation within one allele of a gene that creates # a significantly greater deleterious effect on gene product # function than a monoallelic loss of function mutation # Activating : Mutation, usually missense that results in # a constitutive functional activation of the gene product # Increased gene dosage : Copy number variation that increases # the functional dosage of the gene # Cis-regulatory or promotor mutation : Mutation in cis-regulatory # elements that lies outwith the known transcription unit and # promotor of the controlled gene # Uncertain : Where the exact nature of the mutation is unclear or # not recorded so_type = { # type of variant 'Loss of function': 'SO:0002054', # loss of function 'All missense/in frame': 'SO:0001583', # missense 'Dominant negative': 'SO:0002052', # dominant negative 'Activating': 'SO:0002053', # gain of function 'Increased gene dosage': 'SO:0001742', # copy number gain # regulatory region 'Cis-regulatory or promotor mutation': 'SO:0001566', 'Uncertain': 'SO:0001060', # generic sequence '5 or 3UTR mutation': 'SO:0001622', # UTR } type_id = so_type.get(consequence) if type_id is None: logger.warning("Consequence type unmapped: %s", str(consequence)) type_id = 'SO:0001060' # sequence variant # make the allele allele_id = ''.join((gene_id, type_id)) allele_id = re.sub(r':', '', allele_id) allele_id = '_'+allele_id # make this a BNode if self.nobnodes: allele_id = ':'+allele_id allele_label = ' '.join((consequence, 'allele in', gene_symbol)) self.gu.addIndividualToGraph(self.g, allele_id, allele_label, type_id) self.geno.addAlleleOfGene(allele_id, gene_id) return allele_id
class Provenance: """ To model provenance as the basis for an association. This encompases: * measurements taken from the lab, and their significance. these can be derived from papers or other agents. * papers >1 measurement may result from an assay, each of which may have it's own significance Status: IN PROGRESS (as observed from the incomplete identifiers TODO: add in """ prov_types = { 'measurement datum': 'IAO:0000109', 'zscore': 'STATO:0000104', 'pvalue': 'OBI:0000175', 'assay': 'OBI:0000070', 'agent': 'IAO:xxxxxxx', 'organization': 'foaf:organization', 'person': GraphUtils.PERSON, 'statistical_hypothesis_test': 'OBI:0000673' } rel_properties = { # FIXME using has_specified_output 'has_significance': 'STATO:has_significance', 'has_agent': 'RO:has_agent' # FIXME } data_property = { # FIXME should this be in RO? 'has_value': 'STATO:0000129', 'has_measurement': 'IAO:0000004' } def __init__(self, prov_type=None): if prov_type is None: self.prov_type = 'OBAN:provenance' self.prov_id = None self.measurement_datums = {} self.agent = None self.reference = None # TODO this will be papers in the future self.gu = GraphUtils(curie_map.get()) return def set_prov_id(self, prov_id): self.prov_id = prov_id return def get_prov_id(self): return self.prov_id def add_measurement_data(self, assay_id, measurement_unit, significance=None): """ Adds an object to measurement_datums like: assay_id : { # MB says each assay should be an instance of OBI Assay type: OBI:0000070 label: appmeth unit: unit id or literal? significance: { type: significance class id value: #### unit: unit id (optional) } (optional) } :param assay_id: :param measurement_value: :param measurement_unit: :param significance: :return: """ if assay_id not in self.measurement_datums: # @N @M This needs to be a list, not a set, # because multiple values may be possible for a given assay? self.measurement_datums[assay_id] = list() ms = self.get_score(self.prov_types['measurement datum'], measurement_unit) if significance is not None: ms['significance'] = significance self.measurement_datums[assay_id].append(ms) # as a convenience, we return the measurement data return ms def get_zscore(self, zscore_value): """ This will get you a significance scoring object :param zscore_value: :return: """ s = self.get_score(self.prov_types['zscore'], zscore_value) return s def get_score_id(self, score_type, score_value, score_unit=None): s = '-'.join((re.sub(r':', '', score_type), str(score_value), str(score_unit))) return s def get_score(self, score_type, score_value, score_unit=None): # TODO if score_type or score_value is None, then throw error score = { 'value': score_value, 'type': score_type } if score_unit is not None: score['unit'] = score_unit return score def add_provenance_to_graph(self, graph): # we make the assumption that the agent # has already been added to the graph, ok? self.add_measurement_data_to_graph(graph, self.measurement_datums) return def add_agent_to_graph(self, graph, agent_id, agent_label, agent_type=None, agent_description=None): if agent_type is None: agent_type = self.prov_types['agent'] self.gu.addIndividualToGraph(graph, agent_id, agent_label, agent_type, agent_description) self.agent = agent_id return def add_measurement_data_to_graph(self, graph, measurement_data): # assay_id : { # type: "measurement datum" class # value: #####, # unit: unit id or literal? # significance: { # type: significance class id # value: #### # unit: unit id (optional) # } (optional) # } # if no prov_id, then make it a random blank node if self.prov_id is None: t = datetime.now() t_string = t.strftime("%Y-%m-%d-%H-%M") self.prov_id = '_'+str(self.agent)+t_string # # TODO deal with units # for m in measurement_data: # s = measurement_data[m] # # we assume that the assay has already been added # # as an Individual # with properties elsewhere # # for i in s: # # logger.debug("\tTYPE: "+str(i.get('type'))+"\tVALUE: "+ # str(i.get('value'))+"\tUNIT: "+ # str(i.get('unit'))+" TYPE: None\tDESCRIPTION:"+ # str(i.get('description'))) # mid = self.get_score_id(i.get('type'), i.get('value'), # i.get('unit')) # # self.gu.addIndividualToGraph(graph, mid, None, # i.get('description')) # self.gu.addTriple( # graph, mid, self.data_property['has_value'], # i.get('value')) # sig = i.get('significance') # if sig is not None: # sid = self.get_score_id(i.get('type'), i.get('value'), # i.get('unit')) # self.gu.addIndividualToGraph(graph, sid, None, # s.get('type')) # self.gu.addTriple(graph, mid, # self.rel_properties['has_significance'], # sid) # self.gu.addTriple(graph, sid, # self.data_property['has_value'], # sig.get('value')) # # add the measurement as part of this provenance # self.gu.addTriple(graph, self.prov_id, # # TODO should this be "has_measurement"? # self.gu.object_properties['has_part'], mid) # self.gu.addTriple(graph, self.prov_id, # self.gu.object_properties['has_agent'], # self.agent) return def add_assay_to_graph(self, graph, assay_id, assay_label, assay_type=None, assay_description=None): if assay_type is None: assay_type = self.prov_types['assay'] self.gu.addIndividualToGraph(graph, assay_id, assay_label, assay_type, assay_description) return
def _process_kegg_disease2gene(self, limit=None): """ This method creates an association between diseases and their associated genes. We are being conservative here, and only processing those diseases for which there is no mapping to OMIM. Triples created: <alternate_locus> is an Individual <alternate_locus> has type <variant_locus> <alternate_locus> is an allele of <gene_id> <assoc_id> has subject <disease_id> <assoc_id> has object <gene_id> :param limit: :return: """ logger.info("Processing KEGG disease to gene") if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 geno = Genotype(g) gu = GraphUtils(curie_map.get()) rel = gu.object_properties['is_marker_for'] gu.loadAllProperties(g) noomimset = set() raw = '/'.join((self.rawdir, self.files['disease_gene']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (gene_id, disease_id) = row if self.testMode and gene_id not in self.test_ids['genes']: continue gene_id = 'KEGG-'+gene_id.strip() disease_id = 'KEGG-'+disease_id.strip() # only add diseases for which there is no omim id and not a grouping class if disease_id not in self.kegg_disease_hash: # add as a class disease_label = None if disease_id in self.label_hash: disease_label = self.label_hash[disease_id] if re.search('includ', str(disease_label)): # they use 'including' when it's a grouping class logger.info("Skipping this association because it's a grouping class: %s", disease_label) continue gu.addClassToGraph(g, disease_id, disease_label, 'DOID:4') # type this disease_id as a disease noomimset.add(disease_id) alt_locus_id = self._make_variant_locus_id(gene_id, disease_id) alt_label = self.label_hash[alt_locus_id] gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts['variant_locus']) geno.addAlleleOfGene(alt_locus_id, gene_id) # Add the disease to gene relationship. assoc = G2PAssoc(self.name, alt_locus_id, disease_id, rel) assoc.load_all_properties(g) assoc.add_association_to_graph(g) if (not self.testMode) and (limit is not None and line_counter > limit): break logger.info("Done with KEGG disease to gene") logger.info("Found %d diseases with no omim id", len(noomimset)) return
def _process_QTLs_genomic_location(self, raw, taxon_id, build_id, build_label, limit=None): """ This method Triples created: :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph gu = GraphUtils(curie_map.get()) line_counter = 0 geno = Genotype(g) genome_id = geno.makeGenomeID(taxon_id) # assume that chrs get added to the genome elsewhere eco_id = "ECO:0000061" # Quantitative Trait Analysis Evidence with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile: reader = csv.reader(tsvfile, delimiter="\t") for row in reader: line_counter += 1 if re.match('^#', ' '.join(row)): continue (chromosome, qtl_source, qtl_type, start_bp, stop_bp, frame, strand, score, attr) = row # Chr.Z Animal QTLdb Production_QTL 33954873 34023581 . . . # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234; # trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass"; # CMO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian"; # Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52"; # Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01" # make dictionary of attributes # keys are: # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait, # FlankMarkers,VTO_name,Map_Type,Significance,P-value,Model,Test_Base,Variance, # Bayes-value,PTO_name,gene_IDsrc,peak_cM,CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect, # Dominance_Effect,Likelihood_Ratio,LS-means,Breed, # trait (duplicate with Name),Variance,Bayes-value, # F-Stat,LOD-score,Additive_Effect,Dominance_Effect,Likelihood_Ratio,LS-means # deal with poorly formed attributes if re.search('"FlankMarkers";', attr): attr = re.sub('"FlankMarkers";', '', attr) attr_items = re.sub('"', '', attr).split(";") bad_attr_flag = False for a in attr_items: if not re.search('=', a): bad_attr_flag = True if bad_attr_flag: logger.error("Poorly formed data on line %d:\n %s", line_counter, '\t'.join(row)) continue attribute_dict = dict(item.split("=") for item in re.sub('"', '', attr).split(";")) qtl_num = attribute_dict.get('QTL_ID') if self.testMode and int(qtl_num) not in self.test_ids: continue # make association between QTL and trait qtl_id = 'AQTL:' + str(qtl_num) gu.addIndividualToGraph(g, qtl_id, None, geno.genoparts['QTL']) geno.addTaxon(taxon_id, qtl_id) trait_id = 'AQTLTrait:'+attribute_dict.get('trait_ID') # if pub is in attributes, add it to the association pub_id = None if 'PUBMED_ID' in attribute_dict.keys(): pub_id = attribute_dict.get('PUBMED_ID') if re.match('ISU.*', pub_id): pub_id = 'AQTLPub:' + pub_id.strip() p = Reference(pub_id) else: pub_id = 'PMID:' + pub_id.strip() p = Reference(pub_id, Reference.ref_types['journal_article']) p.addRefToGraph(g) # Add QTL to graph assoc = G2PAssoc(self.name, qtl_id, trait_id, gu.object_properties['is_marker_for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) if 'P-value' in attribute_dict.keys(): score = float(re.sub('<', '', attribute_dict.get('P-value'))) assoc.set_score(score) assoc.add_association_to_graph(g) # TODO make association to breed (which means making QTL feature in Breed background) # get location of QTL chromosome = re.sub('Chr\.', '', chromosome) chrom_id = makeChromID(chromosome, taxon_id, 'CHR') chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH') geno.addChromosomeInstance(chromosome, build_id, build_label, chrom_id) qtl_feature = Feature(qtl_id, None, geno.genoparts['QTL']) if start_bp == '': start_bp = None qtl_feature.addFeatureStartLocation(start_bp, chrom_in_build_id, strand, [Feature.types['FuzzyPosition']]) if stop_bp == '': stop_bp = None qtl_feature.addFeatureEndLocation(stop_bp, chrom_in_build_id, strand, [Feature.types['FuzzyPosition']]) qtl_feature.addTaxonToFeature(g, taxon_id) qtl_feature.addFeatureToGraph(g) if not self.testMode and limit is not None and line_counter > limit: break logger.info("Done with QTL genomic mappings for %s", taxon_id) return
class Environment(): """ These methods provide convenient methods to add items related to an experimental environment and it's parts to a supplied graph. This is a stub ready for expansion. """ # special genotype parts mapped to their GENO and SO classes # that we explicitly reference here environment_parts = { 'environmental_system': 'ENVO:01000254', 'environmental_condition': 'XCO:0000000', 'morpholio_reagent': 'REO:0000042', 'talen_reagent': 'REO:0001022', 'crispr_reagent': 'REO:crispr_TBD' } object_properties = { 'has_part': 'BFO:0000051', } annotation_properties = { } properties = object_properties.copy() properties.update(annotation_properties) def __init__(self, graph): self.gu = GraphUtils(curie_map.get()) self.graph = graph self.gu.loadProperties( self.graph, self.object_properties, self.gu.OBJPROP) return def addEnvironment( self, env_id, env_label, env_type=None, env_description=None): if env_type is None: env_type = self.environment_parts['environmental_system'] self.gu.addIndividualToGraph( self.graph, env_id, env_label, env_type, env_description) return def addEnvironmentalCondition( self, cond_id, cond_label, cond_type=None, cond_description=None): if cond_type is None: cond_type = self.environment_parts['environmental_condition'] self.gu.addIndividualToGraph( self.graph, cond_id, cond_label, cond_type, cond_description) return def addComponentToEnvironment(self, env_id, component_id): self.gu.addTriple( self.graph, env_id, self.gu.object_properties['has_part'], # TODO cbeck if cself component_id) return def addComponentAttributes( self, component_id, entity_id, value=None, unit=None): self.gu.addTriple( self.graph, component_id, self.gu.object_properties['has_part'], entity_id) # TODO add value and units return
class CTD(Source): """ The Comparative Toxicogenomics Database (CTD) includes curated data describing cross-species chemical–gene/protein interactions and chemical– and gene–disease associations to illuminate molecular mechanisms underlying variable susceptibility and environmentally influenced diseases. Here, we fetch, parse, and convert data from CTD into triples, leveraging only the associations based on DIRECT evidence (not using the inferred associations). We currently process the following associations: * chemical-disease * gene-pathway * gene-disease CTD curates relationships between genes and chemicals/diseases with marker/mechanism and/or therapeutic. Unfortunately, we cannot disambiguate between marker (gene expression) and mechanism (causation) for these associations. Therefore, we are left to relate these simply by "marker". CTD also pulls in genes and pathway membership from KEGG and REACTOME. We create groups of these following the pattern that the specific pathway is a subclass of 'cellular process' (a go process), and the gene is "involved in" that process. For diseases, we preferentially use OMIM identifiers when they can be used uniquely over MESH. Otherwise, we use MESH ids. Note that we scrub the following identifiers and their associated data: * REACT:116125 - generic disease class * MESH:D004283 - dog diseases * MESH:D004195 - disease models, animal * MESH:D030342 - genetic diseases, inborn * MESH:D040181 - genetic dieases, x-linked * MESH:D020022 - genetic predisposition to a disease """ files = { 'chemical_disease_interactions': { 'file': 'CTD_chemicals_diseases.tsv.gz', 'url': 'http://ctdbase.org/reports/CTD_chemicals_diseases.tsv.gz' }, 'gene_pathway': { 'file': 'CTD_genes_pathways.tsv.gz', 'url': 'http://ctdbase.org/reports/CTD_genes_pathways.tsv.gz' }, 'gene_disease': { 'file': 'CTD_genes_diseases.tsv.gz', 'url': 'http://ctdbase.org/reports/CTD_genes_diseases.tsv.gz' } } static_files = { 'publications': {'file': 'CTD_curated_references.tsv'} } def __init__(self): Source.__init__(self, 'ctd') self.dataset = Dataset('ctd', 'CTD', 'http://ctdbase.org', None, 'http://ctdbase.org/about/legal.jsp') if 'test_ids' not in config.get_config() or 'gene' not in config.get_config()['test_ids']: logger.warn("not configured with gene test ids.") self.test_geneids = [] else: self.test_geneids = config.get_config()['test_ids']['gene'] if 'test_ids' not in config.get_config() or 'disease' not in config.get_config()['test_ids']: logger.warn("not configured with disease test ids.") self.test_diseaseids = [] else: self.test_diseaseids = config.get_config()['test_ids']['disease'] self.gu = GraphUtils(curie_map.get()) return def fetch(self, is_dl_forced=False): """ Override Source.fetch() Fetches resources from CTD using the CTD.files dictionary Args: :param is_dl_forced (bool): Force download Returns: :return None """ self.get_files(is_dl_forced) self._fetch_disambiguating_assoc() # consider creating subsets of the files that only have direct annotations (not inferred) return def parse(self, limit=None): """ Override Source.parse() Parses version and interaction information from CTD Args: :param limit (int, optional) limit the number of rows processed Returns: :return None """ if limit is not None: logger.info("Only parsing first %d rows", limit) logger.info("Parsing files...") # pub_map = dict() # file_path = '/'.join((self.rawdir, # self.static_files['publications']['file'])) # if os.path.exists(file_path) is True: # pub_map = self._parse_publication_file( # self.static_files['publications']['file'] # ) if self.testOnly: self.testMode = True if self.testMode: self.g = self.testgraph else: self.g = self.graph self.geno = Genotype(self.g) self.path = Pathway(self.g, self.nobnodes) self._parse_ctd_file(limit, self.files['chemical_disease_interactions']['file']) self._parse_ctd_file(limit, self.files['gene_pathway']['file']) self._parse_ctd_file(limit, self.files['gene_disease']['file']) self._parse_curated_chem_disease(limit) self.gu.loadAllProperties(self.g) # self.gu.loadProperties(self.g, self.REL_MAP, self.gu.OBJPROP) self.load_bindings() logger.info("Done parsing files.") return def _parse_ctd_file(self, limit, file): """ Parses files in CTD.files dictionary Args: :param limit (int): limit the number of rows processed :param file (str): file name (must be defined in CTD.file) Returns: :return None """ row_count = 0 version_pattern = re.compile('^# Report created: (.+)$') is_versioned = False file_path = '/'.join((self.rawdir, file)) with gzip.open(file_path, 'rt') as tsvfile: reader = csv.reader(tsvfile, delimiter="\t") for row in reader: # Scan the header lines until we get the version # There is no official version sp we are using # the upload timestamp instead if is_versioned is False: match = re.match(version_pattern, ' '.join(row)) if match: version = re.sub(r'\s|:', '-', match.group(1)) # TODO convert this timestamp to a proper timestamp self.dataset.setVersion(version) is_versioned = True elif re.match('^#', ' '.join(row)): pass else: row_count += 1 if file == self.files['chemical_disease_interactions']['file']: self._process_interactions(row) elif file == self.files['gene_pathway']['file']: self._process_pathway(row) elif file == self.files['gene_disease']['file']: self._process_disease2gene(row) if not self.testMode and limit is not None and row_count >= limit: break return def _process_pathway(self, row): """ Process row of CTD data from CTD_genes_pathways.tsv.gz and generate triples Args: :param row (list): row of CTD data Returns: :return None """ self._check_list_len(row, 4) (gene_symbol, gene_id, pathway_name, pathway_id) = row if self.testMode and (int(gene_id) not in self.test_geneids): return entrez_id = 'NCBIGene:'+gene_id if re.match('REACT:116125', pathway_id): # skipping this one, as it is generic "Disease" return # convert KEGG pathway ids... KEGG:12345 --> KEGG-path:map12345 if re.match('KEGG', pathway_id): pathway_id = re.sub('KEGG:', 'KEGG-path:map', pathway_id) self.gu.addClassToGraph(self.graph, entrez_id, None) # just in case, add it as a class self.path.addPathway(pathway_id, pathway_name) self.path.addGeneToPathway(pathway_id, entrez_id) return def _fetch_disambiguating_assoc(self): """ For any of the items in the chemical-disease association file that have ambiguous association types we fetch the disambiguated associations using the batch query API, and store these in a file. Elsewhere, we can loop through the file and create the appropriate associations. :return: """ disambig_file = '/'.join((self.rawdir, self.static_files['publications']['file'])) assoc_file = '/'.join((self.rawdir, self.files['chemical_disease_interactions']['file'])) # check if there is a local association file, and download if it's dated later than the original intxn file if os.path.exists(disambig_file): dfile_dt = os.stat(disambig_file) afile_dt = os.stat(assoc_file) if dfile_dt < afile_dt: logger.info("Local disambiguating file date < chem-disease assoc file. Downloading...") else: logger.info("Local disambiguating file date > chem-disease assoc file. Skipping download.") return all_pubs = set() dual_evidence = re.compile('^marker\/mechanism\|therapeutic$') # first get all the unique publications with gzip.open(assoc_file, 'rt') as tsvfile: reader = csv.reader(tsvfile, delimiter="\t") for row in reader: if re.match('^#', ' '.join(row)): continue self._check_list_len(row, 10) (chem_name, chem_id, cas_rn, disease_name, disease_id, direct_evidence, inferred_gene_symbol, inference_score, omim_ids, pubmed_ids) = row if direct_evidence == '' or not re.match(dual_evidence, direct_evidence): continue if pubmed_ids is not None and pubmed_ids != '': all_pubs.update(set(re.split('\|', pubmed_ids))) sorted_pubs = sorted(list(all_pubs)) # now in batches of 4000, we fetch the chemical-disease associations batch_size = 4000 params = { 'inputType': 'reference', 'report': 'diseases_curated', 'format': 'tsv', 'action': 'Download' } url = 'http://ctdbase.org/tools/batchQuery.go?q' start = 0 end = min((batch_size, len(all_pubs))) # get them in batches of 4000 with open(disambig_file, 'wb') as f: while start < len(sorted_pubs): params['inputTerms'] = '|'.join(sorted_pubs[start:end]) # fetch the data from url logger.info('fetching %d (%d-%d) refs: %s', len(re.split('\|', params['inputTerms'])), start, end, params['inputTerms']) data = urllib.parse.urlencode(params) encoding = 'utf-8' binary_data = data.encode(encoding) req = urllib.request.Request(url, binary_data) resp = urllib.request.urlopen(req) f.write(resp.read()) start = end end = min((start+batch_size, len(sorted_pubs))) return def _process_interactions(self, row): """ Process row of CTD data from CTD_chemicals_diseases.tsv.gz and generate triples. Only create associations based on direct evidence (not using the inferred-via-gene), and unambiguous relationships. (Ambiguous ones will be processed in the sister method using the disambiguated file). There are no OMIM ids for diseases in these cases, so we associate with only the mesh disease ids. Args: :param row (list): row of CTD data Returns: :return None """ self._check_list_len(row, 10) (chem_name, chem_id, cas_rn, disease_name, disease_id, direct_evidence, inferred_gene_symbol, inference_score, omim_ids, pubmed_ids) = row if direct_evidence == '': return evidence_pattern = re.compile('^therapeutic|marker\/mechanism$') # dual_evidence = re.compile('^marker\/mechanism\|therapeutic$') # filter on those diseases that are mapped to omim ids in the test set intersect = list(set(['OMIM:'+str(i) for i in omim_ids.split('|')]+[disease_id]) & set(self.test_diseaseids)) if self.testMode and len(intersect) < 1: return chem_id = 'MESH:'+chem_id reference_list = self._process_pubmed_ids(pubmed_ids) if re.match(evidence_pattern, direct_evidence): rel_id = self._get_relationship_id(direct_evidence) self.gu.addClassToGraph(self.g, chem_id, chem_name) self.gu.addClassToGraph(self.g, disease_id, None) self._make_association(chem_id, disease_id, rel_id, reference_list) else: # there's dual evidence, but haven't mapped the pubs pass # logger.debug("Dual evidence for %s (%s) and %s (%s)", chem_name, chem_id, disease_name, disease_id) return def _process_disease2gene(self, row): """ Here, we process the disease-to-gene associations. Note that we ONLY process direct associations (not inferred through chemicals). Furthermore, we also ONLY process "marker/mechanism" associations. We preferentially utilize OMIM identifiers over MESH identifiers for disease/phenotype. Therefore, if a single OMIM id is listed under the "omim_ids" list, we will choose this over any MeSH id that might be listed as the disease_id. If multiple OMIM ids are listed in the omim_ids column, we toss this for now. (Mostly, we are not sure what to do with this information.) We associate "some variant of gene X" with the phenotype, rather than the gene directly. We also pull in the MeSH labels here (but not OMIM) to ensure that we have them (as they may not be brought in separately). :param row: :return: """ # if self.testMode: # g = self.testgraph # else: # g = self.graph # self._check_list_len(row, 9) # geno = Genotype(g) # gu = GraphUtils(curie_map.get()) (gene_symbol, gene_id, disease_name, disease_id, direct_evidence, inference_chemical_name, inference_score, omim_ids, pubmed_ids) = row # we only want the direct associations; skipping inferred for now if direct_evidence == '' or direct_evidence != 'marker/mechanism': return # scrub some of the associations... it seems odd to link human genes to the following "diseases" diseases_to_scrub = [ 'MESH:D004283', # dog diseases 'MESH:D004195', # disease models, animal 'MESH:D030342', # genetic diseases, inborn 'MESH:D040181', # genetic dieases, x-linked 'MESH:D020022' # genetic predisposition to a disease ] if disease_id in diseases_to_scrub: logger.info("Skipping association between NCBIGene:%s and %s", str(gene_id), disease_id) return intersect = list(set(['OMIM:'+str(i) for i in omim_ids.split('|')]+[disease_id]) & set(self.test_diseaseids)) if self.testMode and (int(gene_id) not in self.test_geneids or len(intersect) < 1): return # there are three kinds of direct evidence: (marker/mechanism | marker/mechanism|therapeutic | therapeutic) # we are only using the "marker/mechanism" for now # TODO what does it mean for a gene to be therapeutic for disease? a therapeutic target? gene_id = 'NCBIGene:'+gene_id preferred_disease_id = disease_id if omim_ids is not None and omim_ids != '': omim_id_list = re.split('\|', omim_ids) # If there is only one OMIM ID for the Disease ID or in the omim_ids list, # use the OMIM ID preferentially over any MeSH ID. if re.match('OMIM:.*', disease_id): if len(omim_id_list) > 1: # the disease ID is an OMIM ID and there is more than one OMIM entry in omim_ids. # Currently no entries satisfy this condition pass elif disease_id != ('OMIM:'+omim_ids): # the disease ID is an OMIM ID and there is only one non-equiv OMIM entry in omim_ids # we preferentially use the disease_id here logger.warn("There may be alternate identifier for %s: %s", disease_id, omim_ids) # TODO: What should be done with the alternate disease IDs? else: if len(omim_id_list) == 1: # the disease ID is not an OMIM ID and there is only one OMIM entry in omim_ids. preferred_disease_id = 'OMIM:'+omim_ids elif len(omim_id_list) > 1: # This is when the disease ID is not an OMIM ID and there is more than one OMIM entry in omim_ids. pass # we actually want the association between the gene and the disease to be via an alternate locus # not the "wildtype" gene itself. # so we make an anonymous alternate locus, and put that in the association. alt_locus = '_'+gene_id+'-'+preferred_disease_id+'VL' alt_locus = re.sub(':', '', alt_locus) # can't have colons in the bnodes if self.nobnodes: alt_locus = ':'+alt_locus alt_label = 'some variant of '+gene_symbol+' that is '+direct_evidence+' for '+disease_name self.gu.addIndividualToGraph(self.g, alt_locus, alt_label, self.geno.genoparts['variant_locus']) self.gu.addClassToGraph(self.g, gene_id, None) # assume that the label gets added elsewhere self.geno.addAlleleOfGene(alt_locus, gene_id) # not sure if MESH is getting added separately. adding labels here for good measure dlabel = None if re.match('MESH', preferred_disease_id): dlabel = disease_name self.gu.addClassToGraph(self.g, preferred_disease_id, dlabel) # Add the disease to gene relationship. rel_id = self._get_relationship_id(direct_evidence) refs = self._process_pubmed_ids(pubmed_ids) self._make_association(alt_locus, preferred_disease_id, rel_id, refs) return def _make_association(self, subject_id, object_id, rel_id, pubmed_ids): """ Make a reified association given an array of pubmed identifiers. Args: :param subject_id id of the subject of the association (gene/chem) :param object_id id of the object of the association (disease) :param rel_id relationship id :param pubmed_ids an array of pubmed identifiers Returns: :return None """ # TODO pass in the relevant Assoc class rather than relying on G2P assoc = G2PAssoc(self.name, subject_id, object_id, rel_id) if pubmed_ids is not None and len(pubmed_ids) > 0: eco = self._get_evidence_code('TAS') for pmid in pubmed_ids: r = Reference(pmid, Reference.ref_types['journal_article']) r.addRefToGraph(self.g) assoc.add_source(pmid) assoc.add_evidence(eco) assoc.add_association_to_graph(self.g) assoc.load_all_properties(self.g) return @staticmethod def _process_pubmed_ids(pubmed_ids): """ Take a list of pubmed IDs and add PMID prefix Args: :param pubmed_ids - string representing publication ids seperated by a | symbol Returns: :return list: Pubmed curies """ if pubmed_ids.strip() == '': id_list = [] else: id_list = pubmed_ids.split('|') for (i, val) in enumerate(id_list): id_list[i] = 'PMID:'+val return id_list @staticmethod def _get_evidence_code(evidence): """ Get curie for evidence class label Args: :param evidence (str): evidence label Label: :return str: curie for evidence label from ECO """ eco_map = { 'TAS': 'ECO:0000033' } return eco_map[evidence] @staticmethod def _get_relationship_id(rel): """ Get curie from relationship property label Args: :param rel (str): relationship label Returns: :return str: curie for relationship label """ gu = GraphUtils(curie_map.get()) rel_map = { 'therapeutic': gu.object_properties['substance_that_treats'], 'marker/mechanism': gu.object_properties['is_marker_for'], } return str(rel_map[rel]) @staticmethod def _get_class_id(cls): """ Fet curie from CLASS_MAP dictionary Args: :param cls (str): class label Returns: :return str: curie for class label """ class_map = { 'pathway': 'PW:0000001', 'signal transduction': 'GO:0007165' } return class_map[cls] def _parse_curated_chem_disease(self, limit): line_counter = 0 file_path = '/'.join((self.rawdir, self.static_files['publications']['file'])) gu = GraphUtils(curie_map.get()) with open(file_path, 'r') as tsvfile: reader = csv.reader(tsvfile, delimiter="\t") for row in reader: # catch comment lines if re.match('^#', ' '.join(row)): continue line_counter += 1 self._check_list_len(row, 10) (pub_id, disease_label, disease_id, disease_cat, evidence, chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row rel_id = self._get_relationship_id(evidence) chem_id = 'MESH:'+chem_id gu.addClassToGraph(self.g, chem_id, chem_label) gu.addClassToGraph(self.g, disease_id, None) if pub_id != '': pub_id = 'PMID:'+pub_id r = Reference(pub_id, Reference.ref_types['journal_article']) r.addRefToGraph(self.g) else: pub_id = None self._make_association('MESH:'+chem_id, disease_id, rel_id, ['PMID:'+pub_id]) if not self.testMode and limit is not None and line_counter >= limit: break return def getTestSuite(self): import unittest from tests.test_ctd import CTDTestCase test_suite = unittest.TestLoader().loadTestsFromTestCase(CTDTestCase) # test_suite.addTests(unittest.TestLoader().loadTestsFromTestCase(InteractionsTestCase)) return test_suite
class Pathway(): """ This provides convenience methods to deal with gene and protein collections in the context of pathways. """ pathway_parts = { 'signal_transduction': 'GO:0007165', 'cellular_process': 'GO:0009987', 'pathway': 'PW:0000001', 'gene_product': 'CHEBI:33695' # bioinformation molecule } object_properties = { 'involved_in': 'RO:0002331', 'gene_product_of': 'RO:0002204', 'has_gene_product': 'RO:0002205' } properties = object_properties.copy() def __init__(self, graph, nobnodes=False): self.gu = GraphUtils(curie_map.get()) self.graph = graph self.nobnodes = nobnodes self.gu.loadProperties(self.graph, self.object_properties, self.gu.OBJPROP) return def addPathway( self, pathway_id, pathway_label, pathway_type=None, pathway_description=None): """ Adds a pathway as a class. If no specific type is specified, it will default to a subclass of "GO:cellular_process" and "PW:pathway". :param pathway_id: :param pathway_label: :param pathway_type: :param pathway_description: :return: """ if pathway_type is None: pathway_type = self.pathway_parts['cellular_process'] self.gu.addClassToGraph( self.graph, pathway_id, pathway_label, pathway_type, pathway_description) self.gu.addSubclass( self.graph, self.pathway_parts['pathway'], pathway_id) return def addGeneToPathway(self, pathway_id, gene_id): """ When adding a gene to a pathway, we create an intermediate 'gene product' that is involved in the pathway, through a blank node. gene_id RO:has_gene_product _gene_product _gene_product RO:involved_in pathway_id :param pathway_id: :param gene_id: :return: """ gene_product = '_'+re.sub(r':', '', gene_id)+'product' if self.nobnodes: gene_product = ':'+gene_product self.gu.addIndividualToGraph( self.graph, gene_product, None, self.pathway_parts['gene_product']) self.gu.addTriple( self.graph, gene_id, self.object_properties['has_gene_product'], gene_product) self.addComponentToPathway(pathway_id, gene_product) return def addComponentToPathway(self, pathway_id, component_id): """ This can be used directly when the component is directly involved in the pathway. If a transforming event is performed on the component first, then the addGeneToPathway should be used instead. :param pathway_id: :param component_id: :return: """ self.gu.addTriple(self.graph, component_id, self.object_properties['involved_in'], pathway_id) return
def _get_gene2pubmed(self, limit): """ Loops through the gene2pubmed file and adds a simple triple to say that a given publication is_about a gene. Publications are added as NamedIndividuals. These are filtered on the taxon. :param limit: :return: """ gu = GraphUtils(curie_map.get()) if self.testMode: g = self.testgraph else: g = self.graph logger.info("Processing Gene records") line_counter = 0 myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file'])) logger.info("FILE: %s", myfile) assoc_counter = 0 with gzip.open(myfile, 'rb') as f: for line in f: # skip comments line = line.decode().strip() if re.match(r'^#', line): continue (tax_num, gene_num, pubmed_num) = line.split('\t') # ## set filter=None in init if you don't want to have a filter # if self.filter is not None: # if ((self.filter == 'taxids' and \ # (int(tax_num) not in self.tax_ids)) # or (self.filter == 'geneids' and \ # (int(gene_num) not in self.gene_ids))): # continue # #### end filter if self.testMode and int(gene_num) not in self.gene_ids: continue if not self.testMode and int(tax_num) not in self.tax_ids: continue if gene_num == '-' or pubmed_num == '-': continue line_counter += 1 gene_id = ':'.join(('NCBIGene', gene_num)) pubmed_id = ':'.join(('PMID', pubmed_num)) if self.class_or_indiv.get(gene_id) == 'C': gu.addClassToGraph(g, gene_id, None) else: gu.addIndividualToGraph(g, gene_id, None) # add the publication as a NamedIndividual # add type publication gu.addIndividualToGraph(g, pubmed_id, None, None) r = Reference( pubmed_id, Reference.ref_types['journal_article']) r.addRefToGraph(g) gu.addTriple( g, pubmed_id, gu.object_properties['is_about'], gene_id) assoc_counter += 1 if not self.testMode and \ limit is not None and line_counter > limit: break logger.info( "Processed %d pub-gene associations", assoc_counter) return
def _get_gene_history(self, limit): """ Loops through the gene_history file and adds the old gene ids as deprecated classes, where the new gene id is the replacement for it. The old gene symbol is added as a synonym to the gene. :param limit: :return: """ gu = GraphUtils(curie_map.get()) if self.testMode: g = self.testgraph else: g = self.graph logger.info("Processing Gene records") line_counter = 0 myfile = '/'.join((self.rawdir, self.files['gene_history']['file'])) logger.info("FILE: %s", myfile) with gzip.open(myfile, 'rb') as f: for line in f: # skip comments line = line.decode().strip() if re.match(r'^#', line): continue (tax_num, gene_num, discontinued_num, discontinued_symbol, discontinued_date) = line.split('\t') # set filter=None in init if you don't want to have a filter # if self.filter is not None: # if ((self.filter == 'taxids' and \ # (int(tax_num) not in self.tax_ids)) # or (self.filter == 'geneids' and \ # (int(gene_num) not in self.gene_ids))): # continue # end filter if gene_num == '-' or discontinued_num == '-': continue if self.testMode and int(gene_num) not in self.gene_ids: continue if not self.testMode and int(tax_num) not in self.tax_ids: continue line_counter += 1 gene_id = ':'.join(('NCBIGene', gene_num)) discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num)) # add the two genes if self.class_or_indiv.get(gene_id) == 'C': gu.addClassToGraph(g, gene_id, None) gu.addClassToGraph( g, discontinued_gene_id, discontinued_symbol) # add the new gene id to replace the old gene id gu.addDeprecatedClass(g, discontinued_gene_id, [gene_id]) else: gu.addIndividualToGraph(g, gene_id, None) gu.addIndividualToGraph( g, discontinued_gene_id, discontinued_symbol) gu.addDeprecatedIndividual( g, discontinued_gene_id, [gene_id]) # also add the old symbol as a synonym of the new gene gu.addSynonym(g, gene_id, discontinued_symbol) if (not self.testMode) and\ (limit is not None and line_counter > limit): break return
def _get_gene_info(self, limit): """ Currently loops through the gene_info file and creates the genes as classes, typed with SO. It will add their label, any alternate labels as synonyms, alternate ids as equivlaent classes. HPRDs get added as protein products. The chromosome and chr band get added as blank node regions, and the gene is faldo:located on the chr band. :param limit: :return: """ gu = GraphUtils(curie_map.get()) if self.testMode: g = self.testgraph else: g = self.graph geno = Genotype(g) # not unzipping the file logger.info("Processing Gene records") line_counter = 0 myfile = '/'.join((self.rawdir, self.files['gene_info']['file'])) logger.info("FILE: %s", myfile) # Add taxa and genome classes for those in our filter for tax_num in self.tax_ids: tax_id = ':'.join(('NCBITaxon', str(tax_num))) # tax label can get added elsewhere geno.addGenome(tax_id, str(tax_num)) # label added elsewhere gu.addClassToGraph(g, tax_id, None) with gzip.open(myfile, 'rb') as f: for line in f: # skip comments line = line.decode().strip() if re.match(r'^#', line): continue (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom, map_loc, desc, gtype, authority_symbol, name, nomenclature_status, other_designations, modification_date) = line.split('\t') # ##set filter=None in init if you don't want to have a filter # if self.filter is not None: # if ((self.filter == 'taxids' and \ # (int(tax_num) not in self.tax_ids)) # or (self.filter == 'geneids' and \ # (int(gene_num) not in self.gene_ids))): # continue # #### end filter if self.testMode and int(gene_num) not in self.gene_ids: continue if not self.testMode and int(tax_num) not in self.tax_ids: continue line_counter += 1 gene_id = ':'.join(('NCBIGene', gene_num)) tax_id = ':'.join(('NCBITaxon', tax_num)) gene_type_id = self.map_type_of_gene(gtype.strip()) if symbol == 'NEWENTRY': label = None else: label = symbol # sequence feature, not a gene if gene_type_id == 'SO:0000110': self.class_or_indiv[gene_id] = 'I' else: self.class_or_indiv[gene_id] = 'C' if not self.testMode and \ limit is not None and line_counter > limit: continue if self.class_or_indiv[gene_id] == 'C': gu.addClassToGraph(g, gene_id, label, gene_type_id, desc) # NCBI will be the default leader, # so we will not add the leader designation here. else: gu.addIndividualToGraph( g, gene_id, label, gene_type_id, desc) # in this case, they aren't genes. # so we want someone else to be the leader. if name != '-': gu.addSynonym(g, gene_id, name) if synonyms.strip() != '-': for s in synonyms.split('|'): gu.addSynonym( g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym']) if other_designations.strip() != '-': for s in other_designations.split('|'): gu.addSynonym( g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym']) # deal with the xrefs # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696 if xrefs.strip() != '-': for r in xrefs.strip().split('|'): fixedr = self._cleanup_id(r) if fixedr is not None and fixedr.strip() != '': if re.match(r'HPRD', fixedr): # proteins are not == genes. gu.addTriple( g, gene_id, self.properties[ 'has_gene_product'], fixedr) else: # skip some of these for now if fixedr.split(':')[0] not in [ 'Vega', 'IMGT/GENE-DB']: if self.class_or_indiv.get(gene_id) == 'C': gu.addEquivalentClass( g, gene_id, fixedr) else: gu.addSameIndividual( g, gene_id, fixedr) # edge cases of id | symbol | chr | map_loc: # 263 AMD1P2 X|Y with Xq28 and Yq12 # 438 ASMT X|Y with Xp22.3 or Yp11.3 # in PAR # no idea why there's two bands listed - possibly 2 assemblies # 419 ART3 4 with 4q21.1|4p15.1-p14 # 28227 PPP2R3B X|Y Xp22.33; Yp11.3 # in PAR # this is of "unknown" type == susceptibility # 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 # unlocated scaffold # 101928066 LOC101928066 1|Un -\ # mouse --> 2C3 # 11435 Chrna1 2 2 C3|2 43.76 cM # mouse --> 11B1.1 # 11548 Adra1b 11 11 B1.1|11 25.81 cM # 11717 Ampd3 7 7 57.85 cM|7 E2-E3 # mouse # 14421 B4galnt1 10 10 D3|10 74.5 cM # mouse # 323212 wu:fb92e12 19|20 - # fish # 323368 ints10 6|18 - # fish # 323666 wu:fc06e02 11|23 - # fish # feel that the chr placement can't be trusted in this table # when there is > 1 listed # with the exception of human X|Y, # we will only take those that align to one chr # FIXME remove the chr mapping below # when we pull in the genomic coords if str(chrom) != '-' and str(chrom) != '': if re.search(r'\|', str(chrom)) and \ str(chrom) not in ['X|Y', 'X; Y']: # means that there's uncertainty in the mapping. # so skip it # TODO we'll need to figure out how to deal with # >1 loc mapping logger.info( '%s is non-uniquely mapped to %s.' + ' Skipping for now.', gene_id, str(chr)) continue # X|Y Xp22.33;Yp11.3 # if(not re.match( # r'(\d+|(MT)|[XY]|(Un)$',str(chr).strip())): # print('odd chr=',str(chr)) if str(chrom) == 'X; Y': chrom = 'X|Y' # rewrite the PAR regions for processing # do this in a loop to allow PAR regions like X|Y for c in re.split(r'\|', str(chrom)): # assume that the chromosome label is added elsewhere geno.addChromosomeClass(c, tax_id, None) mychrom = makeChromID(c, tax_num, 'CHR') # temporarily use taxnum for the disambiguating label mychrom_syn = makeChromLabel(c, tax_num) gu.addSynonym(g, mychrom, mychrom_syn) band_match = re.match( r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc) if band_match is not None and \ len(band_match.groups()) > 0: # if tax_num != '9606': # continue # this matches the regular kind of chrs, # so make that kind of band # not sure why this matches? # chrX|Y or 10090chr12|Un" # TODO we probably need a different regex # per organism # the maploc_id already has the numeric chromosome # in it, strip it first bid = re.sub(r'^'+c, '', map_loc) # the generic location (no coordinates) maploc_id = makeChromID(c+bid, tax_num, 'CHR') # print(map_loc,'-->',bid,'-->',maploc_id) # Assume it's type will be added elsewhere band = Feature(maploc_id, None, None) band.addFeatureToGraph(g) # add the band as the containing feature gu.addTriple( g, gene_id, Feature.object_properties['is_subsequence_of'], maploc_id) else: # TODO handle these cases: examples are: # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24, # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1, # 12cen-q21,22q13.3|22q13.3 logger.debug( 'not regular band pattern for %s: %s', gene_id, map_loc) # add the gene as a subsequence of the chromosome gu.addTriple( g, gene_id, Feature.object_properties['is_subsequence_of'], mychrom) geno.addTaxon(tax_id, gene_id) gu.loadProperties(g, Feature.object_properties, gu.OBJPROP) gu.loadProperties(g, Feature.data_properties, gu.DATAPROP) gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP) gu.loadAllProperties(g) return
def _process_phenotype_data(self, limit): """ NOTE: If a Strain carries more than one mutation, then each Mutation description, i.e., the set: ( Mutation Type - Chromosome - Gene Symbol - Gene Name - Allele Symbol - Allele Name) will require a separate line. Note that MMRRC curates phenotypes to alleles, even though they distribute only one file with the phenotypes appearing to be associated with a strain. So, here we process the allele-to-phenotype relationships separately from the strain-to-allele relationships. :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 gu = GraphUtils(curie_map.get()) fname = '/'.join((self.rawdir, self.files['catalog']['file'])) self.strain_hash = {} self.id_label_hash = {} genes_with_no_ids = set() stem_cell_class = 'CL:0000034' mouse_taxon = 'NCBITaxon:10090' geno = Genotype(g) with open(fname, 'r', encoding="utf8") as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') for row in filereader: line_counter += 1 # skip the first 3 lines which are header, etc. if line_counter < 4: continue (strain_id, strain_label, strain_type_symbol, strain_state, mgi_allele_id, mgi_allele_symbol, mgi_allele_name, mutation_type, chrom, mgi_gene_id, mgi_gene_symbol, mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums, research_areas) = row if self.testMode and (strain_id not in self.test_ids): continue # strip off stuff after the dash - # is the holding center important? # MMRRC:00001-UNC --> MMRRC:00001 strain_id = re.sub(r'-\w+$', '', strain_id) self.id_label_hash[strain_id] = strain_label # get the variant or gene to save for later building of # the genotype if strain_id not in self.strain_hash: self.strain_hash[strain_id] = {'variants': set(), 'genes': set()} # clean up the bad one if mgi_allele_id == 'multiple mutation': logger.error("Erroneous gene id: %s", mgi_allele_id) mgi_allele_id = '' if mgi_allele_id != '': self.strain_hash[strain_id]['variants'].add(mgi_allele_id) self.id_label_hash[mgi_allele_id] = mgi_allele_symbol # use the following if needing to add the # sequence alteration types # var_type = # self._get_variant_type_from_abbrev(mutation_type) # make a sequence alteration for this variant locus, # and link the variation type to it # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA' # if self.nobnodes: # sa_id = ':'+sa_id # gu.addIndividualToGraph(g, sa_id, None, var_type) # geno.addSequenceAlterationToVariantLocus(sa_id, # mgi_allele_id) # scrub out any spaces mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id) if mgi_gene_id.strip() != '': if re.match(r'Gene\s*ID:', mgi_gene_id, re.I): mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:', mgi_gene_id) elif not re.match(r'MGI', mgi_gene_id): logger.info("Gene id not recognized: %s", mgi_gene_id) if re.match(r'\d+$', mgi_gene_id): # assume that if it's all numbers, then it's MGI mgi_gene_id = 'MGI:'+str(mgi_gene_id) logger.info("Assuming numerics are MGI.") self.strain_hash[strain_id]['genes'].add(mgi_gene_id) self.id_label_hash[mgi_gene_id] = mgi_gene_symbol # catch some errors - # some things have gene labels, but no identifiers - report if mgi_gene_symbol.strip() != '' and mgi_gene_id == '': logger.error( "Gene label with no identifier for strain %s: %s", strain_id, mgi_gene_symbol) genes_with_no_ids.add(mgi_gene_symbol.strip()) # make a temp id for genes that aren't identified # tmp_gene_id = '_'+mgi_gene_symbol # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol # self.strain_hash[strain_id]['genes'].add(tmp_gene_id) # split apart the mp ids # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ... # mp_ids are now a comma delimited list # with MP terms in brackets phenotype_ids = [] if mp_ids != '': for i in re.split(r',', mp_ids): i = i.strip() mps = re.search(r'\[(.*)\]', i) if mps is not None: mp_id = mps.group(1).strip() phenotype_ids.append(mp_id) # pubmed ids are space delimited pubmed_ids = [] if pubmed_nums.strip() != '': for i in re.split(r'\s+', pubmed_nums): pmid = 'PMID:'+i.strip() pubmed_ids.append(pmid) r = Reference(pmid, Reference.ref_types['journal_article']) r.addRefToGraph(g) # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001 # is a good example of 4 genotype parts gu.addClassToGraph(g, mouse_taxon, None) if research_areas.strip() == '': research_areas = None else: research_areas = 'Research Areas: '+research_areas strain_type = mouse_taxon if strain_state == 'ES': strain_type = stem_cell_class gu.addIndividualToGraph( g, strain_id, strain_label, strain_type, research_areas) # an inst of mouse?? gu.makeLeader(g, strain_id) # phenotypes are associated with the alleles for pid in phenotype_ids: # assume the phenotype label is in the ontology gu.addClassToGraph(g, pid, None) if mgi_allele_id is not None and mgi_allele_id != '': assoc = G2PAssoc(self.name, mgi_allele_id, pid, gu.object_properties['has_phenotype']) for p in pubmed_ids: assoc.add_source(p) assoc.add_association_to_graph(g) else: logger.info("Phenotypes and no allele for %s", strain_id) if not self.testMode and ( limit is not None and line_counter > limit): break # now that we've collected all of the variant information, build it # we don't know their zygosities for s in self.strain_hash: h = self.strain_hash.get(s) variants = h['variants'] genes = h['genes'] vl_set = set() # make variant loci for each gene if len(variants) > 0: for v in variants: vl_id = v vl_symbol = self.id_label_hash[vl_id] geno.addAllele(vl_id, vl_symbol, geno.genoparts['variant_locus']) vl_set.add(vl_id) if len(variants) == 1 and len(genes) == 1: for gene in genes: geno.addAlleleOfGene(vl_id, gene) else: geno.addAllele(vl_id, vl_symbol) else: # len(vars) == 0 # it's just anonymous variants in some gene for gene in genes: vl_id = '_'+gene+'-VL' vl_id = re.sub(r':', '', vl_id) if self.nobnodes: vl_id = ':'+vl_id vl_symbol = self.id_label_hash[gene]+'<?>' self.id_label_hash[vl_id] = vl_symbol geno.addAllele(vl_id, vl_symbol, geno.genoparts['variant_locus']) geno.addGene(gene, self.id_label_hash[gene]) geno.addAlleleOfGene(vl_id, gene) vl_set.add(vl_id) # make the vslcs vl_list = sorted(vl_set) vslc_list = [] for vl in vl_list: # for unknown zygosity vslc_id = '_'+re.sub(r'^_', '', vl)+'U' vslc_id = re.sub(r':', '', vslc_id) if self.nobnodes: vslc_id = ':' + vslc_id vslc_label = self.id_label_hash[vl] + '/?' self.id_label_hash[vslc_id] = vslc_label vslc_list.append(vslc_id) geno.addPartsToVSLC( vslc_id, vl, None, geno.zygosity['indeterminate'], geno.object_properties['has_alternate_part'], None) gu.addIndividualToGraph( g, vslc_id, vslc_label, geno.genoparts['variant_single_locus_complement']) if len(vslc_list) > 0: if len(vslc_list) > 1: gvc_id = '-'.join(vslc_list) gvc_id = re.sub(r':', '', gvc_id) if self.nobnodes: gvc_id = ':'+gvc_id gvc_label = \ '; '.join(self.id_label_hash[v] for v in vslc_list) gu.addIndividualToGraph( g, gvc_id, gvc_label, geno.genoparts['genomic_variation_complement']) for vslc_id in vslc_list: geno.addVSLCtoParent(vslc_id, gvc_id) else: # the GVC == VSLC, so don't have to make an extra piece gvc_id = vslc_list.pop() gvc_label = self.id_label_hash[gvc_id] genotype_label = gvc_label + ' [n.s.]' bkgd_id = \ '_' + re.sub(r':', '', '-'.join( (geno.genoparts['unspecified_genomic_background'], s))) genotype_id = '-'.join((gvc_id, bkgd_id)) if self.nobnodes: bkgd_id = ':'+bkgd_id geno.addTaxon(mouse_taxon, bkgd_id) geno.addGenomicBackground( bkgd_id, 'unspecified ('+s+')', geno.genoparts['unspecified_genomic_background'], "A placeholder for the " + "unspecified genetic background for "+s) geno.addGenomicBackgroundToGenotype( bkgd_id, genotype_id, geno.genoparts['unspecified_genomic_background']) geno.addParts( gvc_id, genotype_id, geno.object_properties['has_alternate_part']) geno.addGenotype(genotype_id, genotype_label) gu.addTriple( g, s, geno.object_properties['has_genotype'], genotype_id) else: # logger.debug( # "Strain %s is not making a proper genotype.", s) pass gu.loadProperties( g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP) gu.loadProperties( g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP) gu.loadProperties( g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP) gu.loadAllProperties(g) logger.warning( "The following gene symbols did not list identifiers: %s", str(sorted(list(genes_with_no_ids)))) return
def _process_QTLs_genetic_location(self, raw, taxon_id, common_name, limit=None): """ This function processes Triples created: :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 geno = Genotype(g) gu = GraphUtils(curie_map.get()) eco_id = "ECO:0000061" # Quantitative Trait Analysis Evidence logger.info("Processing genetic location for %s", taxon_id) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model, test_base, sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio, trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row if self.testMode and int(qtl_id) not in self.test_ids: continue qtl_id = 'AQTL:'+qtl_id trait_id = 'AQTLTrait:'+trait_id # Add QTL to graph f = Feature(qtl_id, qtl_symbol, geno.genoparts['QTL']) f.addTaxonToFeature(g, taxon_id) # deal with the chromosome chrom_id = makeChromID(chromosome, taxon_id, 'CHR') # add a version of the chromosome which is defined as the genetic map build_id = 'MONARCH:'+common_name.strip()+'-linkage' build_label = common_name+' genetic map' geno.addReferenceGenome(build_id, build_label, taxon_id) chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH') geno.addChromosomeInstance(chromosome, build_id, build_label, chrom_id) start = stop = None if re.search('-', range_cm): range_parts = re.split('-', range_cm) # check for poorly formed ranges if len(range_parts) == 2 and range_parts[0] != '' and range_parts[1] != '': (start, stop) = [int(float(x.strip())) for x in re.split('-', range_cm)] else: logger.info("There's a cM range we can't handle for QTL %s: %s", qtl_id, range_cm) elif position_cm != '': start = stop = int(float(position_cm)) # FIXME remove converion to int for start/stop when schema can handle floats # add in the genetic location based on the range f.addFeatureStartLocation(start, chrom_in_build_id, None, [Feature.types['FuzzyPosition']]) f.addFeatureEndLocation(stop, chrom_in_build_id, None, [Feature.types['FuzzyPosition']]) f.addFeatureToGraph(g) # sometimes there's a peak marker, like a rsid. we want to add that as a variant of the gene, # and xref it to the qtl. dbsnp_id = None if peak_mark != '' and peak_mark != '.' and re.match('rs', peak_mark.strip()): dbsnp_id = 'dbSNP:'+peak_mark.strip() gu.addIndividualToGraph(g, dbsnp_id, None, geno.genoparts['sequence_alteration']) gu.addXref(g, qtl_id, dbsnp_id) if gene_id is not None and gene_id != '' and gene_id != '.': if gene_id_src == 'NCBIgene' or gene_id_src == '': # we assume if no src is provided, it's NCBI gene_id = 'NCBIGene:'+gene_id.strip() geno.addGene(gene_id, None) # we will expect that these labels provided elsewhere geno.addAlleleOfGene(qtl_id, gene_id, geno.object_properties['feature_to_gene_relation']) # FIXME what is the right relationship here? if dbsnp_id is not None: # add the rsid as a seq alt of the gene_id vl_id = '_' + re.sub(':', '', gene_id) + '-' + peak_mark if self.nobnodes: vl_id = ':' + vl_id geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id) geno.addAlleleOfGene(vl_id, gene_id) # add the trait gu.addClassToGraph(g, trait_id, trait_name) # Add publication r = None if re.match('ISU.*', pubmed_id): pub_id = 'AQTLPub:'+pubmed_id.strip() r = Reference(pub_id) elif pubmed_id != '': pub_id = 'PMID:'+pubmed_id.strip() r = Reference(pub_id, Reference.ref_types['journal_article']) if r is not None: r.addRefToGraph(g) # make the association to the QTL assoc = G2PAssoc(self.name, qtl_id, trait_id, gu.object_properties['is_marker_for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id as evidence # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': score = float(re.sub('<', '', p_values)) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph(g) # make the association to the dbsnp_id, if found if dbsnp_id is not None: # make the association to the dbsnp_id assoc = G2PAssoc(self.name, dbsnp_id, trait_id, gu.object_properties['is_marker_for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': score = float(re.sub('<', '', p_values)) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph(g) if not self.testMode and limit is not None and line_counter > limit: break logger.info("Done with QTL genetic info") return
class OMIA(Source): """ This is the parser for the [Online Mendelian Inheritance in Animals (OMIA)](http://www.http://omia.angis.org.au), from which we process inherited disorders, other (single-locus) traits, and genes in >200 animal species (other than human and mouse and rats). We generate the omia graph to include the following information: * genes * animal taxonomy, and breeds as instances of those taxa (breeds are akin to "strains" in other taxa) * animal diseases, along with species-specific subtypes of those diseases * publications (and their mapping to PMIDs, if available) * gene-to-phenotype associations (via an anonymous variant-locus * breed-to-phenotype associations We make links between OMIA and OMIM in two ways: 1. mappings between OMIA and OMIM are created as OMIA --> hasdbXref OMIM 2. mappings between a breed and OMIA disease are created to be a model for the mapped OMIM disease, IF AND ONLY IF it is a 1:1 mapping. there are some 1:many mappings, and these often happen if the OMIM item is a gene. Because many of these species are not covered in the PANTHER orthology datafiles, we also pull any orthology relationships from the gene_group files from NCBI. """ files = { 'data': { 'file': 'omia.xml.gz', 'url': 'http://omia.angis.org.au/dumps/omia.xml.gz'}, } def __init__(self): Source.__init__(self, 'omia') self.load_bindings() self.dataset = Dataset( 'omia', 'Online Mendelian Inheritance in Animals', 'http://omia.angis.org.au', None, None, 'http://sydney.edu.au/disclaimer.shtml') self.id_hash = { 'article': {}, 'phene': {}, 'breed': {}, 'taxon': {}, 'gene': {} } self.label_hash = {} self.gu = GraphUtils(curie_map.get()) # used to store the omia to omim phene mappings self.omia_omim_map = {} # used to store the unique genes that have phenes # (for fetching orthology) self.annotated_genes = set() self.test_ids = { 'disease': [ 'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201', 'OMIA:000810', 'OMIA:001400'], 'gene': [ 492297, 434, 492296, 3430235, 200685834, 394659996, 200685845, 28713538, 291822383], 'taxon': [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825], # to be filled in during parsing of breed table # for lookup by breed-associations 'breed': [] } # to store a map of omia ids and any molecular info # to write a report for curation self.stored_omia_mol_gen = {} self.g = self.graph self.geno = Genotype(self.g) return def fetch(self, is_dl_forced=False): """ :param is_dl_forced: :return: """ self.get_files(is_dl_forced) ncbi = NCBIGene() # ncbi.fetch() gene_group = ncbi.files['gene_group'] self.fetch_from_url( gene_group['url'], '/'.join((ncbi.rawdir, gene_group['file'])), False) return def parse(self, limit=None): # names of tables to iterate - probably don't need all these: # Article_Breed, Article_Keyword, Article_Gene, Article_Keyword, # Article_People, Article_Phene, Articles, Breed, Breed_Phene, # Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords, # Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People, # Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms self.scrub() if limit is not None: logger.info("Only parsing first %d rows", limit) logger.info("Parsing files...") if self.testOnly: self.testMode = True if self.testMode: self.g = self.testgraph else: self.g = self.graph self.geno = Genotype(self.g) # we do three passes through the file # first process species (two others reference this one) self.process_species(limit) # then, process the breeds, genes, articles, and other static stuff self.process_classes(limit) # next process the association data self.process_associations(limit) # process the vertebrate orthology for genes # that are annotated with phenotypes ncbi = NCBIGene() ncbi.add_orthologs_by_gene_group(self.g, self.annotated_genes) self.load_core_bindings() self.load_bindings() logger.info("Done parsing.") self.write_molgen_report() return def scrub(self): """ The XML file seems to have mixed-encoding; we scrub out the control characters from the file for processing. :return: """ logger.info( "Scrubbing out the nasty characters that break our parser.") myfile = '/'.join((self.rawdir, self.files['data']['file'])) tmpfile = '/'.join((self.rawdir, self.files['data']['file']+'.tmp.gz')) t = gzip.open(tmpfile, 'wb') du = DipperUtil() with gzip.open(myfile, 'rb') as f: filereader = io.TextIOWrapper(f, newline="") for l in filereader: l = du.remove_control_characters(l) + '\n' t.write(l.encode('utf-8')) t.close() # move the temp file logger.info("Replacing the original data with the scrubbed file.") shutil.move(tmpfile, myfile) return # ###################### XML LOOPING FUNCTIONS ################## def process_species(self, limit): """ Loop through the xml file and process the species. We add elements to the graph, and store the id-to-label in the label_hash dict. :param limit: :return: """ myfile = '/'.join((self.rawdir, self.files['data']['file'])) f = gzip.open(myfile, 'rb') filereader = io.TextIOWrapper(f, newline="") filereader.readline() # remove the xml declaration line for event, elem in ET.iterparse(filereader): # Species ids are == genbank species ids! self.process_xml_table( elem, 'Species_gb', self._process_species_table_row, limit) f.close() return def process_classes(self, limit): """ Loop through the xml file and process the articles, breed, genes, phenes, and phenotype-grouping classes. We add elements to the graph, and store the id-to-label in the label_hash dict, along with the internal key-to-external id in the id_hash dict. The latter are referenced in the association processing functions. :param limit: :return: """ myfile = '/'.join((self.rawdir, self.files['data']['file'])) f = gzip.open(myfile, 'rb') filereader = io.TextIOWrapper(f, newline="") filereader.readline() # remove the xml declaration line parser = ET.XMLParser(encoding='utf-8') for event, elem in ET.iterparse(filereader, parser=parser): self.process_xml_table( elem, 'Articles', self._process_article_row, limit) self.process_xml_table( elem, 'Breed', self._process_breed_row, limit) self.process_xml_table( elem, 'Genes_gb', self._process_gene_row, limit) self.process_xml_table( elem, 'OMIA_Group', self._process_omia_group_row, limit) self.process_xml_table( elem, 'Phene', self._process_phene_row, limit) self.process_xml_table( elem, 'Omim_Xref', self._process_omia_omim_map, limit) f.close() # post-process the omia-omim associations to filter out the genes # (keep only phenotypes/diseases) self.clean_up_omim_genes() return def process_associations(self, limit): """ Loop through the xml file and process the article-breed, article-phene, breed-phene, phene-gene associations, and the external links to LIDA. :param limit: :return: """ myfile = '/'.join((self.rawdir, self.files['data']['file'])) f = gzip.open(myfile, 'rb') filereader = io.TextIOWrapper(f, newline="") filereader.readline() # remove the xml declaration line for event, elem in ET.iterparse(filereader): self.process_xml_table( elem, 'Article_Breed', self._process_article_breed_row, limit) self.process_xml_table( elem, 'Article_Phene', self._process_article_phene_row, limit) self.process_xml_table( elem, 'Breed_Phene', self._process_breed_phene_row, limit) self.process_xml_table( elem, 'Lida_Links', self._process_lida_links_row, limit) self.process_xml_table( elem, 'Phene_Gene', self._process_phene_gene_row, limit) self.process_xml_table( elem, 'Group_MPO', self._process_group_mpo_row, limit) f.close() return # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################ def _process_species_table_row(self, row): # gb_species_id, sci_name, com_name, added_by, date_modified tax_id = 'NCBITaxon:'+str(row['gb_species_id']) sci_name = row['sci_name'] com_name = row['com_name'] if self.testMode and \ (int(row['gb_species_id']) not in self.test_ids['taxon']): return self.gu.addClassToGraph(self.g, tax_id, sci_name) if com_name != '': self.gu.addSynonym(self.g, tax_id, com_name) self.label_hash[tax_id] = com_name # for lookup later else: self.label_hash[tax_id] = sci_name return def _process_breed_row(self, row): # in test mode, keep all breeds of our test species if self.testMode and \ (int(row['gb_species_id']) not in self.test_ids['taxon']): return # save the breed keys in the test_ids for later processing self.test_ids['breed'] += [int(row['breed_id'])] breed_id = self.make_breed_id(row['breed_id']) self.id_hash['breed'][row['breed_id']] = breed_id tax_id = 'NCBITaxon:'+str(row['gb_species_id']) breed_label = row['breed_name'] species_label = self.label_hash.get(tax_id) if species_label is not None: breed_label = breed_label + ' ('+species_label+')' self.gu.addIndividualToGraph(self.g, breed_id, breed_label, tax_id) self.label_hash[breed_id] = breed_label return def _process_phene_row(self, row): phenotype_id = None sp_phene_label = row['phene_name'] if sp_phene_label == '': sp_phene_label = None if 'omia_id' not in row: logger.info("omia_id not present for %s", row['phene_id']) omia_id = self._make_internal_id('phene', phenotype_id) else: omia_id = 'OMIA:'+str(row['omia_id']) if self.testMode and not\ (int(row['gb_species_id']) in self.test_ids['taxon'] and omia_id in self.test_ids['disease']): return # add to internal hash store for later lookup self.id_hash['phene'][row['phene_id']] = omia_id descr = row['summary'] if descr == '': descr = None # omia label omia_label = self.label_hash.get(omia_id) # add the species-specific subclass (TODO please review this choice) gb_species_id = row['gb_species_id'] if gb_species_id != '': sp_phene_id = '-'.join((omia_id, gb_species_id)) else: logger.error( "No species supplied in species-specific phene table for %s", omia_id) return species_id = 'NCBITaxon:'+str(gb_species_id) # use this instead species_label = self.label_hash.get('NCBITaxon:'+gb_species_id) if sp_phene_label is None and \ omia_label is not None and species_label is not None: sp_phene_label = ' '.join((omia_label, 'in', species_label)) self.gu.addClassToGraph( self.g, sp_phene_id, sp_phene_label, omia_id, descr) # add to internal hash store for later lookup self.id_hash['phene'][row['phene_id']] = sp_phene_id self.label_hash[sp_phene_id] = sp_phene_label # add each of the following descriptions, # if they are populated, with a tag at the end. for item in [ 'clin_feat', 'history', 'pathology', 'mol_gen', 'control']: if row[item] is not None and row[item] != '': self.gu.addDescription( self.g, sp_phene_id, row[item] + ' ['+item+']') # if row['symbol'] is not None: # species-specific # CHECK ME - sometimes spaces or gene labels # gu.addSynonym(g, sp_phene, row['symbol']) self.gu.addOWLPropertyClassRestriction( self.g, sp_phene_id, self.gu.object_properties['in_taxon'], species_id) # add inheritance as an association inheritance_id = self._map_inheritance_term_id(row['inherit']) if inheritance_id is not None: assoc = DispositionAssoc(self.name, sp_phene_id, inheritance_id) assoc.add_association_to_graph(self.g) if row['characterised'] == 'Yes': self.stored_omia_mol_gen[omia_id] = { 'mol_gen': row['mol_gen'], 'map_info': row['map_info'], 'species': row['gb_species_id']} return def write_molgen_report(self): import csv logger.info("Writing G2P report for OMIA") f = '/'.join((self.outdir, 'omia_molgen_report.txt')) with open(f, 'w', newline='\n') as csvfile: writer = csv.writer(csvfile, delimiter='\t') # write header h = ['omia_id', 'molecular_description', 'mapping_info', 'species'] writer.writerow(h) for phene in self.stored_omia_mol_gen: writer.writerow((str(phene), self.stored_omia_mol_gen[phene]['mol_gen'], self.stored_omia_mol_gen[phene]['map_info'], self.stored_omia_mol_gen[phene]['species'])) logger.info( "Wrote %d potential G2P descriptions for curation to %s", len(self.stored_omia_mol_gen), f) return def _process_article_row(self, row): # don't bother in test mode if self.testMode: return iarticle_id = self._make_internal_id('article', row['article_id']) self.id_hash['article'][row['article_id']] = iarticle_id rtype = None if row['journal'] != '': rtype = Reference.ref_types['journal_article'] r = Reference(iarticle_id, rtype) if row['title'] is not None: r.setTitle(row['title'].strip()) if row['year'] is not None: r.setYear(row['year']) r.addRefToGraph(self.g) if row['pubmed_id'] is not None: pmid = 'PMID:'+str(row['pubmed_id']) self.id_hash['article'][row['article_id']] = pmid self.gu.addSameIndividual(self.g, iarticle_id, pmid) self.gu.addComment(self.g, pmid, iarticle_id) return def _process_omia_group_row(self, row): omia_id = 'OMIA:'+row['omia_id'] if self.testMode and omia_id not in self.test_ids['disease']: return group_name = row['group_name'] group_summary = row['group_summary'] disease_id = None group_category = row.get('group_category') disease_id = \ self.map_omia_group_category_to_ontology_id(group_category) if disease_id is not None: self.gu.addClassToGraph(self.g, disease_id, None) if disease_id == 'MP:0008762': # embryonic lethal # add this as a phenotype association # add embryonic onset assoc = D2PAssoc(self.name, omia_id, disease_id) assoc.add_association_to_graph(self.g) disease_id = None else: logger.info( "No disease superclass defined for %s: %s", omia_id, group_name) # default to general disease FIXME this may not be desired disease_id = 'DOID:4' if group_summary == '': group_summary = None if group_name == '': group_name = None self.gu.addClassToGraph( self.g, omia_id, group_name, disease_id, group_summary) self.label_hash[omia_id] = group_name return def _process_gene_row(self, row): if self.testMode and row['gene_id'] not in self.test_ids['gene']: return gene_id = 'NCBIGene:'+str(row['gene_id']) self.id_hash['gene'][row['gene_id']] = gene_id gene_label = row['symbol'] self.label_hash[gene_id] = gene_label tax_id = 'NCBITaxon:'+str(row['gb_species_id']) gene_type_id = NCBIGene.map_type_of_gene(row['gene_type']) self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id) self.geno.addTaxon(tax_id, gene_id) return def _process_article_breed_row(self, row): # article_id, breed_id, added_by # don't bother putting these into the test... too many! # and int(row['breed_id']) not in self.test_ids['breed']: if self.testMode: return article_id = self.id_hash['article'].get(row['article_id']) breed_id = self.id_hash['breed'].get(row['breed_id']) # there's some missing data (article=6038). in that case skip if article_id is not None: self.gu.addTriple( self.g, article_id, self.gu.object_properties['is_about'], breed_id) else: logger.warning("Missing article key %s", str(row['article_id'])) return def _process_article_phene_row(self, row): """ Linking articles to species-specific phenes. :param row: :return: """ # article_id, phene_id, added_by # look up the article in the hashmap phenotype_id = self.id_hash['phene'].get(row['phene_id']) article_id = self.id_hash['article'].get(row['article_id']) omia_id = self._get_omia_id_from_phene_id(phenotype_id) if self.testMode and omia_id not in self.test_ids['disease'] \ or phenotype_id is None or article_id is None: return # make a triple, where the article is about the phenotype self.gu.addTriple( self.g, article_id, self.gu.object_properties['is_about'], phenotype_id) return def _process_breed_phene_row(self, row): # Linking disorders/characteristic to breeds # breed_id, phene_id, added_by breed_id = self.id_hash['breed'].get(row['breed_id']) phene_id = self.id_hash['phene'].get(row['phene_id']) # get the omia id omia_id = self._get_omia_id_from_phene_id(phene_id) if (self.testMode and not ( omia_id in self.test_ids['disease'] and int(row['breed_id']) in self.test_ids['breed']) or breed_id is None or phene_id is None): return # FIXME we want a different relationship here assoc = G2PAssoc( self.name, breed_id, phene_id, self.gu.object_properties['has_phenotype']) assoc.add_association_to_graph(self.g) # add that the breed is a model of the human disease # use the omia-omim mappings for this # we assume that we have already scrubbed out the genes # from the omim list, so we can make the model associations here omim_ids = self.omia_omim_map.get(omia_id) eco_id = "ECO:0000214" # biological aspect of descendant evidence if omim_ids is not None and len(omim_ids) > 0: if len(omim_ids) > 1: logger.info( "There's 1:many omia:omim mapping: %s, %s", omia_id, str(omim_ids)) for i in omim_ids: assoc = G2PAssoc( self.name, breed_id, i, self.gu.object_properties['model_of']) assoc.add_evidence(eco_id) assoc.add_association_to_graph(self.g) aid = assoc.get_association_id() breed_label = self.label_hash.get(breed_id) if breed_label is None: breed_label = "this breed" m = re.search(r'\((.*)\)', breed_label) if m: sp_label = m.group(1) else: sp_label = '' phene_label = self.label_hash.get(phene_id) if phene_label is None: phene_label = "phenotype" elif phene_label.endswith(sp_label): # some of the labels we made already include the species; # remove it to make a cleaner desc phene_label = re.sub(r' in '+sp_label, '', phene_label) desc = ' '.join( ("High incidence of", phene_label, "in", breed_label, "suggests it to be a model of disease", i + ".")) self.gu.addDescription(self.g, aid, desc) return def _process_lida_links_row(self, row): # lidaurl, omia_id, added_by omia_id = 'OMIA:'+row['omia_id'] lidaurl = row['lidaurl'] if self.testMode and omia_id not in self.test_ids['disease']: return self.gu.addXref(self.g, omia_id, lidaurl, True) return def _process_phene_gene_row(self, row): gene_id = self.id_hash['gene'].get(row['gene_id']) phene_id = self.id_hash['phene'].get(row['phene_id']) omia_id = self._get_omia_id_from_phene_id(phene_id) if self.testMode and not ( omia_id in self.test_ids['disease'] and row['gene_id'] in self.test_ids['gene']) or\ gene_id is None or phene_id is None: return # occasionally some phenes are missing! (ex: 406) if phene_id is None: logger.warning("Phene id %s is missing", str(row['phene_id'])) return gene_label = self.label_hash[gene_id] # some variant of gene_id has phenotype d vl = '_'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL' if self.nobnodes: vl = ':'+vl self.geno.addAllele(vl, 'some variant of ' + gene_label) self.geno.addAlleleOfGene(vl, gene_id) assoc = G2PAssoc(self.name, vl, phene_id) assoc.add_association_to_graph(self.g) # add the gene id to the set of annotated genes # for later lookup by orthology self.annotated_genes.add(gene_id) return def _process_omia_omim_map(self, row): """ Links OMIA groups to OMIM equivalents. :param row: :return: """ # omia_id, omim_id, added_by omia_id = 'OMIA:'+row['omia_id'] omim_id = 'OMIM:'+row['omim_id'] # also store this for use when we say that a given animal is # a model of a disease if omia_id not in self.omia_omim_map: self.omia_omim_map[omia_id] = set() self.omia_omim_map[omia_id].add(omim_id) if self.testMode and omia_id not in self.test_ids['disease']: return self.gu.addXref(self.g, omia_id, omim_id) return def map_omia_group_category_to_ontology_id(self, category_num): """ Using the category number in the OMIA_groups table, map them to a disease id. This may be superceeded by other MONDO methods. Platelet disorders will be more specific once https://github.com/obophenotype/human-disease-ontology/issues/46 is fulfilled. :param category_num: :return: """ category_map = { 1: 'DOID:0014667', # Inborn error of metabolism 2: 'MESH:D004392', # Dwarfism 3: 'DOID:1682', # congenital heart disease 4: 'DOID:74', # blood system disease 5: 'DOID:3211', # lysosomal storage disease 6: 'DOID:16', # integumentary system disease # --> retinal degeneration ==> OMIA:000830 7: 'DOID:8466', # progressive retinal atrophy 8: 'DOID:0050572', # Cone–rod dystrophy 9: 'MESH:C536122', # stationary night blindness 10: 'Orphanet:98553', # developmental retinal disorder 11: 'DOID:5679', # retinal disorder 12: 'Orphanet:90771', # Disorder of Sex Development # - what to do about this one? 13: 'MP:0008762', # embryonic lethal # - not sure what to do with this 14: None, # blood group # FIXME make me more specific 15: 'DOID:2218', # intrinsic platelet disorder # FIXME make me more specific 16: 'DOID:2218', # extrinsic platelet disorder 17: None # transgenic ??? } disease_id = None if category_num is not None and int(category_num) in category_map: disease_id = category_map.get(int(category_num)) logger.info( "Found %s for category %s", str(disease_id), str(category_num)) else: logger.info( "There's a group category I don't know anything about: %s", str(category_num)) return disease_id def _process_group_mpo_row(self, row): """ Make OMIA to MP associations :param row: :return: """ omia_id = 'OMIA:'+row['omia_id'] mpo_num = int(row['MPO_no']) mpo_id = 'MP:'+str(mpo_num).zfill(7) assoc = D2PAssoc(self.name, omia_id, mpo_id) assoc.add_association_to_graph(self.g) return def clean_up_omim_genes(self): omim = OMIM() # get all the omim ids allomimids = set() for omia in self.omia_omim_map: allomimids.update(self.omia_omim_map[omia]) entries_that_are_phenotypes = omim.process_entries( list(allomimids), filter_keep_phenotype_entry_ids, None, None) logger.info( "Filtered out %d/%d entries that are genes or features", len(allomimids)-len(entries_that_are_phenotypes), len(allomimids)) # now iterate again and remove those non-phenotype ids removed_count = 0 for omia in self.omia_omim_map: ids = self.omia_omim_map[omia] cleanids = set() for i in ids: if i in entries_that_are_phenotypes: cleanids.add(i) else: removed_count += 1 # keep track of how many we've removed self.omia_omim_map[omia] = cleanids logger.info( "Removed %d omim ids from the omia-to-omim map", removed_count) return def _make_internal_id(self, prefix, key): iid = '_'+''.join(('omia', prefix, 'key', str(key))) if self.nobnodes: iid = ':'+iid return iid def make_breed_id(self, key): breed_id = 'OMIA-breed:'+str(key) return breed_id @staticmethod def _get_omia_id_from_phene_id(phene_id): omia_id = None if phene_id is not None: m = re.match(r'OMIA:\d+', str(phene_id)) if m: omia_id = m.group(0) return omia_id @staticmethod def _map_inheritance_term_id(inheritance_symbol): inherit_map = { 'A': None, # Autosomal 'ACD': 'GENO:0000143', # Autosomal co-dominant 'ADV': None, # autosomal dominant with variable expressivity 'AID': 'GENO:0000259', # autosomal incompletely dominant 'ASD': 'GENO:0000145', # autosomal semi-dominant # autosomal recessive, semi-lethal # using generic autosomal recessive 'ASL': 'GENO:0000150', 'D': 'GENO:0000147', # autosomal dominant 'M': None, # multifactorial 'MAT': None, # Maternal # probably autosomal recessive # using generic autosomal recessive 'PR': 'GENO:0000150', 'R': 'GENO:0000150', # Autosomal Recessive # Recessive Embryonic Lethal # using plain recessive 'REL': 'GENO:0000148', # Autosomal Recessive Lethal # using plain autosomal recessive 'RL': 'GENO:0000150', 'S': 'GENO:0000146', # Sex-linked <--using allosomal dominant 'SLi': None, # Sex-limited 'UD': 'GENO:0000144', # Dominant 'X': None, # x-linked # HP:0001417 ? # X-linked Dominant <-- temp using allosomal dominant FIXME 'XLD': 'GENO:0000146', # X-linked Recessive <-- temp using allosomal recessive FIXME 'XLR': 'GENO:0000149', 'Y': None, # Y-linked 'Z': None, # Z-linked # Z-linked recessive <-- temp using allosomal recessive FIXME 'ZR': 'GENO:0000149', '999': None, # Z-linked incompletely dominant } inheritance_id = inherit_map.get(inheritance_symbol) if inheritance_id is None and inheritance_symbol is not None: logger.warning( "No inheritance id is mapped for %s", inheritance_symbol) return inheritance_id def getTestSuite(self): import unittest from tests.test_omia import OMIATestCase test_suite = unittest.TestLoader().loadTestsFromTestCase(OMIATestCase) return test_suite
def _get_variants(self, limit): """ Currently loops through the variant_summary file. :param limit: :return: """ gu = GraphUtils(curie_map.get()) if self.testMode: g = self.testgraph else: g = self.graph geno = Genotype(g) gu.loadAllProperties(g) f = Feature(None, None, None) f.loadAllProperties(g) gu.loadAllProperties(g) # add the taxon and the genome tax_num = '9606' # HARDCODE tax_id = 'NCBITaxon:'+tax_num tax_label = 'Human' gu.addClassToGraph(g, tax_id, None) geno.addGenome(tax_id, None) # label gets added elsewhere # not unzipping the file logger.info("Processing Variant records") line_counter = 0 myfile = '/'.join((self.rawdir, self.files['variant_summary']['file'])) with gzip.open(myfile, 'rb') as f: for line in f: # skip comments line = line.decode().strip() if re.match('^#', line): continue # AlleleID integer value as stored in the AlleleID field in ClinVar (//Measure/@ID in the XML) # Type character, the type of variation # Name character, the preferred name for the variation # GeneID integer, GeneID in NCBI's Gene database # GeneSymbol character, comma-separated list of GeneIDs overlapping the variation # ClinicalSignificance character, comma-separated list of values of clinical significance reported for this variation # for the mapping between the terms listed here and the integers in the .VCF files, see # http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/ # RS# (dbSNP) integer, rs# in dbSNP # nsv (dbVar) character, the NSV identifier for the region in dbVar # RCVaccession character, list of RCV accessions that report this variant # TestedInGTR character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR) # PhenotypeIDs character, list of db names and identifiers for phenotype(s) reported for this variant # Origin character, list of all allelic origins for this variation # Assembly character, name of the assembly on which locations are based # Chromosome character, chromosomal location # Start integer, starting location, in pter->qter orientation # Stop integer, end location, in pter->qter orientation # Cytogenetic character, ISCN band # ReviewStatus character, highest review status for reporting this measure. For the key to the terms, # and their relationship to the star graphics ClinVar displays on its web pages, # see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation # HGVS(c.) character, RefSeq cDNA-based HGVS expression # HGVS(p.) character, RefSeq protein-based HGVS expression # NumberSubmitters integer, number of submissions with this variant # LastEvaluated datetime, the latest time any submitter reported clinical significance # Guidelines character, ACMG only right now, for the reporting of incidental variation in a Gene # (NOTE: if ACMG, not a specific to the allele but to the Gene) # OtherIDs character, list of other identifiers or sources of information about this variant # VariantID integer, the value used to build the URL for the current default report, # e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/ # # a crude check that there's an expected number of cols. if not, error out because something changed. num_cols = len(line.split('\t')) expected_numcols = 28 if num_cols != expected_numcols: logger.error("Unexpected number of columns in raw file (%d actual vs %d expected)", num_cols, expected_numcols) (allele_num, allele_type, allele_name, gene_num, gene_symbol, clinical_significance, dbsnp_num, dbvar_num, rcv_nums, tested_in_gtr, phenotype_ids, origin, assembly, chr, start, stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p, number_of_submitters, last_eval, guidelines, other_ids, variant_num, reference_allele, alternate_allele, categories) = line.split('\t') # #### set filter=None in init if you don't want to have a filter # if self.filter is not None: # if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids)) # or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))): # continue # #### end filter line_counter += 1 pheno_list = [] if phenotype_ids != '-': # trim any leading/trailing semicolons/commas phenotype_ids = re.sub('^[;,]', '', phenotype_ids) phenotype_ids = re.sub('[;,]$', '', phenotype_ids) pheno_list = re.split('[,;]', phenotype_ids) if self.testMode: # get intersection of test disease ids and these phenotype_ids intersect = list(set([str(i) for i in self.disease_ids]) & set(pheno_list)) if int(gene_num) not in self.gene_ids and int(variant_num) not in self.variant_ids \ and len(intersect) < 1: continue # TODO may need to switch on assembly to create correct assembly/build identifiers build_id = ':'.join(('NCBIGenome', assembly)) # make the reference genome build geno.addReferenceGenome(build_id, assembly, tax_id) allele_type_id = self._map_type_of_allele(allele_type) bandinbuild_id = None if str(chr) == '': # check cytogenic location if str(cytogenetic_loc).strip() != '': # use cytogenic location to get the approximate location # strangely, they still put an assembly number even when there's no numeric location if not re.search('-',str(cytogenetic_loc)): band_id = makeChromID(re.split('-',str(cytogenetic_loc)), tax_num, 'CHR') geno.addChromosomeInstance(cytogenetic_loc, build_id, assembly, band_id) bandinbuild_id = makeChromID(re.split('-',str(cytogenetic_loc)), assembly, 'MONARCH') else: # can't deal with ranges yet pass else: # add the human chromosome class to the graph, and add the build-specific version of it chr_id = makeChromID(str(chr), tax_num, 'CHR') geno.addChromosomeClass(str(chr), tax_id, tax_label) geno.addChromosomeInstance(str(chr), build_id, assembly, chr_id) chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH') seqalt_id = ':'.join(('ClinVarVariant', variant_num)) gene_id = None if str(gene_num) != '-1' and str(gene_num) != 'more than 10': # they use -1 to indicate unknown gene gene_id = ':'.join(('NCBIGene', str(gene_num))) # FIXME there are some "variants" that are actually haplotypes # probably will get taken care of when we switch to processing the xml # for example, variant_num = 38562 # but there's no way to tell if it's a haplotype in the csv data # so the dbsnp or dbvar should probably be primary, and the variant num be the vslc, # with each of the dbsnps being added to it # todo clinical significance needs to be mapped to a list of terms # first, make the variant: f = Feature(seqalt_id, allele_name, allele_type_id) if start != '-' and start.strip() != '': f.addFeatureStartLocation(start, chrinbuild_id) if stop != '-' and stop.strip() != '': f.addFeatureEndLocation(stop, chrinbuild_id) f.addFeatureToGraph(g) if bandinbuild_id is not None: f.addSubsequenceOfFeature(g, bandinbuild_id) # CHECK - this makes the assumption that there is only one affected chromosome per variant # what happens with chromosomal rearrangement variants? shouldn't both chromosomes be here? # add the hgvs as synonyms if hgvs_c != '-' and hgvs_c.strip() != '': gu.addSynonym(g, seqalt_id, hgvs_c) if hgvs_p != '-' and hgvs_p.strip() != '': gu.addSynonym(g, seqalt_id, hgvs_p) # add the dbsnp and dbvar ids as equivalent if dbsnp_num != '-' and int(dbsnp_num) != -1: dbsnp_id = 'dbSNP:rs'+str(dbsnp_num) gu.addIndividualToGraph(g, dbsnp_id, None) gu.addSameIndividual(g, seqalt_id, dbsnp_id) if dbvar_num != '-': dbvar_id = 'dbVar:'+dbvar_num gu.addIndividualToGraph(g, dbvar_id, None) gu.addSameIndividual(g, seqalt_id, dbvar_id) # TODO - not sure if this is right... add as xref? # the rcv is like the combo of the phenotype with the variant if rcv_nums != '-': for rcv_num in re.split(';',rcv_nums): rcv_id = 'ClinVar:'+rcv_num gu.addIndividualToGraph(g, rcv_id, None) gu.addXref(g, seqalt_id, rcv_id) if gene_id is not None: # add the gene gu.addClassToGraph(g, gene_id, gene_symbol) # make a variant locus vl_id = '_'+gene_num+'-'+variant_num if self.nobnodes: vl_id = ':'+vl_id vl_label = allele_name gu.addIndividualToGraph(g, vl_id, vl_label, geno.genoparts['variant_locus']) geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id) geno.addAlleleOfGene(vl_id, gene_id) else: # some basic reporting gmatch = re.search('\(\w+\)', allele_name) if gmatch is not None and len(gmatch.groups()) > 0: logger.info("Gene found in allele label, but no id provided: %s", gmatch.group(1)) elif re.match('more than 10', gene_symbol): logger.info("More than 10 genes found; need to process XML to fetch (variant=%d)", int(variant_num)) else: logger.info("No gene listed for variant %d", int(variant_num)) # parse the list of "phenotypes" which are diseases. add them as an association # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374 # the list is both semicolon delimited and comma delimited, but i don't know why! # some are bad, like: Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000 if phenotype_ids != '-': for p in pheno_list: m = re.match("(Orphanet:ORPHA(?:\s*ORPHA)?)", p) if m is not None and len(m.groups()) > 0: p = re.sub(m.group(1), 'Orphanet:', p.strip()) elif re.match('SNOMED CT', p): p = re.sub('SNOMED CT', 'SNOMED', p.strip()) assoc = G2PAssoc(self.name, seqalt_id, p.strip()) assoc.add_association_to_graph(g) if other_ids != '-': id_list = other_ids.split(',') # process the "other ids" # ex: CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001 # TODO make more xrefs for xrefid in id_list: prefix = xrefid.split(':')[0].strip() if prefix == 'OMIM Allelic Variant': xrefid = 'OMIM:'+xrefid.split(':')[1] gu.addIndividualToGraph(g, xrefid, None) gu.addSameIndividual(g, seqalt_id, xrefid) elif prefix == 'HGMD': gu.addIndividualToGraph(g, xrefid, None) gu.addSameIndividual(g, seqalt_id, xrefid) elif prefix == 'dbVar' and dbvar_num == xrefid.split(':')[1].strip(): pass # skip over this one elif re.search('\s', prefix): pass # logger.debug('xref prefix has a space: %s', xrefid) else: # should be a good clean prefix # note that HGMD variants are in here as Xrefs because we can't resolve URIs for them # logger.info("Adding xref: %s", xrefid) # gu.addXref(g, seqalt_id, xrefid) # logger.info("xref prefix to add: %s", xrefid) pass if not self.testMode and limit is not None and line_counter > limit: break gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP) gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP) gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP) logger.info("Finished parsing variants") return
def _process_data(self, raw, limit=None): """ This function will process the data files from Coriell. We make the assumption that any alleles listed are variants (alternates to w.t.) Triples: (examples) :NIGMSrepository a CLO_0000008 #repository label : NIGMS Human Genetic Cell Repository foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8 line_id a CL_0000057, #fibroblast line derives_from patient_id part_of :NIGMSrepository RO:model_of OMIM:disease_id patient id a foaf:person, label: "fibroblast from patient 12345 with disease X" member_of family_id #what is the right thing here? SIO:race EFO:caucasian #subclass of EFO:0001799 in_taxon NCBITaxon:9606 dc:description Literal(remark) RO:has_phenotype OMIM:disease_id GENO:has_genotype genotype_id family_id a owl:NamedIndividual foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM" genotype_id a intrinsic_genotype GENO:has_alternate_part allelic_variant_id we don't necessarily know much about the genotype, other than the allelic variant. also there's the sex here pub_id mentions cell_line_id :param raw: :param limit: :return: """ logger.info("Processing Data from %s", raw) gu = GraphUtils(curie_map.get()) if self.testMode: # set the graph to build g = self.testgraph else: g = self.graph line_counter = 0 geno = Genotype(g) du = DipperUtil() gu.loadProperties(g, geno.object_properties, gu.OBJPROP) gu.loadAllProperties(g) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') next(filereader, None) # skip the header row for row in filereader: if not row: pass else: line_counter += 1 (catalog_id, description, omim_number, sample_type, cell_line_available, dna_in_stock, dna_ref, gender, age, race, ethnicity, affected, karyotype, relprob, mutation, gene, family_id, collection, url, cat_remark, pubmed_ids, family_member, variant_id, dbsnp_id, species) = row # example: # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,, # parent,,,39,NIGMS Human Genetic Cell Repository, # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003, # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,, # 2,,18343,H**o sapiens if self.testMode and catalog_id not in self.test_lines: # skip rows not in our test lines, when in test mode continue # ########### BUILD REQUIRED VARIABLES ########### # Make the cell line ID cell_line_id = 'Coriell:'+catalog_id.strip() # Map the cell/sample type cell_type = self._map_cell_type(sample_type) # Make a cell line label line_label = \ collection.partition(' ')[0]+'-'+catalog_id.strip() # Map the repository/collection repository = self._map_collection(collection) # patients are uniquely identified by one of: # dbsnp id (which is == an individual haplotype) # family id + family member (if present) OR # probands are usually family member zero # cell line id # since some patients have >1 cell line derived from them, # we must make sure that the genotype is attached to # the patient, and can be inferred to the cell line # examples of repeated patients are: # famid=1159, member=1; fam=152,member=1 # Make the patient ID # make an anonymous patient patient_id = '_person' if self.nobnodes: patient_id = ':'+patient_id if family_id != '': patient_id = \ '-'.join((patient_id, family_id, family_member)) else: # make an anonymous patient patient_id = '-'.join((patient_id, catalog_id.strip())) # properties of the individual patients: sex, family id, # member/relproband, description descriptions are # really long and ugly SCREAMING text, so need to clean up # the control cases are so odd with this labeling scheme; # but we'll deal with it as-is for now. short_desc = (description.split(';')[0]).capitalize() if affected == 'Yes': affected = 'affected' elif affected == 'No': affected = 'unaffected' gender = gender.lower() patient_label = ' '.join((affected, gender, relprob)) if relprob == 'proband': patient_label = \ ' '.join( (patient_label.strip(), 'with', short_desc)) else: patient_label = \ ' '.join( (patient_label.strip(), 'of proband with', short_desc)) # ############# BUILD THE CELL LINE ############# # Adding the cell line as a typed individual. cell_line_reagent_id = 'CLO:0000031' gu.addIndividualToGraph( g, cell_line_id, line_label, cell_line_reagent_id) # add the equivalent id == dna_ref if dna_ref != '' and dna_ref != catalog_id: equiv_cell_line = 'Coriell:'+dna_ref # some of the equivalent ids are not defined # in the source data; so add them gu.addIndividualToGraph( g, equiv_cell_line, None, cell_line_reagent_id) gu.addSameIndividual(g, cell_line_id, equiv_cell_line) # Cell line derives from patient geno.addDerivesFrom(cell_line_id, patient_id) geno.addDerivesFrom(cell_line_id, cell_type) # Cell line a member of repository gu.addMember(g, repository, cell_line_id) if cat_remark != '': gu.addDescription(g, cell_line_id, cat_remark) # Cell age_at_sampling # TODO add the age nodes when modeled properly in #78 # if (age != ''): # this would give a BNode that is an instance of Age. # but i don't know how to connect # the age node to the cell line? we need to ask @mbrush # age_id = '_'+re.sub('\s+','_',age) # gu.addIndividualToGraph( # g,age_id,age,self.terms['age']) # gu.addTriple( # g,age_id,self.properties['has_measurement'],age, # True) # ############# BUILD THE PATIENT ############# # Add the patient ID as an individual. gu.addPerson(g, patient_id, patient_label) # TODO map relationship to proband as a class # (what ontology?) # Add race of patient # FIXME: Adjust for subcategories based on ethnicity field # EDIT: There are 743 different entries for ethnicity... # Too many to map? # Add ethnicity as literal in addition to the mapped race? # Adjust the ethnicity txt (if using) # to initial capitalization to remove ALLCAPS # TODO race should go into the individual's background # and abstracted out to the Genotype class punting for now. # if race != '': # mapped_race = self._map_race(race) # if mapped_race is not None: # gu.addTriple( # g,patient_id,self.terms['race'],mapped_race) # gu.addSubclass( # g,self.terms['ethnic_group'],mapped_race) # ############# BUILD THE FAMILY ############# # Add triples for family_id, if present. if family_id != '': family_comp_id = 'CoriellFamily:'+family_id family_label = \ ' '.join(('Family of proband with', short_desc)) # Add the family ID as a named individual gu.addIndividualToGraph( g, family_comp_id, family_label, geno.genoparts['family']) # Add the patient as a member of the family gu.addMemberOf(g, patient_id, family_comp_id) # ############# BUILD THE GENOTYPE ############# # the important things to pay attention to here are: # karyotype = chr rearrangements (somatic?) # mutation = protein-level mutation as a label, # often from omim # gene = gene symbol - TODO get id # variant_id = omim variant ids (; delimited) # dbsnp_id = snp individual ids = full genotype? # note GM00633 is a good example of chromosomal variation # - do we have enough to capture this? # GM00325 has both abnormal karyotype and variation # make an assumption that if the taxon is blank, # that it is human! if species is None or species == '': species = 'H**o sapiens' taxon = self._map_species(species) # if there's a dbSNP id, # this is actually the individual's genotype genotype_id = None genotype_label = None if dbsnp_id != '': genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip() omim_map = {} gvc_id = None # some of the karyotypes are encoded # with terrible hidden codes. remove them here # i've seen a <98> character karyotype = du.remove_control_characters(karyotype) karyotype_id = None if karyotype.strip() != '': karyotype_id = \ '_'+re.sub('MONARCH:', '', self.make_id(karyotype)) if self.nobnodes: karyotype_id = ':'+karyotype_id # add karyotype as karyotype_variation_complement gu.addIndividualToGraph( g, karyotype_id, karyotype, geno.genoparts['karyotype_variation_complement']) # TODO break down the karyotype into parts # and map into GENO. depends on #77 # place the karyotype in a location(s). karyo_chrs = \ self._get_affected_chromosomes_from_karyotype( karyotype) for c in karyo_chrs: chr_id = makeChromID(c, taxon, 'CHR') # add an anonymous sequence feature, # each located on chr karyotype_feature_id = '-'.join((karyotype_id, c)) karyotype_feature_label = \ 'some karyotype alteration on chr'+str(c) f = Feature( karyotype_feature_id, karyotype_feature_label, geno.genoparts['sequence_alteration']) f.addFeatureStartLocation(None, chr_id) f.addFeatureToGraph(g) f.loadAllProperties(g) geno.addParts( karyotype_feature_id, karyotype_id, geno.object_properties['has_alternate_part']) if gene != '': vl = gene+'('+mutation+')' # fix the variant_id so it's always in the same order vids = variant_id.split(';') variant_id = ';'.join(sorted(list(set(vids)))) if karyotype.strip() != '' \ and not self._is_normal_karyotype(karyotype): mutation = mutation.strip() gvc_id = karyotype_id if variant_id != '': gvc_id = '_' + variant_id.replace(';', '-') + '-' \ + re.sub(r'\w*:', '', karyotype_id) if mutation.strip() != '': gvc_label = '; '.join((vl, karyotype)) else: gvc_label = karyotype elif variant_id.strip() != '': gvc_id = '_' + variant_id.replace(';', '-') gvc_label = vl else: # wildtype? pass if gvc_id is not None and gvc_id != karyotype_id \ and self.nobnodes: gvc_id = ':'+gvc_id # add the karyotype to the gvc. # use reference if normal karyotype karyo_rel = geno.object_properties['has_alternate_part'] if self._is_normal_karyotype(karyotype): karyo_rel = \ geno.object_properties['has_reference_part'] if karyotype_id is not None \ and not self._is_normal_karyotype(karyotype) \ and gvc_id is not None and karyotype_id != gvc_id: geno.addParts(karyotype_id, gvc_id, karyo_rel) if variant_id.strip() != '': # split the variants & add them as part of the genotype # we don't necessarily know their zygosity, # just that they are part of the genotype variant ids # are from OMIM, so prefix as such we assume that the # sequence alts will be defined in OMIM not here # TODO sort the variant_id list, if the omim prefix is # the same, then assume it's the locus make a hashmap # of the omim id to variant id list; # then build the genotype hashmap is also useful for # removing the "genes" from the list of "phenotypes" # will hold gene/locus id to variant list omim_map = {} locus_num = None for v in variant_id.split(';'): # handle omim-style and odd var ids # like 610661.p.R401X m = re.match(r'(\d+)\.+(.*)', v.strip()) if m is not None and len(m.groups()) == 2: (locus_num, var_num) = m.groups() if locus_num is not None \ and locus_num not in omim_map: omim_map[locus_num] = [var_num] else: omim_map[locus_num] += [var_num] for o in omim_map: # gene_id = 'OMIM:' + o # TODO unused vslc_id = \ '_' + '-'.join( [o + '.' + a for a in omim_map.get(o)]) if self.nobnodes: vslc_id = ':'+vslc_id vslc_label = vl # we don't really know the zygosity of # the alleles at all. # so the vslcs are just a pot of them gu.addIndividualToGraph( g, vslc_id, vslc_label, geno.genoparts[ 'variant_single_locus_complement']) for v in omim_map.get(o): # this is actually a sequence alt allele1_id = 'OMIM:'+o+'.'+v geno.addSequenceAlteration(allele1_id, None) # assume that the sa -> var_loc -> gene # is taken care of in OMIM geno.addPartsToVSLC( vslc_id, allele1_id, None, geno.zygosity['indeterminate'], geno.object_properties[ 'has_alternate_part']) if vslc_id != gvc_id: geno.addVSLCtoParent(vslc_id, gvc_id) if affected == 'unaffected': # let's just say that this person is wildtype gu.addType(g, patient_id, geno.genoparts['wildtype']) elif genotype_id is None: # make an anonymous genotype id genotype_id = '_geno'+catalog_id.strip() if self.nobnodes: genotype_id = ':'+genotype_id # add the gvc if gvc_id is not None: gu.addIndividualToGraph( g, gvc_id, gvc_label, geno.genoparts['genomic_variation_complement']) # add the gvc to the genotype if genotype_id is not None: if affected == 'unaffected': rel = \ geno.object_properties[ 'has_reference_part'] else: rel = \ geno.object_properties[ 'has_alternate_part'] geno.addParts(gvc_id, genotype_id, rel) if karyotype_id is not None \ and self._is_normal_karyotype(karyotype): if gvc_label is not None and gvc_label != '': genotype_label = \ '; '.join((gvc_label, karyotype)) else: genotype_label = karyotype if genotype_id is None: genotype_id = karyotype_id else: geno.addParts( karyotype_id, genotype_id, geno.object_properties[ 'has_reference_part']) else: genotype_label = gvc_label # use the catalog id as the background genotype_label += ' ['+catalog_id.strip()+']' if genotype_id is not None and gvc_id is not None: # only add the genotype if it has some parts geno.addGenotype( genotype_id, genotype_label, geno.genoparts['intrinsic_genotype']) geno.addTaxon(taxon, genotype_id) # add that the patient has the genotype # TODO check if the genotype belongs to # the cell line or to the patient gu.addTriple( g, patient_id, geno.properties['has_genotype'], genotype_id) else: geno.addTaxon(taxon, patient_id) # TODO: Add sex/gender (as part of the karyotype?) # ############# DEAL WITH THE DISEASES ############# # we associate the disease to the patient if affected == 'affected': if omim_number != '': for d in omim_number.split(';'): if d is not None and d != '': # if the omim number is in omim_map, # then it is a gene not a pheno if d not in omim_map: disease_id = 'OMIM:'+d.strip() # assume the label is taken care of gu.addClassToGraph(g, disease_id, None) # add the association: # the patient has the disease assoc = G2PAssoc( self.name, patient_id, disease_id) assoc.add_association_to_graph(g) # this line is a model of this disease # TODO abstract out model into # it's own association class? gu.addTriple( g, cell_line_id, gu.properties['model_of'], disease_id) else: logger.info( 'removing %s from disease list ' + 'since it is a gene', d) # ############# ADD PUBLICATIONS ############# if pubmed_ids != '': for s in pubmed_ids.split(';'): pubmed_id = 'PMID:'+s.strip() ref = Reference(pubmed_id) ref.setType(Reference.ref_types['journal_article']) ref.addRefToGraph(g) gu.addTriple( g, pubmed_id, gu.properties['mentions'], cell_line_id) if not self.testMode \ and (limit is not None and line_counter > limit): break Assoc(self.name).load_all_properties(g) return
def _process_data(self, raw, limit=None): logger.info("Processing Data from %s", raw) gu = GraphUtils(curie_map.get()) if self.testMode: g = self.testgraph else: g = self.graph geno = Genotype(g) line_counter = 0 gu.loadAllProperties(g) gu.loadObjectProperties(g, geno.object_properties) # Add the taxon as a class taxon_id = 'NCBITaxon:10090' # map to Mus musculus gu.addClassToGraph(g, taxon_id, None) # with open(raw, 'r', encoding="utf8") as csvfile: with gzip.open(raw, 'rt') as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') next(filereader, None) # skip the header row for row in filereader: line_counter += 1 (marker_accession_id, marker_symbol, phenotyping_center, colony, sex, zygosity, allele_accession_id, allele_symbol, allele_name, strain_accession_id, strain_name, project_name, project_fullname, pipeline_name, pipeline_stable_id, procedure_stable_id, procedure_name, parameter_stable_id, parameter_name, top_level_mp_term_id, top_level_mp_term_name, mp_term_id, mp_term_name, p_value, percentage_change, effect_size, statistical_method, resource_name) = row if self.testMode and marker_accession_id not in self.test_ids: continue # ##### cleanup some of the identifiers ###### zygosity_id = self._map_zygosity(zygosity) # colony ids sometimes have <> in them, spaces, # or other non-alphanumerics and break our system; # replace these with underscores colony_id = '_'+re.sub(r'\W+', '_', colony) if self.nobnodes: colony_id = ':'+colony_id if not re.match(r'MGI', allele_accession_id): allele_accession_id = \ '_IMPC-'+re.sub(r':', '', allele_accession_id) if self.nobnodes: allele_accession_id = ':'+allele_accession_id if re.search(r'EUROCURATE', strain_accession_id): # the eurocurate links don't resolve at IMPC strain_accession_id = '_'+strain_accession_id if self.nobnodes: strain_accession_id = ':'+strain_accession_id elif not re.match(r'MGI', strain_accession_id): logger.info( "Found a strange strain accession...%s", strain_accession_id) strain_accession_id = 'IMPC:'+strain_accession_id ###################### # first, add the marker and variant to the graph as with MGI, # the allele is the variant locus. IF the marker is not known, # we will call it a sequence alteration. otherwise, # we will create a BNode for the sequence alteration. sequence_alteration_id = variant_locus_id = None variant_locus_name = sequence_alteration_name = None # extract out what's within the <> to get the symbol if re.match(r'.*<.*>', allele_symbol): sequence_alteration_name = \ re.match(r'.*<(.*)>', allele_symbol).group(1) else: sequence_alteration_name = allele_symbol if marker_accession_id is not None and \ marker_accession_id == '': logger.warning( "Marker unspecified on row %d", line_counter) marker_accession_id = None if marker_accession_id is not None: variant_locus_id = allele_accession_id variant_locus_name = allele_symbol variant_locus_type = geno.genoparts['variant_locus'] geno.addGene(marker_accession_id, marker_symbol, geno.genoparts['gene']) geno.addAllele(variant_locus_id, variant_locus_name, variant_locus_type, None) geno.addAlleleOfGene(variant_locus_id, marker_accession_id) sequence_alteration_id = \ '_seqalt'+re.sub(r':', '', allele_accession_id) if self.nobnodes: sequence_alteration_id = ':'+sequence_alteration_id geno.addSequenceAlterationToVariantLocus( sequence_alteration_id, variant_locus_id) else: sequence_alteration_id = allele_accession_id # IMPC contains targeted mutations with either gene traps, # knockouts, insertion/intragenic deletions. # but I don't really know what the SeqAlt is here, # so I don't add it. geno.addSequenceAlteration(sequence_alteration_id, sequence_alteration_name) # ############# BUILD THE COLONY ############# # First, let's describe the colony that the animals come from # The Colony ID refers to the ES cell clone # used to generate a mouse strain. # Terry sez: we use this clone ID to track # ES cell -> mouse strain -> mouse phenotyping. # The same ES clone maybe used at multiple centers, # so we have to concatenate the two to have a unique ID. # some useful reading about generating mice from ES cells: # http://ki.mit.edu/sbc/escell/services/details # here, we'll make a genotype # that derives from an ES cell with a given allele. # the strain is not really attached to the colony. # the colony/clone is reflective of the allele, # with unknown zygosity stem_cell_class = 'ERO:0002002' gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class) # vslc of the colony has unknown zygosity # note that we will define the allele # (and it's relationship to the marker, etc.) later # FIXME is it really necessary to create this vslc # when we always know it's unknown zygosity? vslc_colony = \ '_'+allele_accession_id+geno.zygosity['indeterminate'] vslc_colony = re.sub(r':', '', vslc_colony) if self.nobnodes: vslc_colony = ':'+vslc_colony vslc_colony_label = allele_symbol+'/<?>' # for ease of reading, we make the colony genotype variables. # in the future, it might be desired to keep the vslcs colony_genotype_id = vslc_colony colony_genotype_label = vslc_colony_label geno.addGenotype(colony_genotype_id, colony_genotype_label) geno.addParts(allele_accession_id, colony_genotype_id, geno.object_properties['has_alternate_part']) geno.addPartsToVSLC( vslc_colony, allele_accession_id, None, geno.zygosity['indeterminate'], geno.object_properties['has_alternate_part']) gu.addTriple( g, colony_id, geno.object_properties['has_genotype'], colony_genotype_id) # ########## BUILD THE ANNOTATED GENOTYPE ########## # now, we'll build the genotype of the individual that derives # from the colony/clone genotype that is attached to # phenotype = colony_id + strain + zygosity + sex # (and is derived from a colony) # this is a sex-agnostic genotype genotype_id = \ self.make_id( (colony_id + phenotyping_center + zygosity + strain_accession_id)) geno.addSequenceDerivesFrom(genotype_id, colony_id) # build the VSLC of the sex-agnostic genotype # based on the zygosity allele1_id = allele_accession_id allele2_id = allele2_rel = None allele1_label = allele_symbol allele2_label = '<?>' # Making VSLC labels from the various parts, # can change later if desired. if zygosity == 'heterozygote': allele2_label = re.sub(r'<.*', '<+>', allele1_label) allele2_id = None elif zygosity == 'homozygote': allele2_label = allele1_label allele2_id = allele1_id allele2_rel = geno.object_properties['has_alternate_part'] elif zygosity == 'hemizygote': allele2_label = re.sub(r'<.*', '<0>', allele1_label) allele2_id = None elif zygosity == 'not_applicable': allele2_label = re.sub(r'<.*', '<?>', allele1_label) allele2_id = None else: logger.warning("found unknown zygosity %s", zygosity) break vslc_name = '/'.join((allele1_label, allele2_label)) # Add the VSLC vslc_id = '_' + '-'.join((marker_accession_id, allele_accession_id, zygosity)) vslc_id = re.sub(r':', '', vslc_id) if self.nobnodes: vslc_id = ':'+vslc_id gu.addIndividualToGraph( g, vslc_id, vslc_name, geno.genoparts['variant_single_locus_complement']) geno.addPartsToVSLC( vslc_id, allele1_id, allele2_id, zygosity_id, geno.object_properties['has_alternate_part'], allele2_rel) # add vslc to genotype geno.addVSLCtoParent(vslc_id, genotype_id) # note that the vslc is also the gvc gu.addType( g, vslc_id, Genotype.genoparts['genomic_variation_complement']) # Add the genomic background # create the genomic background id and name if strain_accession_id != '': genomic_background_id = strain_accession_id else: genomic_background_id = None genotype_name = vslc_name if genomic_background_id is not None: geno.addGenotype( genomic_background_id, strain_name, geno.genoparts['genomic_background']) # make a phenotyping-center-specific strain # to use as the background pheno_center_strain_label = \ strain_name + '/' + phenotyping_center pheno_center_strain_id = \ '-'.join((re.sub(r':', '', genomic_background_id), re.sub(r'\s', '_', phenotyping_center))) if not re.match(r'^_', pheno_center_strain_id): pheno_center_strain_id = '_'+pheno_center_strain_id if self.nobnodes: pheno_center_strain_id = ':'+pheno_center_strain_id geno.addGenotype(pheno_center_strain_id, pheno_center_strain_label, geno.genoparts['genomic_background']) geno.addSequenceDerivesFrom(pheno_center_strain_id, genomic_background_id) # Making genotype labels from the various parts, # can change later if desired. # since the genotype is reflective of the place # it got made, should put that in to disambiguate genotype_name = \ genotype_name+' ['+pheno_center_strain_label+']' geno.addGenomicBackgroundToGenotype( pheno_center_strain_id, genotype_id) geno.addTaxon(pheno_center_strain_id, taxon_id) # this is redundant, but i'll keep in in for now geno.addSequenceDerivesFrom(genotype_id, colony_id) genotype_name += '['+colony+']' geno.addGenotype(genotype_id, genotype_name) # Make the sex-qualified genotype, # which is what the phenotype is associated with sex_qualified_genotype_id = \ self.make_id( (colony_id + phenotyping_center + zygosity + strain_accession_id+sex)) sex_qualified_genotype_label = genotype_name+' ('+sex+')' if sex == 'male': sq_type_id = geno.genoparts['male_genotype'] elif sex == 'female': sq_type_id = geno.genoparts['female_genotype'] else: sq_type_id = geno.genoparts['sex_qualified_genotype'] geno.addGenotype( sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id) geno.addParts( genotype_id, sex_qualified_genotype_id, geno.object_properties['has_alternate_part']) if genomic_background_id is not None and \ genomic_background_id != '': # Add the taxon to the genomic_background_id geno.addTaxon(taxon_id, genomic_background_id) else: # add it as the genomic background geno.addTaxon(taxon_id, genotype_id) # ############# BUILD THE G2P ASSOC ############# # from an old email dated July 23 2014: # Phenotypes associations are made to # imits colony_id+center+zygosity+gender phenotype_id = mp_term_id # it seems that sometimes phenotype ids are missing. # indicate here if phenotype_id is None or phenotype_id == '': logger.warning( "No phenotype id specified for row %d: %s", line_counter, str(row)) continue # experimental_phenotypic_evidence This was used in ZFIN eco_id = "ECO:0000059" # the association comes as a result of a g2p from # a procedure in a pipeline at a center and parameter tested assoc = G2PAssoc(self.name, sex_qualified_genotype_id, phenotype_id) assoc.add_evidence(eco_id) # assoc.set_score(float(p_value)) # TODO add evidence instance using # pipeline_stable_id + # procedure_stable_id + # parameter_stable_id assoc.add_association_to_graph(g) assoc_id = assoc.get_association_id() # add a free-text description description = \ ' '.join((mp_term_name, 'phenotype determined by', phenotyping_center, 'in an', procedure_name, 'assay where', parameter_name.strip(), 'was measured with an effect_size of', str(round(float(effect_size), 5)), '(p =', "{:.4e}".format(float(p_value)), ').')) gu.addDescription(g, assoc_id, description) # TODO add provenance information # resource_id = resource_name # assoc.addSource(g, assoc_id, resource_id) if not self.testMode and \ limit is not None and line_counter > limit: break gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP) gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP) gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP) return
def _add_variant_cdna_variant_assoc_to_graph(self, row): """ Generates relationships between variants and cDNA variants given a row of data :param iterable: row of data, see add_variant_info_to_graph() docstring for expected structure. Only applicable for structure 2. :return None """ gu = GraphUtils(curie_map.get()) geno = Genotype(self.graph) is_literal = True (variant_key, variant_label, amino_acid_variant, amino_acid_position, transcript_id, transcript_priority, protein_variant_type, functional_impact, stop_gain_loss, transcript_gene, protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base, primary_transcript_exons, primary_transcript_variant_sub_types, variant_type, chromosome, genome_build, build_version, build_date) = row variant_id = self.make_cgd_id('variant{0}'.format(variant_key)) # Add gene self._add_variant_gene_relationship(variant_id, variant_gene) # Transcript reference for nucleotide position transcript_curie = self._make_transcript_curie(transcript_id) # Make region IDs cdna_region_id = ":_{0}Region".format(transcript_curie) chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(genome_build, chromosome, genome_pos_start, genome_pos_end) # Add the genome build genome_label = "Human" build_id = "UCSC:{0}".format(genome_build) taxon_id = 'NCBITaxon:9606' geno.addGenome(taxon_id, genome_label) geno.addReferenceGenome(build_id, genome_build, taxon_id) # Add chromosome chrom_class_id = makeChromID(chromosome, '9606', 'CHR') # the chrom class (generic) id chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH') # first, add the chromosome class (in the taxon) geno.addChromosomeClass(chromosome, taxon_id, 'Human') # then, add the chromosome instance (from the given build) geno.addChromosomeInstance(chromosome, build_id, genome_build, chrom_class_id) # Add variant coordinates in reference to chromosome self._add_feature_with_coords(variant_id,genome_pos_start, genome_pos_end, chrom_instance_id, chrom_region_id) # Add mutation coordinates in reference to gene self._add_feature_with_coords(variant_id, bp_pos, bp_pos, transcript_curie, cdna_region_id) # Add nucleotide mutation gu.addTriple(self.graph, variant_id, geno.properties['reference_nucleotide'], ref_base, is_literal) gu.addTriple(self.graph, variant_id, geno.properties['altered_nucleotide'], variant_base, is_literal) """ Here we update any internal cgd variant IDS with a cosmic ID or dbSNP ID. Alternatively we could do this using sql rather than a sparql update which may be safer """ # Add SNP xrefs if cosmic_id is not None: cosmic_id_list = cosmic_id.split(', ') cosmic_curie_list = [] for c_id in cosmic_id_list: cosmic_curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id) cosmic_curie_list.append(cosmic_curie) gu.addIndividualToGraph(self.graph, cosmic_curie, c_id, geno.genoparts['missense_variant']) # If there are multiple ids set them equivalent to the first for curie in cosmic_curie_list[1:]: gu.addSameIndividual(self.graph, cosmic_curie_list[0], curie) self._replace_entity(self.graph, variant_id, cosmic_curie_list[0], self.bindings) if db_snp_id is not None: db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id) gu.addIndividualToGraph(self.graph, db_snp_curie, db_snp_id, geno.genoparts['missense_variant']) if cosmic_id is None: self._replace_entity(self.graph, variant_id, db_snp_curie, self.bindings) else: cosmic_id_list = cosmic_id.split(', ') for c_id in cosmic_id_list: cosmic_curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id) gu.addSameIndividual(self.graph, cosmic_curie, db_snp_curie) return
def add_disease_drug_variant_to_graph(self, table): """ Takes an iterable of iterables as input with the following structure, optional indices can be Null: [[variant_key, variant_label, diagnoses_key, diagnoses, specific_diagnosis, organ, relationship, drug_key, drug, therapy_status (optional), pubmed_id(optional)]] See ongoing discussion of how to best model here: https://github.com/monarch-initiative/mckb/issues/9 :param table: iterable of iterables, for example, a tuple of tuples from _get_disease_drug_variant_relationship :return: None """ gu = GraphUtils(curie_map.get()) geno = Genotype(self.graph) for row in table: (variant_key, variant_label, diagnoses_key, diagnoses, specific_diagnosis, organ, relationship, drug_key, drug_label, therapy_status, pubmed_id) = row if specific_diagnosis is not None: diagnoses_label = specific_diagnosis else: diagnoses_label = diagnoses # Arbitrary IDs to be replaced by ontology mappings variant_id = self.make_cgd_id('variant{0}'.format(variant_key)) disease_id = self._get_disease_id(diagnoses_key, diagnoses_label) therapy_status_id = self.make_cgd_id('{0}'.format(therapy_status)) relationship_id = "RO:has_environment" disease_quality = ("CGD:{0}".format(relationship)).replace(" ", "_") has_quality_property = "BFO:0000159" drug_id = self._get_drug_id(drug_key, drug_label) geno.addGenotype(variant_id, variant_label, geno.genoparts['sequence_alteration']) disease_instance_id = self.make_cgd_id('disease{0}{1}'.format( diagnoses_label, variant_key)) phenotype_instance_id = self.make_cgd_id('phenotype{0}{1}{2}'.format( diagnoses_label, variant_key, relationship)) phenotype_instance_label = "{0} with {1} to therapy".format(diagnoses_label, relationship) if relationship == "detrimental effect": phenotype_instance_label = "{0} with therapeutic response {1} to health"\ .format(diagnoses_label, relationship) # Reified association for disease caused_by genotype variant_disease_annot = self.make_cgd_id("assoc{0}{1}".format(variant_key, diagnoses_label)) # Add individuals/classes gu.addClassToGraph(self.graph, disease_id, diagnoses_label, 'DOID:4') gu.addClassToGraph(self.graph, drug_id, drug_label, 'CHEBI:23888') gu.addIndividualToGraph(self.graph, phenotype_instance_id, phenotype_instance_label, disease_id) gu.loadObjectProperties(self.graph, {relationship: relationship_id}) if pubmed_id is not None: source_id = "PMID:{0}".format(pubmed_id) ref = Reference(source_id, Reference.ref_types['journal_article']) ref.addRefToGraph(self.graph) evidence = 'ECO:0000033' else: source_id = None evidence = None rel_id = gu.object_properties['has_phenotype'] variant_phenotype_assoc = G2PAssoc(self.name, variant_id, phenotype_instance_id, rel_id) variant_phenotype_assoc.set_association_id(variant_disease_annot) if evidence: variant_phenotype_assoc.add_evidence(evidence) if source_id: variant_phenotype_assoc.add_source(source_id) variant_phenotype_assoc.add_association_to_graph(self.graph) gu.addTriple(self.graph, variant_disease_annot, relationship_id, drug_id) gu.addTriple(self.graph, phenotype_instance_id, has_quality_property, disease_quality) # Add therapy-disease association and approval status marker_relation = "RO:has_biomarker" disease_instance_label = "{0} with biomarker {1}".format(diagnoses_label, variant_label) gu.addIndividualToGraph(self.graph, disease_instance_id, disease_instance_label, disease_id) gu.addTriple(self.graph, disease_instance_id, marker_relation, variant_id) gu.addClassToGraph(self.graph, therapy_status_id, therapy_status) self._add_therapy_drug_association(drug_id, disease_instance_id, therapy_status_id) return
def _process_diseasegene(self, limit): """ :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 geno = Genotype(g) gu = GraphUtils(curie_map.get()) myfile = "/".join((self.rawdir, self.files["disease-gene"]["file"])) for event, elem in ET.iterparse(myfile): if elem.tag == "Disorder": # get the element name and id # id = elem.get('id') # some internal identifier disorder_num = elem.find("OrphaNumber").text disorder_id = "Orphanet:" + str(disorder_num) if self.testMode and disorder_id not in config.get_config()["test_ids"]["disease"]: continue disorder_label = elem.find("Name").text # make a hash of internal gene id to type for later lookup gene_iid_to_type = {} gene_list = elem.find("GeneList") for gene in gene_list.findall("Gene"): gene_iid = gene.get("id") gene_type = gene.find("GeneType").get("id") gene_iid_to_type[gene_iid] = gene_type gu.addClassToGraph(g, disorder_id, disorder_label) # assuming that these are in the ontology assoc_list = elem.find("DisorderGeneAssociationList") for a in assoc_list.findall("DisorderGeneAssociation"): gene_iid = a.find(".//Gene").get("id") gene_name = a.find(".//Gene/Name").text gene_symbol = a.find(".//Gene/Symbol").text gene_num = a.find("./Gene/OrphaNumber").text gene_id = "Orphanet:" + str(gene_num) gene_type_id = self._map_gene_type_id(gene_iid_to_type[gene_iid]) gu.addClassToGraph(g, gene_id, gene_symbol, gene_type_id, gene_name) syn_list = a.find("./Gene/SynonymList") if int(syn_list.get("count")) > 0: for s in syn_list.findall("./Synonym"): gu.addSynonym(g, gene_id, s.text) dgtype = a.find("DisorderGeneAssociationType").get("id") rel_id = self._map_rel_id(dgtype) dg_label = a.find("./DisorderGeneAssociationType/Name").text if rel_id is None: logger.warn( "Cannot map association type (%s) to RO for association (%s | %s). Skipping.", dg_label, disorder_label, gene_symbol, ) continue alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL" alt_label = " ".join( ("some variant of", gene_symbol.strip(), "that is a", dg_label.lower(), disorder_label) ) if self.nobnodes: alt_locus_id = ":" + alt_locus_id gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts["variant_locus"]) geno.addAlleleOfGene(alt_locus_id, gene_id) # consider typing the gain/loss-of-function variants like: # http://sequenceontology.org/browser/current_svn/term/SO:0002054 # http://sequenceontology.org/browser/current_svn/term/SO:0002053 # use "assessed" status to issue an evidence code # FIXME I think that these codes are sub-optimal status_code = a.find("DisorderGeneAssociationStatus").get("id") eco_id = "ECO:0000323" # imported automatically asserted information used in automatic assertion if status_code == "17991": # Assessed # TODO are these internal ids stable between releases? eco_id = "ECO:0000322" # imported manually asserted information used in automatic assertion # Non-traceable author statement ECO_0000034 # imported information in automatic assertion ECO_0000313 assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id) assoc.add_evidence(eco_id) assoc.add_association_to_graph(g) rlist = a.find("./Gene/ExternalReferenceList") eqid = None for r in rlist.findall("ExternalReference"): if r.find("Source").text == "Ensembl": eqid = "ENSEMBL:" + r.find("Reference").text elif r.find("Source").text == "HGNC": eqid = "HGNC:" + r.find("Reference").text elif r.find("Source").text == "OMIM": eqid = "OMIM:" + r.find("Reference").text else: pass # skip the others for now if eqid is not None: gu.addClassToGraph(g, eqid, None) gu.addEquivalentClass(g, gene_id, eqid) pass elem.clear() # discard the element if self.testMode and limit is not None and line_counter > limit: return gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP) gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP) gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP) gu.loadAllProperties(g) return
def _process_morbidmap(self, limit): """ This will process the morbidmap file to get the links between omim genes and diseases. Here, we create anonymous nodes for some variant loci that are variants of the gene that causes the disease. Triples created: <some_anonymous_variant_locus> is_sequence_variant_instance_of <omim_gene_id> <some_anonymous_variant_locus> has_phenotype <omim_disease_id> <assoc> hasSubject <some_anonymous_variant_locus> <assoc> hasObject <omim_disease_id> <assoc> hasPredicate <has_phenotype> <assoc> DC:evidence <eco_id> :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 geno = Genotype(g) gu = GraphUtils(curie_map.get()) with open('/'.join((self.rawdir, self.files['morbidmap']['file']))) as f: for line in f: line = line.strip() line_counter += 1 (disorder, gene_symbols, gene_num, loc) = line.split('|') # disorder = disorder label , number (mapping key) # 3-M syndrome 1, 273750 (3)|CUL7, 3M1|609577|6p21.1 # but note that for those diseases where they are genomic loci (not genes though), # the omim id is only listed as the gene # Alopecia areata 1 (2)|AA1|104000|18p11.3-p11.2 disorder_match = re.match('(.*), (\d+) \((\d+)\)', disorder) if disorder_match is not None: disorder_parts = disorder_match.groups() if len(disorder_parts) == 3: (disorder_label, disorder_num, phene_key) = disorder_parts else: logger.warn("I couldn't parse disorder string: %s", disorder) continue if self.testMode and (int(disorder_num) not in self.test_ids or int(gene_num) not in self.test_ids): continue gene_symbols = gene_symbols.split(', ') gene_id = ':'.join(('OMIM', gene_num)) disorder_id = ':'.join(('OMIM', disorder_num)) rel_id = gu.object_properties['has_phenotype'] # default rel_label = 'causes' if re.match('\[', disorder_label): rel_id = gu.object_properties['is_marker_for'] rel_label = 'is a marker for' elif re.match('\{', disorder_label): rel_id = gu.object_properties['contributes_to'] rel_label = 'contributes to' evidence = self._map_phene_mapping_code_to_eco(phene_key) # we actually want the association between the gene and the disease to be via an alternate locus # not the "wildtype" gene itself. # so we make an anonymous alternate locus, and put that in the association. alt_locus = '_'+gene_num+'-'+disorder_num+'VL' alt_label = gene_symbols[0].strip() if alt_label is not None and alt_label != '': alt_label = ' '.join(('some variant of', alt_label.strip(), 'that', rel_label, disorder_label)) else: alt_label = None gu.addIndividualToGraph(g, alt_locus, alt_label, geno.genoparts['variant_locus']) geno.addAlleleOfGene(alt_locus, gene_id) assoc = G2PAssoc(self.name, alt_locus, disorder_id, rel_id) assoc.add_evidence(evidence) assoc.load_all_properties(g) assoc.add_association_to_graph(g) if not self.testMode and limit is not None and line_counter > limit: break gu.loadProperties(g, geno.object_properties, gu.OBJPROP) return