def _get_gene2pubmed(self, limit):
    """
    Loops through the gene2pubmed file and adds a simple triple to say
    that a given publication is_about a gene.
    Publications are added as NamedIndividuals.

    All triples go to the test graph when ``self.testMode`` is set,
    otherwise to the main graph.

    :param limit: when not None (and not in test mode), processing stops
        once more than ``limit`` records have been added
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    is_about = gu.getNode(gu.object_properties['is_about'])
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
    logger.info("FILE: %s", myfile)

    with gzip.open(myfile, 'rb') as f:
        for line in f:
            line = line.decode().strip()
            # skip blank lines and comments
            if not line or line.startswith('#'):
                continue

            (tax_num, gene_num, pubmed_num) = line.split('\t')

            # '-' is the file's placeholder for a missing value; test for it
            # before the int() conversions below, which would otherwise
            # raise ValueError on it.
            if gene_num == '-' or pubmed_num == '-':
                continue

            # NOTE: a configurable filter (by 'taxids' or 'geneids') was
            # sketched here previously; only the fixed checks below are active.
            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            pubmed_id = ':'.join(('PMID', pubmed_num))

            # add the gene, in case it hasn't before
            gu.addClassToGraph(g, gene_id, None)
            # add the publication as a NamedIndividual
            # (no label, no explicit publication type yet)
            gu.addIndividualToGraph(g, pubmed_id, None, None)
            # write the link into the same graph as the two nodes above,
            # so test-mode output does not leak into the main graph
            g.add((gu.getNode(pubmed_id), is_about, gu.getNode(gene_id)))

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    return
def addRefToGraph(self, g):
    """
    Add this reference to the supplied graph.

    The label is the short citation when present, otherwise the title.
    If the reference has a URL, the triples are attached to that URI
    directly; otherwise they are attached to ``self.ref_id`` through
    GraphUtils.  An error is logged when neither identifier is set.

    :param g: the graph to add the reference triples to
    :return: None
    """
    gu = GraphUtils(curie_map.get())

    n = self.short_citation
    if n is None:
        n = self.title

    if self.ref_url is not None:
        ref_uri = URIRef(self.ref_url)
        # Only emit title/label literals that actually exist; wrapping
        # None in a Literal would put a bogus "None" string in the graph.
        if self.title is not None:
            g.add((ref_uri, DC['title'], Literal(self.title)))
        g.add((ref_uri, RDF['type'], gu.getNode(self.ref_type)))
        if n is not None:
            g.add((ref_uri, RDFS['label'], Literal(n)))
    elif self.ref_id is not None:
        gu.addIndividualToGraph(g, self.ref_id, n, self.ref_type)
        if self.title is not None:
            gu.addTitle(g, self.ref_id, self.title)
    else:
        # should never be true
        logger.error("You are missing an identifier for a reference.")

    # TODO what is the property here to add the date?
    # if self.year is not None:
    #    gu.addTriple()

    # if self.author_list is not None:
    #    for a in self.author_list:
    #        gu.addTriple(
    #           g, self.ref_id, self.props['has_author'], a, True)

    return
def test_therapeutic_relationship(self):
    """
    Check that the loaded test graph contains the expected
    chemical-treats-disease association, with its evidence and source.
    """
    from dipper.utils.TestUtils import TestUtils
    from dipper.utils.GraphUtils import GraphUtils
    from dipper import curie_map

    # Build the query helper around the source graph and load the ttl
    tester = TestUtils(self.source.graph)
    tester.load_testgraph_from_turtle(self.source)

    # Expected structure
    # TODO can this be unified OBAN and the Annot models
    # to be automatically generated?
    sparql_query = """
        SELECT ?assoc ?pubmed ?disease ?chemical
        WHERE {
        ?assoc a Annotation: ;
            dc:evidence OBO:ECO_0000033 ;
            dc:source ?pubmed ;
            :hasObject ?disease ;
            :hasPredicate OBO:RO_0002606 ;
            :hasSubject ?chemical .}
        """

    # Identifiers that make up the association we expect to find
    gu = GraphUtils(curie_map.get())
    chem_id = 'MESH:D009538'
    disease_id = 'OMIM:188890'
    pubmed_id = 'PMID:16785264'
    eco = 'ECO:0000033'
    rel_id = gu.object_properties['substance_that_treats']

    chem_uri = gu.getNode(chem_id)
    disease_uri = gu.getNode(disease_id)
    pubmed_uri = gu.getNode(pubmed_id)

    # consider replacing with make_ctd_chem_disease_assoc_id()
    assoc_id = self.source.make_association_id(
        'ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
    assoc_uri = gu.getNode(assoc_id)

    # One of the expected result rows, in SELECT order
    expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

    # Run the query and look for the expected row
    sparql_output = tester.query_graph(sparql_query)
    failure_msg = ''.join((
        "did not find expected association: ", assoc_id,
        " found: ", pprint.pformat(sparql_output)))
    self.assertTrue(expected_output in sparql_output, failure_msg)

    logger.info("Test query data finished.")
def test_therapeutic_relationship(self):
    """
    Check that the CTD graph contains the expected
    chemical-treats-disease association, with its evidence and source.
    """
    from dipper.utils.TestUtils import TestUtils
    from dipper.utils.GraphUtils import GraphUtils

    # Make testutils object and load bindings
    test_query = TestUtils(self.ctd.graph)
    self.ctd.load_bindings()

    # Expected structure
    sparql_query = """
        SELECT ?assoc ?pubmed ?disease ?chemical
        WHERE {
        ?assoc a Annotation: ;
            dc:evidence OBO:ECO_0000033 ;
            dc:source ?pubmed ;
            :hasObject ?disease ;
            :hasPredicate OBO:RO_0002606 ;
            :hasSubject ?chemical .}
        """

    # SPARQL variables to check
    gu = GraphUtils(curie_map.get())
    chem_id = 'MESH:D009538'
    chem_uri = gu.getNode(chem_id)
    disease_id = 'OMIM:188890'
    disease_uri = gu.getNode(disease_id)
    pubmed_id = 'PMID:16785264'
    pubmed_uri = gu.getNode(pubmed_id)
    rel_id = gu.object_properties['substance_that_treats']
    eco = 'ECO:0000033'

    # TODO PYLINT make_association_id() does not exist in CTD
    # there is "_make_association()" with a different sig
    assoc_id = self.ctd.make_association_id(
        'ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
    assoc_uri = gu.getNode(assoc_id)

    # Expected output from query (one row, in SELECT order)
    expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

    # Query graph
    sparql_output = test_query.query_graph(sparql_query)

    # assertIn reports both the missing row and the rows that were
    # returned, so a failure can be diagnosed from the test output alone.
    self.assertIn(
        expected_output, sparql_output,
        "did not find expected association: " + str(assoc_id))

    logger.info("Test finished.")
def test_curieprefixes(self):
    """
    Verify that GraphUtils.getNode() yields a node for an identifier
    built from each prefix defined in the curie map.

    :return:
    """
    from dipper.utils.GraphUtils import GraphUtils

    graph_utils = GraphUtils(self.curie_map)

    # try one made-up identifier per curie prefix
    for prefix in self.curie_map:
        node = graph_utils.getNode(prefix + ':testme')
        failure_msg = ''.join((
            "prefix \"", prefix,
            "\" has an error...can't create graph node"))
        self.assertTrue(node is not None, failure_msg)

    return
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features.  At the moment,
    RO:has_subsequence is the default relationship between the regions,
    but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.
    """

    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        # NOTE(review): IAO "is about" is normally written IAO:0000136
        # (7 digits); this value has an extra zero -- confirm before changing,
        # other modules may carry the same spelling.
        'is_about': 'IAO:00000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        'has_staining_intensity': 'GENO:0000207',
        # was GENO:0000626 (staining_intensity),
        # but changing to has_sequence_attribute
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'
    }

    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    # union of all property kinds, keyed by local name
    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        # big P for Position type.  little p for position property
        'Position': 'faldo:Position',
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        # FIXME - score is not a good solution, too generic
        'score': 'SO:0001685',
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',

        # the following are sequence attributes:
        'band_intensity': 'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self, id, label, type, description=None):
        """
        :param id: identifier of the feature
        :param label: label of the feature
        :param type: identifier of the feature's type
        :param description: optional description
        """
        self.id = id
        self.label = label
        self.type = type
        self.description = description
        self.gu = GraphUtils(curie_map.get())
        # start/stop are location dicts built by _getLocation(), or None
        self.start = None
        self.stop = None
        self.nobnodes = True  # TODO remove this before official release
        return

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return:
        """
        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand,
                                       position_types)

        return

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return:
        """
        self.stop = self._getLocation(coordinate, reference_id, strand,
                                      position_types)

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return: the location dict
        """
        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        # an explicitly empty list of types gets the generic Position type
        if position_types == []:
            loc['type'].append(self.types['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        Map a strand symbol ('+', '-', '.') to its faldo position type.
        :param strand:
        :return: the faldo type id, or None when the strand is None
            or cannot be mapped (the latter is logged as a warning)
        """
        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, graph, add_region=True, region_id=None,
            feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
            faldo:location region_id
        region_id a faldo:region
            faldo:begin start_position
            faldo:end end_position
        start_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id
        end_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph:
        :param add_region: when False (or when neither start nor stop is
            set) the feature itself is typed as the faldo:Region
        :param region_id: identifier to use for the region; generated from
            the reference/coordinates/strand when None
        :param feature_as_class: add the feature as a class rather than
            an individual
        :return:
        """
        if feature_as_class:
            self.gu.addClassToGraph(graph, self.id, self.label, self.type,
                                    self.description)
        else:
            self.gu.addIndividualToGraph(graph, self.id, self.label,
                                         self.type, self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                # At least one of start/stop is set here.  Take the
                # reference from whichever end is known, so that a feature
                # with only a stop location does not fail on self.start.
                known_end = self.start if self.start is not None \
                    else self.stop
                regionchr = re.sub(r'\w+\:_?', '', known_end['reference'])

                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and \
                        self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(
                        self.start['type'])
                if self.stop is not None and \
                        self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    # fall back on the stop's strand only when the start
                    # did not supply one
                    if strand is None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                rid = '_'+rid+"-Region"
                region_id = rid
                if self.nobnodes:
                    region_id = ':'+region_id

            self.gu.addTriple(graph, self.id, self.properties['location'],
                              region_id)
            self.gu.addIndividualToGraph(
                graph, region_id, None, 'faldo:Region')
        else:
            # no separate region node: the feature itself is the region
            region_id = self.id
            self.gu.addType(graph, region_id, 'faldo:Region')

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(self.start['reference'],
                                          self.start['coordinate'],
                                          self.start['type'])
            self.addPositionToGraph(graph,
                                    self.start['reference'],
                                    self.start['coordinate'],
                                    self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(self.stop['reference'],
                                        self.stop['coordinate'],
                                        self.stop['type'])
            self.addPositionToGraph(graph,
                                    self.stop['reference'],
                                    self.stop['coordinate'],
                                    self.stop['type'])

        self.addRegionPositionToGraph(graph, region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

        return

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        :param tylist: list of position type ids
        :return: 'plus', 'minus' or 'both' according to the first strand
            type found in the list (checked in that order), else None
        """
        strand = None
        if self.types['plus_strand'] in tylist:
            strand = 'plus'
        elif self.types['minus_strand'] in tylist:
            strand = 'minus'
        elif self.types['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: the position identifier, or None (with an error logged)
            when no reference is given
        """
        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        i = '_'
        if self.nobnodes:
            i = ':'+i
        # drop the first "prefix:" from the reference
        reference = re.sub(r'\w+\:', '', reference, 1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        i += reference
        if coordinate is not None:
            # just in case it isn't a string already
            i = '-'.join((i, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                i = '-'.join((i, tstring))

        return i

    def addRegionPositionToGraph(
            self, graph, region_id, begin_position_id, end_position_id):
        """
        Link the region to its begin and end positions; a position id of
        None is silently skipped.
        :param graph:
        :param region_id:
        :param begin_position_id:
        :param end_position_id:
        :return:
        """
        if begin_position_id is None:
            pass
            # logger.warn(
            #   "No begin position specified for region %s", region_id)
        else:
            self.gu.addTriple(graph, region_id, self.properties['begin'],
                              begin_position_id)

        if end_position_id is None:
            pass
            # logger.warn("No end position specified for region %s", region_id)
        else:
            self.gu.addTriple(graph, region_id, self.properties['end'],
                              end_position_id)

        return

    def addPositionToGraph(
            self, graph, reference_id, position, position_types=None,
            strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.

        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph:
        :param reference_id:
        :param position:
        :param position_types:
        :param strand:
        :return:  Identifier of the position created
        """
        iid = self._makePositionId(reference_id, position, position_types)
        n = self.gu.getNode(iid)
        pos = self.gu.getNode(self.properties['position'])
        ref = self.gu.getNode(self.properties['reference'])
        if position is not None:
            graph.add((n, pos, Literal(position, datatype=XSD['integer'])))
        graph.add((n, ref, self.gu.getNode(reference_id)))
        if position_types is not None:
            for t in position_types:
                graph.add((n, RDF['type'], self.gu.getNode(t)))
        s = None
        if strand is not None:
            s = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                s = self._getStrandType(strand)
        # else:
        #    s = self.types['both_strand']
        if s is None and (position_types is None or position_types == []):
            s = self.types['Position']

        if s is not None:
            graph.add((n, RDF['type'], self.gu.getNode(s)))

        return iid

    def addSubsequenceOfFeature(self, graph, parentid):
        """
        This will add reciprocal triples like:
        feature is_subsequence_of parent
        parent has_subsequence feature
        :param graph:
        :param parentid:
        :return:
        """
        self.gu.addTriple(
            graph, self.id, self.properties['is_subsequence_of'], parentid)
        self.gu.addTriple(
            graph, parentid, self.properties['has_subsequence'], self.id)

        return

    def addTaxonToFeature(self, graph, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param graph:
        :param taxonid:
        :return:
        """
        # TEC: should taxon be set in __init__()?
        self.taxon = taxonid
        self.gu.addTriple(
            graph, self.id, Assoc.properties['in_taxon'], self.taxon)

        return

    def loadAllProperties(self, graph):
        """
        Load this class's annotation, object and data properties into
        the graph, each under its own property kind.
        :param graph:
        :return:
        """
        # the property-kind constants are read straight off the Assoc class;
        # no Assoc instance is needed for that
        prop_dict = {
            Assoc.ANNOTPROP: self.annotation_properties,
            Assoc.OBJECTPROP: self.object_properties,
            Assoc.DATAPROP: self.data_properties
        }

        for prop_kind, props in prop_dict.items():
            self.gu.loadProperties(graph, props, prop_kind)

        return

    def addFeatureProperty(self, graph, property_type, property):
        """
        Add the triple: feature property_type property
        :param graph:
        :param property_type:
        :param property:
        :return:
        """
        self.gu.addTriple(graph, self.id, property_type, property)

        return

    def setNoBNodes(self, nobnodes):
        """
        :param nobnodes: when true, generated region/position ids are
            prefixed with ':' (see addFeatureToGraph and _makePositionId)
        :return:
        """
        self.nobnodes = nobnodes

        return
class Genotype():
    """
    These methods provide convenient methods to add items related to
    a genotype and its parts to a supplied graph.
    They follow the patterns set out in GENO
    https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.
    """

    # special genotype parts mapped to their GENO and SO classes
    # that we explicitly reference here
    genoparts = {
        'intrinsic_genotype': 'GENO:0000000',
        'extrinsic_genotype': 'GENO:0000524',
        'effective_genotype': 'GENO:0000525',
        'genomic_background': 'GENO:0000611',
        'genomic_variation_complement': 'GENO:0000009',
        'karyotype_variation_complement': 'GENO:0000644',
        'variant_single_locus_complement': 'GENO:0000030',
        'variant_locus': 'GENO:0000002',
        'reference_locus': 'GENO:0000036',
        'allele': 'GENO:0000008',
        'gene': 'SO:0000704',
        'QTL': 'SO:0000771',
        'transgene': 'SO:0000902',
        'pseudogene': 'SO:0000336',
        'cytogenetic marker': 'SO:0000341',
        'sequence_feature': 'SO:0000110',
        'sequence_alteration': 'SO:0001059',
        'insertion': 'SO:0000667',
        'deletion': 'SO:0000159',
        'substitution': 'SO:1000002',
        'duplication': 'SO:1000035',
        'translocation': 'SO:0000199',
        'inversion': 'SO:1000036',
        'tandem_duplication': 'SO:1000173',
        'point_mutation': 'SO:1000008',
        'population': 'PCO:0000001',  # population
        'family': 'PCO:0000020',  # family
        'wildtype': 'GENO:0000511',
        'reagent_targeted_gene': 'GENO:0000504',
        'targeted_gene_subregion': 'GENO:0000534',
        'targeted_gene_complement': 'GENO:0000527',
        'biological_region': 'SO:0001411',
        'missense_variant': 'SO:0001583',
        'transcript': 'SO:0000233',
        'polypeptide': 'SO:0000104',
        'cDNA': 'SO:0000756',
        'sequence_variant_causing_loss_of_function_of_polypeptide':
            'SO:1000118',
        'sequence_variant_causing_gain_of_function_of_polypeptide':
            'SO:1000125',
        'sequence_variant_causing_inactive_catalytic_site': 'SO:1000120',
        'sequence_variant_affecting_polypeptide_function': 'SO:1000117',
        'regulatory_transgene_feature': 'GENO:0000638',
        'coding_transgene_feature': 'GENO:0000637',
        'protein_coding_gene': 'SO:0001217',
        'ncRNA_gene': 'SO:0001263'
    }

    object_properties = {
        'is_mutant_of': 'GENO:0000440',
        'derives_from': 'RO:0001000',
        'has_alternate_part': 'GENO:0000382',
        'has_reference_part': 'GENO:0000385',
        'in_taxon': 'RO:0002162',
        'has_zygosity': 'GENO:0000608',
        # links a alternate locus (instance) to a gene (class)
        'is_sequence_variant_instance_of': 'GENO:0000408',
        'targets_instance_of': 'GENO:0000414',
        'is_reference_instance_of': 'GENO:0000610',
        'has_part': 'BFO:0000051',
        # use this when relating populations
        'has_member_with_allelotype': 'GENO:0000225',
        'is_allelotype_of': 'GENO:0000206',
        'has_genotype': 'GENO:0000222',
        'has_phenotype': 'RO:0002200',
        'transcribed_to': 'RO:0002205',
        'translates_to': 'RO:0002513',
        'is_targeted_expression_variant_of': 'GENO:0000443',
        'is_transgene_variant_of': 'GENO:0000444',
        'has_expression-variant_part': 'GENO:0000532',
        # between a (reagent-targeted gene) and a morpholino
        'targeted_by': 'GENO:0000634',
        # FIXME should this just be subsequence of?
        'derives_sequence_from_gene': 'GENO:0000639',
        'feature_to_gene_relation': 'GENO:0000418'
    }

    annotation_properties = {
        # TODO change properties with
        # https://github.com/monarch-initiative/GENO-ontology/issues/21
        'reference_nucleotide': 'GENO:reference_nucleotide',  # Made up term
        'reference_amino_acid': 'GENO:reference_amino_acid',  # Made up term
        'altered_nucleotide': 'GENO:altered_nucleotide',  # Made up term
        'results_in_amino_acid_change':
            'GENO:results_in_amino_acid_change'  # Made up term
    }

    zygosity = {
        'homoplasmic': 'GENO:0000602',
        'heterozygous': 'GENO:0000135',
        'indeterminate': 'GENO:0000137',
        'heteroplasmic': 'GENO:0000603',
        'hemizygous-y': 'GENO:0000604',
        'hemizygous-x': 'GENO:0000605',
        'homozygous': 'GENO:0000136',
        'hemizygous': 'GENO:0000606',
        'complex_heterozygous': 'GENO:0000402',
        'simple_heterozygous': 'GENO:0000458'
    }

    # object + annotation properties, keyed by local name
    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        """
        :param graph: the graph that all add* methods write to; the
            object properties are loaded into it immediately
        """
        self.gu = GraphUtils(curie_map.get())

        self.graph = graph

        self.gu.loadProperties(self.graph, self.object_properties,
                               self.gu.OBJPROP)

        return

    def addGenotype(self, genotype_id, genotype_label, genotype_type=None,
                    genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:
        """
        if genotype_type is None:
            genotype_type = self.genoparts['intrinsic_genotype']

        self.gu.addIndividualToGraph(self.graph, genotype_id, genotype_label,
                                     genotype_type, genotype_description)

        return

    def addAllele(self, allele_id, allele_label, allele_type=None,
                  allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type
            (optional, recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:
        """
        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.genoparts['allele']  # TODO is this a good idea?
        self.gu.addIndividualToGraph(self.graph, allele_id, allele_label,
                                     allele_type, allele_description)

        return

    def addGene(self, gene_id, gene_label, gene_type=None,
                gene_description=None):
        """
        Add a gene as a class; the type defaults to genoparts['gene'].
        :param gene_id:
        :param gene_label:
        :param gene_type:
        :param gene_description:
        :return:
        """
        if gene_type is None:
            gene_type = self.genoparts['gene']
        # genes are classes
        self.gu.addClassToGraph(self.graph, gene_id, gene_label, gene_type,
                                gene_description)

        return

    def addConstruct(self, construct_id, construct_label, construct_type=None,
                     construct_description=None):
        """
        Add a construct as an individual; no default type is applied.
        :param construct_id:
        :param construct_label:
        :param construct_type:
        :param construct_description:
        :return:
        """
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    constrcut_type=self.construct_base_type
        self.gu.addIndividualToGraph(self.graph, construct_id,
                                     construct_label, construct_type,
                                     construct_description)

        return

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and its parent genotype.
        Adding the parent and child to the graph should happen
        outside of this function call to ensure graph integrity.
        :param child_id:
        :param parent_id:
        :return:
        """
        self.gu.addTriple(self.graph, child_id,
                          self.properties['derives_from'], parent_id)

        return

    def addSequenceDerivesFrom(self, child_id, parent_id):
        """
        Add the triple: child derives_sequence_from_gene parent
        :param child_id:
        :param parent_id:
        :return:
        """
        self.gu.addTriple(self.graph, child_id,
                          self.properties['derives_sequence_from_gene'],
                          parent_id)

        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a GENO:is_sequence_variant_instance_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:
        """
        if rel_id is None:
            rel_id = self.properties['is_sequence_variant_instance_of']
        self.gu.addTriple(self.graph, allele_id, rel_id, gene_id)

        return

    def addTranscript(self, variant_id, transcript_id, transcript_label=None,
                      transcript_type=None):
        """
        Add gene/variant/allele transcribes_to relationship
        :param variant_id:
        :param transcript_id:
        :param transcript_label:
        :param transcript_type:
        :return:
        """
        self.gu.addIndividualToGraph(self.graph, transcript_id,
                                     transcript_label, transcript_type)
        self.gu.addTriple(self.graph, variant_id,
                          self.properties['transcribed_to'], transcript_id)

        return

    def addPolypeptide(self, polypeptide_id, polypeptide_label=None,
                       transcript_id=None, polypeptide_type=None, ):
        """
        Add a polypeptide individual (type defaults to
        genoparts['polypeptide']) and, when a transcript is given,
        the triple: transcript translates_to polypeptide
        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:
        """
        if polypeptide_type is None:
            polypeptide_type = self.genoparts['polypeptide']
        self.gu.addIndividualToGraph(self.graph, polypeptide_id,
                                     polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.gu.addTriple(self.graph, transcript_id,
                              self.properties['translates_to'],
                              polypeptide_id)

        return

    def addPartsToVSLC(self, vslc_id, allele1_id, allele2_id, zygosity_id=None,
                       allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are traditionally added, you can add any
        node (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:
        """
        # vslc has parts allele1/allele2
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.zygosity['homozygous']
            else:
                zygosity_id = self.zygosity['heterozygous']

        if zygosity_id is not None:
            self.gu.addTriple(self.graph, vslc_id,
                              self.properties['has_zygosity'], zygosity_id)

        return

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:
        """
        self.addParts(vslc_id, parent_id,
                      self.properties['has_alternate_part'])

        return

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:
        """
        if part_relationship is None:
            part_relationship = self.properties['has_part']

        self.gu.addTriple(self.graph, parent_id, part_relationship, part_id)

        return

    def addSequenceAlteration(self, sa_id, sa_label, sa_type=None,
                              sa_description=None):
        """
        Add a sequence alteration individual; the type defaults to
        genoparts['sequence_alteration'].
        :param sa_id:
        :param sa_label:
        :param sa_type:
        :param sa_description:
        :return:
        """
        if sa_type is None:
            sa_type = self.genoparts['sequence_alteration']
        self.gu.addIndividualToGraph(self.graph, sa_id, sa_label, sa_type,
                                     sa_description)

        return

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        """
        Add the triple: variant_locus has_alternate_part sequence_alteration
        :param sa_id:
        :param vl_id:
        :return:
        """
        self.addParts(sa_id, vl_id, self.properties['has_alternate_part'])

        return

    def addGenomicBackground(self, background_id, background_label,
                             background_type=None,
                             background_description=None):
        """
        Add a genomic background individual; the type defaults to
        genoparts['genomic_background'].
        :param background_id:
        :param background_label:
        :param background_type:
        :param background_description:
        :return:
        """
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.gu.addIndividualToGraph(self.graph, background_id,
                                     background_label, background_type,
                                     background_description)

        return

    def addGenomicBackgroundToGenotype(self, background_id, genotype_id):
        """
        Type the background as a genomic_background and add it to the
        genotype with has_reference_part.
        :param background_id:
        :param genotype_id:
        :return:
        """
        self.gu.addType(self.graph, background_id,
                        self.genoparts['genomic_background'])
        self.addParts(background_id, genotype_id,
                      self.object_properties['has_reference_part'])

        return

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part
        (including a gene, regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:
        :return:
        """
        in_taxon = self.gu.getNode(self.properties['in_taxon'])
        s = self.gu.getNode(genopart_id)
        self.graph.add((s, in_taxon, self.gu.getNode(taxon_id)))

        return

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Add the triple: genotype has_expression-variant_part reagent
        :param reagent_id:
        :param genotype_id:
        :return:
        """
        # for example, add a morphant reagent thingy to the genotype,
        # assuming it's a extrinsic_genotype
        p = self.object_properties['has_expression-variant_part']
        self.gu.addTriple(self.graph, genotype_id, p, reagent_id)

        return

    def addGeneTargetingReagent(self, reagent_id, reagent_label, reagent_type,
                                gene_id, description=None):
        """
        Here, a gene-targeting reagent is added.
        The actual targets of this reagent should be added separately.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:
        :param gene_id: the gene linked to the reagent with
            targets_instance_of
        :param description:
        :return:
        """
        # TODO add default type to reagent_type
        self.gu.addIndividualToGraph(self.graph, reagent_id, reagent_label,
                                     reagent_type, description)

        self.gu.addTriple(self.graph, reagent_id,
                          self.object_properties['targets_instance_of'],
                          gene_id)

        return

    def addReagentTargetedGene(self, reagent_id, gene_id,
                               targeted_gene_id=None,
                               targeted_gene_label=None, description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied,
        we will create it as an anonymous individual which is of the type
        GENO:reagent_targeted_gene.
        We will also add the targets relationship between the reagent and
        gene class.

        <targeted_gene_id> a GENO:reagent_targeted_gene
        rdf:label targeted_gene_label
        dc:description description
        <reagent_id> GENO:targets_instance_of <gene_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :param targeted_gene_label:
        :param description:
        :return:
        """
        # akin to a variant locus
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
        self.gu.addIndividualToGraph(self.graph, targeted_gene_id,
                                     targeted_gene_label,
                                     self.genoparts['reagent_targeted_gene'],
                                     description)

        self.gu.addTriple(
            self.graph, targeted_gene_id,
            self.object_properties['is_targeted_expression_variant_of'],
            gene_id)

        self.gu.addTriple(self.graph, targeted_gene_id,
                          self.object_properties['targeted_by'], reagent_id)

        return

    def addTargetedGeneSubregion(self, tgs_id, tgs_label, tgs_type=None,
                                 tgs_description=None):
        """
        Add a targeted gene subregion individual; the type defaults to
        genoparts['targeted_gene_subregion'].
        :param tgs_id:
        :param tgs_label:
        :param tgs_type:
        :param tgs_description:
        :return:
        """
        if tgs_type is None:
            tgs_type = self.genoparts['targeted_gene_subregion']
        self.gu.addIndividualToGraph(self.graph, tgs_id, tgs_label, tgs_type,
                                     tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Add the triple: population has_member_with_allelotype member
        :param member_id:
        :param population_id:
        :return:
        """
        self.gu.addTriple(self.graph, population_id,
                          self.properties['has_member_with_allelotype'],
                          member_id)

        return

    def addTargetedGeneComplement(self, tgc_id, tgc_label, tgc_type=None,
                                  tgc_description=None):
        """
        Add a targeted gene complement individual; the type defaults to
        genoparts['targeted_gene_complement'].
        :param tgc_id:
        :param tgc_label:
        :param tgc_type:
        :param tgc_description:
        :return:
        """
        if tgc_type is None:
            tgc_type = self.genoparts['targeted_gene_complement']
        self.gu.addIndividualToGraph(self.graph, tgc_id, tgc_label, tgc_type,
                                     tgc_description)

        return

    def addGenome(self, taxon_id, taxon_label=None):
        """
        Add the genome of a taxon as a class, labelled "<taxon> genome";
        the taxon id is used as the label when none is given.
        :param taxon_id:
        :param taxon_label:
        :return:
        """
        if taxon_label is None:
            taxon_label = taxon_id
        genome_label = taxon_label+' genome'
        genome_id = self.makeGenomeID(taxon_id)
        self.gu.addClassToGraph(self.graph, genome_id, genome_label,
                                Feature.types['genome'])

        return

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add a genome build as a reference_genome individual that is also
        an instance of the taxon's genome class, and attach the taxon.
        :param build_id:
        :param build_label:
        :param taxon_id:
        :return:
        """
        genome_id = self.makeGenomeID(taxon_id)
        self.gu.addIndividualToGraph(self.graph, build_id, build_label,
                                     Feature.types['reference_genome'])
        self.gu.addType(self.graph, build_id, genome_id)
        self.addTaxon(taxon_id, build_id)

        return

    def makeGenomeID(self, taxon_id):
        """
        :param taxon_id:
        :return: the taxon id with everything up to its last colon replaced
            by ':' and 'genome' appended, e.g. 'NCBITaxon:9606' becomes
            ':9606genome'
        """
        # scrub off the taxon prefix.  put it in base space
        # (raw string: '\:' in a plain literal is an invalid escape sequence)
        genome_id = re.sub(r'.*:', ':', taxon_id) + 'genome'

        return genome_id

    def addChromosome(self, chr, tax_id, tax_label=None, build_id=None,
                      build_label=None):
        """
        Add a chromosome class at the taxon level and, when a build is
        given, a build-specific chromosome instance that is a member of
        that build.
        :param chr: chromosome name or number
        :param tax_id:
        :param tax_label:
        :param build_id:
        :param build_label: defaults to build_id
        :return:
        """
        # if it's just the chromosome, add it as an instance of a
        # SO:chromosome, and add it to the genome.
        # if a build is included, punn the chromosome as a subclass of
        # SO:chromsome, and make the build-specific chromosome an instance of
        # the supplied chr.  The chr then becomes part of the build or genome.

        # normalise once so every id and label is built from a string
        chrom = str(chr)

        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(chrom, tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chrom, tax_label)
        else:
            chr_label = makeChromLabel(chrom)
        genome_id = self.makeGenomeID(tax_id)
        self.gu.addClassToGraph(self.graph, chr_id, chr_label,
                                Feature.types['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chrom, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chrom, build_label)
            # add the build-specific chromosome as an instance of the
            # chr class
            self.gu.addIndividualToGraph(self.graph, chrinbuild_id,
                                         chrinbuild_label, chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            self.gu.addMember(self.graph, build_id, chrinbuild_id)
            self.gu.addMemberOf(self.graph, chrinbuild_id, build_id)

        return

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic (taxon-level) chromosome class.
        :param chrom_num:
        :param taxon_id:
        :param taxon_label:
        :return:
        """
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.gu.addClassToGraph(self.graph, chrom_class_id, chrom_class_label,
                                Feature.types['chromosome'])

        return

    def addChromosomeInstance(self, chr_num, reference_id, reference_label,
                              chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
            typically a genome-specific chr
        :return:
        """
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.gu.addIndividualToGraph(self.graph, chr_id, chr_label,
                                     Feature.types['chromosome'])
        # chr_type is optional; only add the extra type when one was given
        if chr_type is not None:
            self.gu.addType(self.graph, chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        self.gu.addMember(self.graph, reference_id, chr_id)
        self.gu.addMemberOf(self.graph, chr_id, reference_id)

        return

    def make_variant_locus_label(self, gene_label, allele_label):
        """
        Build a label of the form gene<allele>.  A missing (None) gene or
        allele label is treated as the empty string.
        :param gene_label:
        :param allele_label:
        :return: the label
        """
        if gene_label is None:
            gene_label = ''
        # make_vslc_label() may pass a None allele label through
        if allele_label is None:
            allele_label = ''

        label = gene_label.strip()+'<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return: "top/bottom", where each side is a variant locus label
            and the bottom is empty when allele2_label is None;
            None (with an error logged) when all three labels are None
        """
        if (gene_label is None and
                allele1_label is None and allele2_label is None):
            logger.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        return '/'.join((top, bottom))
class Assoc:
    """
    An abstract class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence on statements.

    An association is built up with the set_*/add_* methods and then
    written out with add_association_to_graph(), which emits both the direct
    subject-predicate-object triple and a reified association node carrying
    the description, evidence, sources, provenance and score.
    """

    assoc_types = {
        'association': 'OBAN:association'
    }

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
    }

    object_properties = {
        'has_disposition': 'GENO:0000208',
        'has_phenotype': 'RO:0002200',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_object_property',
        # IAO "is about"; the local id has seven digits
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004'
    }

    # the union of all three property maps, keyed by local name
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    OWLCLASS = OWL['Class']
    OWLIND = OWL['NamedIndividual']
    OBJECTPROP = OWL['ObjectProperty']
    ANNOTPROP = OWL['AnnotationProperty']
    DATAPROP = OWL['DatatypeProperty']

    SUBCLASS = RDFS['subClassOf']
    BASE = Namespace(curie_map.get()[''])

    def __init__(self, definedby):
        """
        :param definedby: the (data) resource providing the association;
            it is the first component hashed into a generated association id
        """
        self.cu = CurieUtil(curie_map.get())
        self.gu = GraphUtils(curie_map.get())

        # core parts of the association
        self.definedby = definedby
        self.sub = self.obj = self.rel = None
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

    def get_properties(self):
        """Return the combined map of all property names to identifiers."""
        return self.properties

    def _is_valid(self):
        """
        Check that subject, object and relation have all been set.

        :return: True when all three are set
        :raises ValueError: naming the first missing part
        """
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self, g):
        """
        Write this association into graph g.

        Adds the direct (subject, relation, object) triple, then the
        association node with its type, subject/object/predicate links and
        whichever of description, evidence, sources, provenance and score
        have been set.  If no association id was set, one is generated.
        If a node cannot be obtained for the subject, relation or object,
        an error is logged and nothing is added.

        :param g: the graph to add to
        :raises ValueError: if subject, object or relation is unset
        """
        if not self._is_valid():
            return

        # first, add the direct triple
        # anonymous (blank) nodes are indicated with underscore
        sub_node = self.gu.getNode(self.sub)
        obj_node = self.gu.getNode(self.obj)
        rel_node = self.gu.getNode(self.rel)

        if sub_node is None:
            logger.error(
                "Unable to retrieve graph node for Subject %s ", self.sub)
            return
        if rel_node is None:
            logger.error(
                "Unable to retrieve graph node for Predicate %s ", self.rel)
            return
        if obj_node is None:
            logger.error(
                "Unable to retrieve graph node for Object %s ", self.obj)
            return
        g.add((sub_node, rel_node, obj_node))

        if self.assoc_id is None:
            self.set_association_id()

        assoc_node = self.gu.getNode(self.assoc_id)
        g.add((assoc_node, RDF['type'],
               self.gu.getNode(self.assoc_types['association'])))

        self.gu.addTriple(
            g, self.assoc_id, self.object_properties['has_subject'], self.sub)
        self.gu.addTriple(
            g, self.assoc_id, self.object_properties['has_object'], self.obj)
        self.gu.addTriple(
            g, self.assoc_id, self.object_properties['has_predicate'],
            self.rel)

        if self.description is not None:
            self.gu.addDescription(g, self.assoc_id, self.description)

        # each of these lists may have been reset to None by a caller,
        # hence the "or ()" fallbacks
        for evidence_id in self.evidence or ():
            self.gu.addTriple(
                g, self.assoc_id, self.object_properties['has_evidence'],
                evidence_id)

        for source_id in self.source or ():
            if re.match('http', source_id):
                # TODO assume that the source is a publication?
                # use Reference class here
                # sources starting with http get the extra flag set
                self.gu.addTriple(
                    g, self.assoc_id, self.object_properties['has_source'],
                    source_id, True)
            else:
                self.gu.addTriple(
                    g, self.assoc_id, self.object_properties['has_source'],
                    source_id)

        for provenance_id in self.provenance or ():
            self.gu.addTriple(
                g, self.assoc_id, self.object_properties['has_provenance'],
                provenance_id)

        if self.score is not None:
            self.gu.addTriple(
                g, self.assoc_id, self.properties['has_measurement'],
                Literal(self.score, datatype=XSD['float']), True)
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

    def add_association_to_graph(self, g):
        """
        Add this association to graph g.

        :param g: the graph to add to
        :raises ValueError: if subject, object or relation is unset
        """
        self._add_basic_association_to_graph(g)

    def set_subject(self, identifier):
        """Set the subject identifier of the association."""
        self.sub = identifier

    def set_object(self, identifier):
        """Set the object identifier of the association."""
        self.obj = identifier

    def set_relationship(self, identifier):
        """Set the relation (predicate) identifier of the association."""
        self.rel = identifier

    def set_association_id(self, assoc_id=None):
        """
        Set the association ID.

        When assoc_id is given (for example an externally supplied
        association identifier) it is used as-is; otherwise the ID is
        generated from definedby, subject, relation and object.

        :param assoc_id: identifier to use, or None to generate one
        :return: None
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

    def get_association_id(self):
        """Return the association ID (None until it has been set)."""
        return self.assoc_id

    def set_description(self, description):
        """Set the free-text description attached to the association."""
        self.description = description

    def set_score(self, score, unit=None, score_type=None):
        """
        Record a score for the association.

        Only the score value is currently written to the graph; unit and
        score_type are stored on the object.

        :param score:
        :param unit:
        :param score_type:
        """
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        None and blank identifiers are ignored.

        :param identifier:
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence.append(identifier)

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier:
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.source.append(identifier)

    def add_provenance(self, identifier):
        """
        Add a provenance identifier to the association object
        (maintained as a list).  None and blank identifiers are ignored.

        :param identifier:
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.provenance.append(identifier)

    def load_all_properties(self, g):
        """
        Load the object, annotation and datatype property maps into graph g,
        each under its corresponding OWL property type.

        :param g: the graph to load the properties into
        """
        props = {
            self.OBJECTPROP: self.object_properties,
            self.ANNOTPROP: self.annotation_properties,
            self.DATAPROP: self.datatype_properties
        }

        for prop_type, prop_map in props.items():
            self.gu.loadProperties(g, prop_map, prop_type)

    def _get_source_uri(self, pub_id):
        """
        Given some kind of pub_id (which might be a CURIE or url),
        convert it into a proper node.

        :param pub_id:
        :return: source: Well-formed URI for the given identifier (or url),
            or None (after logging an error) if it cannot be resolved
        """
        if re.match('http', pub_id):
            return URIRef(pub_id)

        node = self.gu.getNode(pub_id)
        if node is not None:
            return URIRef(node)

        logger.error("An id we don't know how to deal with: %s", pub_id)
        return None

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is None, it will convert it to blank;
        any other non-string item is converted with str().
        It effectively md5 hashes the (+)-joined string from the values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be added to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes: optional iterable of extra values to hash
        :return: a CURIE of the form 'MONARCH:<md5 hexdigest>'
        """
        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # TEC: at our scale, md5 is in danger of having collisions.

        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash.extend(attributes)

        # str() keeps the join from failing on non-string parts
        # (e.g. a numeric attribute); strings pass through unchanged
        byte_string = '+'.join(
            '' if val is None else str(val) for val in items_to_hash
        ).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(('MONARCH', hashlib.md5(byte_string).hexdigest()))