def __init__(self, identifier, title, url, description=None,
             license_url=None, data_rights=None, graph_type=None,
             file_handle=None):
    """Create a Dataset node and attach its core metadata triples.

    :param identifier: local identifier; stored with a ':' prefix as a CURIE
    :param title: dct:title literal for the dataset
    :param url: target of the foaf:page triple
    :param description: optional free-text description of the dataset
    :param license_url: optional dct:license target (our license)
    :param data_rights: optional dct:rights literal (their license page)
    :param graph_type: None | 'rdf_graph' | 'streamed_graph'
    :param file_handle: destination handle when streaming
    :raises ValueError: on an unrecognized graph_type
        (FIX: previously fell through all branches, leaving self.graph
        unset and crashing later with AttributeError)
    """
    if graph_type is None or graph_type == 'rdf_graph':
        self.graph = RDFGraph()
    elif graph_type == 'streamed_graph':
        self.graph = StreamedGraph(True, file_handle=file_handle)
    else:
        raise ValueError(
            "%s graph type not supported; "
            "valid types: rdf_graph, streamed_graph" % graph_type)
    self.model = Model(self.graph)
    self.identifier = ':' + identifier
    self.version = None
    self.date_issued = None

    # The date_accessed value is later used as an object literal of
    # properties such as dct:issued, which must conform to xsd:dateTime.
    # self.date_accessed = datetime.now().strftime('%Y-%m-%d-%H-%M')
    self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
    self.citation = set()
    self.license = license_url
    self.model.addType(self.identifier, 'dctypes:Dataset')
    self.graph.addTriple(self.identifier, 'dct:title', title, True)
    self.graph.addTriple(
        self.identifier, 'dct:identifier', identifier, object_is_literal=True)
    self.graph.addTriple(self.identifier, 'foaf:page', url)
    # maybe in the future add the logo here:
    # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

    # TODO add the licence info
    # FIXME: Temporarily making this in IF statement,
    # can revert after all current resources are updated.
    if license_url is not None:
        self.graph.addTriple(self.identifier, 'dct:license', license_url)
    else:
        logger.debug('No license provided.')
    if data_rights is not None:
        self.graph.addTriple(
            self.identifier, 'dct:rights', data_rights, object_is_literal=True)
    else:
        logger.debug('No rights provided.')
    if description is not None:
        self.model.addDescription(self.identifier, description)
def __init__(
        self,
        graph_type='rdf_graph',     # or streamed_graph
        are_bnodes_skized=False,    # typically True
        data_release_version=None,
        name=None,                  # identifier; make an URI for nquads
        ingest_title=None,
        ingest_url=None,
        ingest_logo=None,           # name of a file on 'MonarchLogoRepo'
        ingest_description=None,
        license_url=None,           # only if it is _our_ lic
        data_rights=None,           # their page that points to their current lic
        file_handle=None,
):
    """Set up a Source: directories, test graph, main graph and Dataset.

    :param graph_type: 'rdf_graph' (in-memory) or 'streamed_graph'
    :param are_bnodes_skized: skolemize blank nodes in the main graph
    :param data_release_version: release tag forwarded to the Dataset
    :param name: ingest identifier; falls back to whoami() when None
    :param file_handle: forwarded to the Dataset for streamed output
    """
    # pull in the common test identifiers
    self.all_test_ids = self.open_and_parse_yaml('../../resources/test_ids.yaml')
    self.graph_type = graph_type
    self.are_bnodes_skized = are_bnodes_skized
    self.data_release_version = data_release_version
    self.ingest_title = ingest_title
    self.ingest_url = ingest_url
    self.ingest_logo = ingest_logo
    self.ingest_description = ingest_description
    self.license_url = license_url
    self.data_rights = data_rights
    self.localtt = self.load_local_translationtable(name)
    self.remote_file_timestamps = dict()
    if name is not None:
        self.name = name.lower()
    elif self.whoami() is not None:
        self.name = self.whoami().lower()
    LOG.info("Processing Source \"%s\"", self.name)
    self.test_only = False
    self.path = ""
    # to be used to store a subset of data for testing downstream.
    self.triple_count = 0
    self.outdir = 'out'
    self.testdir = 'tests'
    self.rawdir = '/'.join(('raw', self.name))
    # FIX: was `name + "_test"`, which raised TypeError when name was None
    # and the identifier came from whoami(); keep the caller-supplied
    # casing when a name was provided (self.name is lower-cased).
    base_name = name if name is not None else self.name
    self.testname = base_name + "_test"
    self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))
    self.datasetfile = None

    # if raw data dir doesn't exist, create it
    if not os.path.exists(self.rawdir):
        os.makedirs(self.rawdir)
        raw_pth = os.path.abspath(self.rawdir)
        LOG.info("creating raw directory for %s at %s", self.name, raw_pth)
    # else: raw data dir does exist. maybe should consider what is in it?

    # if output dir doesn't exist, create it
    if not os.path.exists(self.outdir):
        os.makedirs(self.outdir)
        out_pth = os.path.abspath(self.outdir)
        LOG.info("created output directory %s", out_pth)
    else:
        out_pth = os.path.abspath(self.outdir)

    LOG.info("Creating Test graph %s", self.testname)
    # note: tools such as protege need skolemized blank nodes
    self.testgraph = RDFGraph(True, self.testname)

    if graph_type == 'rdf_graph':
        graph_id = ':MONARCH_' + str(self.name) + "_" + \
            datetime.now().isoformat(' ').split()[0]
        LOG.info("Creating graph %s", graph_id)
        self.graph = RDFGraph(are_bnodes_skized, graph_id)
    elif graph_type == 'streamed_graph':
        # need to expand on export formats
        # FIX: was `name + '.nt'` which breaks when name is None
        dest_file = open(out_pth + '/' + base_name + '.nt', 'w')  # where is the close?
        self.graph = StreamedGraph(are_bnodes_skized, dest_file)
        # leave test files as turtle (better human readability)
    else:
        LOG.error(
            "%s graph type not supported\n"
            "valid types: rdf_graph, streamed_graph", graph_type)

    # pull in global ontology mapping datastructures
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    self.curie_map = self.graph.curie_map
    # self.prefix_base = {v: k for k, v in self.curie_map.items()}

    # will be set to True if the intention is
    # to only process and write the test data
    self.test_only = False
    self.test_mode = False

    # prefer the subclass docstring over a passed-in description
    if self.ingest_description and getdoc(self) is not None:
        self.ingest_description = getdoc(self)

    self.dataset = Dataset(
        identifier=self.name,
        data_release_version=self.data_release_version,
        ingest_name=self.name,
        ingest_title=self.ingest_title,
        ingest_url=self.ingest_url,
        ingest_logo=self.ingest_logo,
        ingest_description=self.ingest_description,  # description
        license_url=self.license_url,    # only _OUR_ lic
        data_rights=self.data_rights,    # tries to point to others lics
        graph_type=graph_type,
        file_handle=file_handle
    )

    # see jenkins file: human, mouse, zebrafish, fly, worm (rat)
    self.COMMON_TAXON = ['9606', '10090', '7955', '7227', '6239']  # '10116'
def __init__(
        self,
        identifier,       # name? should be Archive url via Source
        title,
        url,
        ingest_desc=None,
        license_url=None,
        data_rights=None,
        graph_type='rdf_graph',   # rdf_graph, streamed_graph
        file_handle=None):
    """Create a Dataset node and attach its core metadata triples.

    :param identifier: dataset identifier, also used as the graph id
    :param title: dcterms:title literal; identifier is used when None
    :param url: optional foaf:page target
    :param ingest_desc: optional description added to the model
    :param license_url: optional dcterms:license target (our license)
    :param data_rights: optional dcterms:rights literal
    :param graph_type: None | 'rdf_graph' | 'streamed_graph'
    :param file_handle: destination handle when streaming
    :raises ValueError: on an unrecognized graph_type
        (FIX: previously fell through all branches, leaving self.graph
        unset and crashing later with AttributeError)
    """
    if graph_type is None:
        self.graph = RDFGraph(None, identifier)
    elif graph_type == 'streamed_graph':
        self.graph = StreamedGraph(True, identifier, file_handle=file_handle)
    elif graph_type == 'rdf_graph':
        self.graph = RDFGraph(True, identifier)
    else:
        raise ValueError(
            "%s graph type not supported; "
            "valid types: rdf_graph, streamed_graph" % graph_type)
    self.model = Model(self.graph)
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    self.curie_map = self.graph.curie_map
    # TODO: move hard coded curies to translation table calls
    self.identifier = identifier
    if title is None:
        self.title = identifier
    else:
        self.title = title
    self.version = None
    self.date_issued = None

    # The date_accessed value is later used as a literal of properties
    # such as dcterms:issued, which needs to conform to xsd:dateTime format.
    # TODO ... we need to have a talk about typed literals and SPARQL
    self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
    self.citation = set()
    self.license_url = license_url
    self.model.addType(self.identifier, 'dctypes:Dataset')
    self.graph.addTriple(self.identifier, 'dcterms:title', title, True)
    self.graph.addTriple(self.identifier, 'dcterms:identifier', identifier, True)
    if url is not None:
        self.graph.addTriple(self.identifier, 'foaf:page', url)
    # maybe in the future add the logo here:
    # schemaorg:logo <uri>

    # TODO add the license info
    # FIXME: Temporarily making this in IF statement,
    # can revert after all current resources are updated.
    if license_url is not None:
        self.graph.addTriple(self.identifier, 'dcterms:license', license_url)
    else:
        LOG.debug('No license provided.')
    if data_rights is not None:
        self.graph.addTriple(
            self.identifier, 'dcterms:rights', data_rights, object_is_literal=True)
    else:
        LOG.debug('No rights provided.')
    if ingest_desc is not None:
        self.model.addDescription(self.identifier, ingest_desc)
def __init__(self, graph_type, are_bnodes_skized=False, name=None):
    """Set up a Source: directories, output files, and graphs.

    :param graph_type: 'rdf_graph' (in-memory) or 'streamed_graph'
    :param are_bnodes_skized: skolemize blank nodes in the graphs
    :param name: ingest identifier; used to build raw/output paths
    """
    self.graph_type = graph_type
    self.are_bnodes_skized = are_bnodes_skized

    if name is not None:
        logger.info("Processing Source \"%s\"", name)
    self.testOnly = False
    self.name = name
    self.path = ""
    # to be used to store a subset of data for testing downstream.
    self.triple_count = 0
    self.outdir = 'out'
    self.testdir = 'tests'
    self.rawdir = 'raw'
    self.dataset = None
    # set to True if you want to materialize identifiers for BNodes
    if self.name is not None:
        self.rawdir = '/'.join((self.rawdir, self.name))
        self.testfile = '/'.join((self.outdir, self.name + "_test.ttl"))
        logger.info("Setting testfile to %s", self.testfile)
        self.datasetfile = '/'.join(
            (self.outdir, self.name + '_dataset.ttl'))
        logger.info("Setting dataset file to %s", self.datasetfile)

    # if raw data dir doesn't exist, create it
    if not os.path.exists(self.rawdir):
        os.makedirs(self.rawdir)
        p = os.path.abspath(self.rawdir)
        logger.info("creating raw directory for %s at %s", self.name, p)

    # if output dir doesn't exist, create it
    if not os.path.exists(self.outdir):
        os.makedirs(self.outdir)
        p = os.path.abspath(self.outdir)
        logger.info("created output directory %s", p)

    if graph_type == 'rdf_graph':
        self.graph = RDFGraph(are_bnodes_skized)  # TODO named graph IRI?
        self.testgraph = RDFGraph(True)
    elif graph_type == 'streamed_graph':
        # FIX: this branch read self.outfile, which was never assigned
        # (the assignment above was commented out as redundant), so it
        # always raised AttributeError; build the .nt path directly.
        source_file = open(
            '/'.join((self.outdir, self.name + ".nt")), 'w')
        test_file = open(self.testfile.replace(".ttl", ".nt"), 'w')
        self.graph = StreamedGraph(are_bnodes_skized, source_file)
        self.testgraph = StreamedGraph(are_bnodes_skized, test_file)
    else:
        logger.error(
            "{} graph type not supported\n"
            "valid types: rdf_graph, streamed_graph".format(graph_type))

    # will be set to True if the intention is
    # to only process and write the test data
    self.testOnly = False
    self.testMode = False

    for g in [self.graph, self.testgraph]:
        self.declareAsOntology(g)
def __init__(
        self,
        graph_type='rdf_graph',    # or streamed_graph
        are_bnodes_skized=False,   # typically True
        name=None,                 # identifier; make an IRI for nquads
        ingest_title=None,
        ingest_url=None,
        license_url=None,          # only if it is _our_ lic
        data_rights=None,          # external page that points to their current lic
        file_handle=None):
    """Set up a Source: directories, test graph, main graph and Dataset.

    :param graph_type: 'rdf_graph' (in-memory) or 'streamed_graph'
    :param are_bnodes_skized: skolemize blank nodes in the main graph
    :param name: ingest identifier; falls back to whoami() when None
    :param file_handle: forwarded to the Dataset for streamed output
    """
    # pull in the common test identifiers
    self.all_test_ids = self.open_and_parse_yaml(
        '../../resources/test_ids.yaml')
    self.graph_type = graph_type
    self.are_bnodes_skized = are_bnodes_skized
    self.ingest_url = ingest_url
    self.ingest_title = ingest_title
    self.localtt = self.load_local_translationtable(name)
    if name is not None:
        self.name = name.lower()
    elif self.whoami() is not None:
        self.name = self.whoami().lower()
    LOG.info("Processing Source \"%s\"", self.name)
    self.test_only = False
    self.path = ""
    # to be used to store a subset of data for testing downstream.
    self.triple_count = 0
    self.outdir = 'out'
    self.testdir = 'tests'
    self.rawdir = '/'.join(('raw', self.name))
    # FIX: was `name + "_test"`, which raised TypeError when name was None
    # and the identifier came from whoami(); keep the caller-supplied
    # casing when a name was provided (self.name is lower-cased).
    base_name = name if name is not None else self.name
    self.testname = base_name + "_test"
    self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))
    self.datasetfile = None

    # still need to pull in file suffix -- this is a curie not a url
    self.archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'

    # if raw data dir doesn't exist, create it
    if not os.path.exists(self.rawdir):
        os.makedirs(self.rawdir)
        pth = os.path.abspath(self.rawdir)
        LOG.info("creating raw directory for %s at %s", self.name, pth)

    # if output dir doesn't exist, create it
    if not os.path.exists(self.outdir):
        os.makedirs(self.outdir)
        pth = os.path.abspath(self.outdir)
        LOG.info("created output directory %s", pth)

    LOG.info("Creating Test graph %s", self.testname)
    # note: tools such as protege need skolemized blank nodes
    self.testgraph = RDFGraph(True, self.testname)

    if graph_type == 'rdf_graph':
        graph_id = ':MONARCH_' + str(self.name) + "_" + \
            datetime.now().isoformat(' ').split()[0]
        LOG.info("Creating graph %s", graph_id)
        self.graph = RDFGraph(are_bnodes_skized, graph_id)
    elif graph_type == 'streamed_graph':
        # need to expand on export formats
        # FIX: was `name + '.nt'` which breaks when name is None
        dest_file = open(pth + '/' + base_name + '.nt', 'w')  # where is the close?
        self.graph = StreamedGraph(are_bnodes_skized, dest_file)
        # leave test files as turtle (better human readability)
    else:
        LOG.error(
            "%s graph type not supported\n"
            "valid types: rdf_graph, streamed_graph", graph_type)

    # pull in global ontology mapping datastructures
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    self.curie_map = self.graph.curie_map
    # self.prefix_base = {v: k for k, v in self.curie_map.items()}

    # will be set to True if the intention is
    # to only process and write the test data
    self.test_only = False
    self.test_mode = False

    # this may eventually support Bagits
    self.dataset = Dataset(
        self.archive_url,
        self.ingest_title,
        self.ingest_url,
        None,          # description
        license_url,   # only _OUR_ lic
        data_rights,   # tries to point to others lics
        graph_type,
        file_handle)

    for graph in [self.graph, self.testgraph]:
        self.declareAsOntology(graph)
def __init__(
        self,
        identifier,
        data_release_version,
        ingest_name,
        ingest_title,
        ingest_url,
        ingest_logo=None,
        ingest_description=None,
        license_url=None,
        data_rights=None,
        graph_type='rdf_graph',     # rdf_graph, streamed_graph
        file_handle=None,
        distribution_type='ttl',
        dataset_curie_prefix='MonarchArchive'):
    """Create HCLS summary/version/distribution-level dataset metadata.

    :param identifier: dataset identifier (combined with the CURIE prefix)
    :param data_release_version: release tag; today's date (YYYYMMDD) when None
    :param ingest_name: source name used to build the download URL
    :param ingest_logo: optional logo file name on MonarchLogoRepo
    :param graph_type: None | 'rdf_graph' | 'streamed_graph'
    :param distribution_type: file suffix for the distribution level
    :raises ValueError: on an unrecognized graph_type
        (FIX: previously fell through, leaving self.graph unset)
    """
    if graph_type is None:
        self.graph = RDFGraph(
            None, ":".join([dataset_curie_prefix, identifier]))
    elif graph_type == 'streamed_graph':
        self.graph = StreamedGraph(
            True, ":".join([dataset_curie_prefix, identifier]),
            file_handle=file_handle)
    elif graph_type == 'rdf_graph':
        self.graph = RDFGraph(
            True, ':'.join([dataset_curie_prefix, identifier]))
    else:
        raise ValueError(
            "%s graph type not supported; "
            "valid types: rdf_graph, streamed_graph" % graph_type)

    if data_release_version is not None:
        self.data_release_version = data_release_version
    else:
        self.data_release_version = datetime.today().strftime("%Y%m%d")

    self.model = Model(self.graph)
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    self.curie_map = self.graph.curie_map
    self.identifier = ':'.join([dataset_curie_prefix, identifier])
    self.citation = set()
    self.ingest_name = ingest_name
    self.ingest_title = ingest_title
    if self.ingest_title is None:
        self.ingest_title = ":".join([dataset_curie_prefix, identifier])
    self.ingest_url = ingest_url
    # FIX: ingest_logo defaults to None; the unconditional concatenation
    # raised TypeError whenever no logo was supplied.
    if ingest_logo is not None:
        self.ingest_logo = self.curie_map.get('MonarchLogoRepo') + ingest_logo
    else:
        self.ingest_logo = None
    self.ingest_description = ingest_description
    self.date_issued = None
    self.license_url = license_url
    self.data_rights = data_rights
    self.distribution_type = distribution_type

    # set HCLS resource CURIEs
    self.summary_level_curie = ':'.join(
        [dataset_curie_prefix, '#' + identifier])
    self.version_level_curie = \
        dataset_curie_prefix + ':' + \
        self.data_release_version + \
        '/#' + identifier
    self.distribution_level_turtle_curie = \
        dataset_curie_prefix + ':' + \
        self.data_release_version + \
        '/rdf/' + \
        identifier + "." + self.distribution_type

    # The following might seem a little odd, but we need to set downloadURLs
    # this way in order for them to point to where they will end up in
    # archive.MI.org as of Sept 2019. URL is:
    # https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
    self.download_url = \
        self.curie_map.get("MonarchArchive") + self.data_release_version + \
        "/rdf/" + self.ingest_name + "." + self.distribution_type

    self._set_summary_level_triples()
    self._set_version_level_triples()
    self._set_distribution_level_triples()
def __init__(
        self,
        graph_type='rdf_graph',    # or streamed_graph
        are_bnodes_skized=False,   # typically True
        name=None,                 # identifier; make an IRI for nquads
        ingest_title=None,
        ingest_url=None,
        license_url=None,
        data_rights=None,
        file_handle=None
):
    """Set up a Source: directories, test graph, main graph and Dataset.

    :param graph_type: 'rdf_graph' (in-memory) or 'streamed_graph'
    :param are_bnodes_skized: skolemize blank nodes in the main graph
    :param name: ingest identifier; falls back to whoami() when None
    :param file_handle: forwarded to the Dataset for streamed output
    """
    self.graph_type = graph_type
    self.are_bnodes_skized = are_bnodes_skized
    self.ingest_url = ingest_url
    self.ingest_title = ingest_title
    self.localtt = self.load_local_translationtable(name)
    if name is not None:
        self.name = name
    else:
        self.name = self.whoami().lower()
    LOG.info("Processing Source \"%s\"", self.name)
    self.testOnly = False
    self.path = ""
    # to be used to store a subset of data for testing downstream.
    self.triple_count = 0
    self.outdir = 'out'
    self.testdir = 'tests'
    self.rawdir = '/'.join(('raw', self.name))
    # FIX: was `name + "_test"`, which raised TypeError when name was None
    # and the identifier came from whoami(); self.name is identical to
    # name whenever one was supplied.
    self.testname = self.name + "_test"
    self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))

    # still need to pull in file suffix
    self.archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'

    # if raw data dir doesn't exist, create it
    if not os.path.exists(self.rawdir):
        os.makedirs(self.rawdir)
        pth = os.path.abspath(self.rawdir)
        LOG.info("creating raw directory for %s at %s", self.name, pth)

    # if output dir doesn't exist, create it
    if not os.path.exists(self.outdir):
        os.makedirs(self.outdir)
        pth = os.path.abspath(self.outdir)
        LOG.info("created output directory %s", pth)

    LOG.info("Creating Test graph %s", self.testname)
    # note: tools such as protege need skolemized blank nodes
    self.testgraph = RDFGraph(True, self.testname)

    if graph_type == 'rdf_graph':
        graph_id = ':MONARCH_' + str(self.name) + "_" + \
            datetime.now().isoformat(' ').split()[0]
        LOG.info("Creating graph %s", graph_id)
        self.graph = RDFGraph(are_bnodes_skized, graph_id)
    elif graph_type == 'streamed_graph':
        # need to expand on export formats
        # FIX: this branch read self.outfile, which this class never
        # assigns, so it always raised AttributeError; build the .nt
        # path directly from the pieces that do exist.
        source_file = open(
            '/'.join((self.outdir, self.name + ".nt")), 'w')
        self.graph = StreamedGraph(are_bnodes_skized, source_file)
        # leave test files as turtle (better human readability)
    else:
        LOG.error(
            "{} graph type not supported\n"
            "valid types: rdf_graph, streamed_graph".format(graph_type))

    # pull in global ontology mapping datastructures
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    # self.curie_map = self.graph.curie_map

    # will be set to True if the intention is
    # to only process and write the test data
    self.testOnly = False
    self.testMode = False

    # this may eventually support Bagits
    self.dataset = Dataset(
        self.archive_url,
        self.ingest_title,
        self.ingest_url,
        None,  # description
        license_url,
        data_rights,
        graph_type,
        file_handle
    )

    for g in [self.graph, self.testgraph]:
        self.declareAsOntology(g)