def addDataset(self, identifier, doc=None):
    """Adds a dataset to the repository.

    Parameters:
    -----------

    identifier : str
        Non-urlencoded DataOne identifier

    doc : XML Element
        An XML element containing a result from the Solr index which
        contains a number of fields relating to a dataset.
    """

    if self.model is not None:
        raise Exception("Model existed when addDataset was called. This means the last Model wasn't cleaned up after finishing.")

    self.createModel()

    # Get Solr fields if they weren't passed in
    if doc is None:
        doc = dataone.getSolrIndexFields(identifier)

    identifier = dataone.extractDocumentIdentifier(doc)
    identifier_esc = urllib.quote_plus(identifier)

    dataset_node = RDF.Uri(self.repository.ns['d1dataset'] + identifier_esc)

    self.add(dataset_node, 'rdf:type', 'geolink:Dataset')

    # Delete if dataset is already in graph
    if self.datasetExists(identifier):
        logging.info("Dataset with identifier %s already exists. Deleting then re-adding.", identifier)
        self.deleteDataset(identifier)

    scimeta = dataone.getScientificMetadata(identifier)
    records = processing.extractCreators(identifier, scimeta)

    vld = validator.Validator()

    # Add Dataset triples first; we'll use them when we add people
    # to match to existing people by the current dataset's 'obsoletes' field
    self.addDatasetTriples(dataset_node, doc)

    # Add people and organizations
    people = [p for p in records if 'type' in p and p['type'] == 'person']
    organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

    # Always do organizations first, so peoples' organization URIs exist
    for organization in organizations:
        organization = vld.validate(organization)
        self.addOrganization(organization)

    for person in people:
        person = vld.validate(person)
        self.addPerson(person)

    # Commit or reject the model here
    if self.model is None:
        raise Exception("Model was None. It should have been an RDF.Model.")

    self.insertModel()
    self.model = None  # Remove the model since we're done
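
# Illustrative usage only (not part of the module; names reused from this file):
# addDataset is called on an Interface instance with a bare, non-urlencoded
# DataOne identifier, and the Solr document is fetched automatically when doc
# is omitted, e.g.
#
#     interface = Interface(graph)
#     interface.addDataset("doi:10.6073/AA/knb-lter-cdr.70061.123")
#
# createModel()/insertModel() bracket all of the triples for one dataset so
# they are committed to the graph in a single insert.
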
def addDatasetTriples(self, dataset_node, doc):
    if self.model is None:
        raise Exception("Model not found.")

    identifier = dataone.extractDocumentIdentifier(doc)
    identifier_esc = urllib.quote_plus(identifier)

    # type Dataset
    self.add(dataset_node, 'rdf:type', 'geolink:Dataset')

    # Title
    title_element = doc.find("./str[@name='title']")
    if title_element is not None:
        self.add(dataset_node, 'rdfs:label', RDF.Node(title_element.text))

    # Add geolink:Identifier
    self.addIdentifierTriples(dataset_node, identifier)

    # Abstract
    abstract_element = doc.find("./str[@name='abstract']")
    if abstract_element is not None:
        self.add(dataset_node, 'geolink:description', RDF.Node(abstract_element.text))

    # Spatial Coverage
    bound_north = doc.find("./float[@name='northBoundCoord']")
    bound_east = doc.find("./float[@name='eastBoundCoord']")
    bound_south = doc.find("./float[@name='southBoundCoord']")
    bound_west = doc.find("./float[@name='westBoundCoord']")

    if all(ele is not None for ele in [bound_north, bound_east, bound_south, bound_west]):
        if bound_north.text == bound_south.text and bound_west.text == bound_east.text:
            wktliteral = "POINT (%s %s)" % (bound_north.text, bound_east.text)
        else:
            wktliteral = "POLYGON ((%s %s, %s %s, %s %s, %s %s))" % (bound_west.text, bound_north.text, bound_east.text, bound_north.text, bound_east.text, bound_south.text, bound_west.text, bound_south.text)

        self.add(dataset_node, 'geolink:hasGeometryAsWktLiteral', RDF.Node(wktliteral))

    # Temporal Coverage
    begin_date = doc.find("./date[@name='beginDate']")
    end_date = doc.find("./date[@name='endDate']")

    if begin_date is not None:
        self.add(dataset_node, 'geolink:hasStartDate', RDF.Node(begin_date.text))

    if end_date is not None:
        self.add(dataset_node, 'geolink:hasEndDate', RDF.Node(end_date.text))

    # Obsoletes as PROV#wasRevisionOf
    obsoletes_node = doc.find("./str[@name='obsoletes']")
    if obsoletes_node is not None:
        other_document_esc = urllib.quote_plus(obsoletes_node.text)
        self.add(dataset_node, 'prov:wasRevisionOf', RDF.Uri(self.repository.ns['d1dataset'] + other_document_esc))

    # Landing page
    self.add(dataset_node, 'geolink:hasLandingPage', RDF.Uri("https://search.dataone.org/#view/" + identifier_esc))

    # Digital Objects
    # If this document has a resource map, get digital objects from there.
    # Otherwise, use the cito:documents field in Solr.
    resource_map_identifiers = doc.findall("./arr[@name='resourceMap']/str")

    if len(resource_map_identifiers) > 0:
        for resource_map_node in resource_map_identifiers:
            resource_map_identifier = resource_map_node.text
            digital_objects = dataone.getAggregatedIdentifiers(resource_map_identifier)

            for digital_object in digital_objects:
                digital_object_identifier = urllib.unquote_plus(digital_object)
                self.addDigitalObject(identifier, digital_object_identifier)
    else:
        # If no resourceMap or documents field, at least add the metadata
        # file as a digital object
        # dataUrl e.g. https://cn.dataone.org/cn/v1/resolve/doi%3A10.6073%2FAA%2Fknb-lter-cdr.70061.123
        data_url_node = doc.find("./str[@name='dataUrl']")

        if data_url_node is not None:
            data_url = data_url_node.text
            digital_object = dataone.extractIdentifierFromFullURL(data_url)
            digital_object = urllib.unquote_plus(digital_object)
            self.addDigitalObject(identifier, digital_object)
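
# Illustrative example of the spatial coverage mapping above (coordinate
# values are hypothetical): when all four bounds collapse to one point, e.g.
# northBoundCoord = southBoundCoord = 45.0 and
# eastBoundCoord = westBoundCoord = -93.0, the literal becomes
#
#     "POINT (45.0 -93.0)"
#
# while distinct bounds such as north=46.0, south=45.0, east=-92.0, west=-93.0
# produce a rectangle traced from the NW corner:
#
#     "POLYGON ((-93.0 46.0, -92.0 46.0, -92.0 45.0, -93.0 45.0))"
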
def update_graph():
    """Update the graph with datasets that have been modified since the last
    time the job was run.

    This job updates in chunks of UPDATE_CHUNK_SIZE. The reason for this is
    to avoid long-running jobs.
    """

    JOB_NAME = "JOB_UPDATE"
    logging.info("[%s] Job started.", JOB_NAME)

    """Determine the time period over which to get datasets.

    If the Redis database is fresh and does not have a value set for
    REDIS_LAST_RUN_KEY, we initialize the key with a datetime string that is
    earlier than the first uploaded dataset.
    """
    from_string = getLastRun()

    if from_string is None:
        from_string = setLastRun()

    """Adjust from_string to be one millisecond later than what was stored.

    This is done because Solr's range query criteria are range-inclusive, and
    not adding a millisecond to this value would make the result set include
    the last document from the previous update job, which would double-add
    the dataset.
    """
    try:
        from_string_dt = parse(from_string) + datetime.timedelta(milliseconds=1)
    except Exception:
        raise Exception("Failed to parse and add timedelta to from_string of %s." % from_string)

    from_string = from_string_dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    to_string = getNowString()  # Always just get all datasets since from_string

    logging.info("[%s] Running update job: from_string=%s to_string=%s", JOB_NAME, from_string, to_string)

    # Return now if the queue is too large
    if len(queues['dataset']) > QUEUE_MAX_SIZE:
        logging.info("[%s] Ending update job early because dataset queue is too large (%d).", JOB_NAME, len(queues['dataset']))
        return

    # Create the Solr query to grab the datasets
    query_string = dataone.createSinceQueryURL(from_string, to_string, None, 0)
    num_results = dataone.getNumResults(query_string)
    logging.info("[%s] num_results=%d", JOB_NAME, num_results)

    # Set up Graph/Interface once per update job.
    # This ensures that all add_dataset jobs use the same instance of each,
    # which reduces unnecessary overhead.
    graph = Graph(host=VIRTUOSO_HOST, port=VIRTUOSO_PORT, name=VIRTUOSO_GRAPH, ns=NAMESPACES)
    interface = Interface(graph)

    # Get the first page of size UPDATE_CHUNK_SIZE
    page_xml = dataone.getSincePage(from_string, to_string, 1, UPDATE_CHUNK_SIZE)
    docs = page_xml.findall(".//doc")

    if docs is None or len(docs) <= 0:
        logging.info("[%s] No datasets added since last update.", JOB_NAME)
        return

    for doc in docs:
        identifier = dataone.extractDocumentIdentifier(doc)
        logging.info("[%s] Queueing job add_dataset with identifier='%s'", JOB_NAME, identifier)
        queues['dataset'].enqueue(add_dataset, graph, interface, identifier, doc)

    logging.info("[%s] Done queueing datasets.", JOB_NAME)

    # Get the sysmeta modified string for the last document in the sorted list
    last_modified = docs[len(docs) - 1]
    last_modified_ele = last_modified.find("./date[@name='dateModified']")

    if last_modified_ele is None:
        raise Exception("Solr result did not contain a dateModified element.")

    last_modified_value = last_modified_ele.text

    if last_modified_value is None or len(last_modified_value) <= 0:
        raise Exception("Last document's dateModified value was None or length zero.")

    logging.info('[%s] Setting lastrun key to %s.', JOB_NAME, last_modified_value)
    setLastRun(last_modified_value)

    # Update the VoID file if we updated the graph
    if docs is not None and len(docs) > 0:
        logging.info("[%s] Updating VoID file located at VOID_FILEPATH='%s' with new modified value of to_string='%s'.", JOB_NAME, VOID_FILEPATH, last_modified_value)
        updateVoIDFile(last_modified_value)
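
# Illustrative example of the from_string adjustment in update_graph() (the
# timestamp is hypothetical): a stored lastrun value of
# "2015-01-01T00:00:00.000Z" is parsed, bumped by one millisecond, and
# reformatted with "%Y-%m-%dT%H:%M:%S.%fZ" as "2015-01-01T00:00:00.001000Z"
# before it is used as the inclusive lower bound of the Solr range query, so
# the document already processed at the old timestamp is not fetched and
# queued a second time.
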