Example #1
    def addDataset(self, identifier, doc=None):
        """Adds a dataset to the repository.

        Parameters
        ----------
        identifier : str
            Non-urlencoded DataOne identifier

        doc : XML Element
            An XML element containing a result from the Solr index which
            contains a number of fields relating to a dataset.

        """

        if self.model is not None:
            raise Exception("Model existed when addDataset was called. This means the last Model wasn't cleaned up after finishing.")

        self.createModel()

        # Get Solr fields if they weren't passed in
        if doc is None:
            doc = dataone.getSolrIndexFields(identifier)

        identifier = dataone.extractDocumentIdentifier(doc)
        identifier_esc = urllib.quote_plus(identifier)

        dataset_node = RDF.Uri(self.repository.ns['d1dataset'] + identifier_esc)

        self.add(dataset_node, 'rdf:type', 'geolink:Dataset')

        # Delete if dataset is already in graph
        if self.datasetExists(identifier):
            logging.info("Dataset with identifier %s already exists. Deleting then re-adding.", identifier)
            self.deleteDataset(identifier)

        scimeta = dataone.getScientificMetadata(identifier)
        records = processing.extractCreators(identifier, scimeta)

        vld = validator.Validator()

        # Add Dataset triples first, we'll use them when we add people
        # to match to existing people by the current dataset's 'obsoletes' field

        self.addDatasetTriples(dataset_node, doc)

        # Add people and organizations
        people = [p for p in records if 'type' in p and p['type'] == 'person']
        organizations = [o for o in records if 'type' in o and o['type'] == 'organization']

        # Always do organizations first, so people's organization URIs exist
        for organization in organizations:
            organization = vld.validate(organization)
            self.addOrganization(organization)

        for person in people:
            person = vld.validate(person)
            self.addPerson(person)

        # Commit or reject the model here
        if self.model is None:
            raise Exception("Model was None. It should have been an RDF.Model.")

        self.insertModel()
        self.model = None  # Remove the model since we're done
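
Example #3 below enqueues an add_dataset job with (graph, interface, identifier, doc), but the worker-side function itself does not appear in these examples. A minimal sketch of what such a job might look like, assuming it simply delegates to Interface.addDataset; the cleanup-on-failure step is an assumption, not taken from the source:

def add_dataset(graph, interface, identifier, doc=None):
    """Worker job sketch: add a single dataset to the graph.

    The signature is assumed from the enqueue call in Example #3; graph is
    accepted only to match the enqueued arguments.
    """
    try:
        interface.addDataset(identifier, doc)
    except Exception:
        # Assumption: discard any partially built model so the next job can
        # start cleanly, since addDataset raises if a model already exists.
        interface.model = None
        raise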
Example #2
    def addDatasetTriples(self, dataset_node, doc):
        if self.model is None:
            raise Exception("Model not found.")

        identifier = dataone.extractDocumentIdentifier(doc)
        identifier_esc = urllib.quote_plus(identifier)

        # type Dataset
        self.add(dataset_node, 'rdf:type', 'geolink:Dataset')

        # Title
        title_element = doc.find("./str[@name='title']")

        if title_element is not None:
            self.add(dataset_node, 'rdfs:label', RDF.Node(title_element.text))

        # Add geolink:Identifier
        self.addIdentifierTriples(dataset_node, identifier)

        # Abstract
        abstract_element = doc.find("./str[@name='abstract']")

        if abstract_element is not None:
            self.add(dataset_node, 'geolink:description', RDF.Node(abstract_element.text))

        # Spatial Coverage
        bound_north = doc.find("./float[@name='northBoundCoord']")
        bound_east = doc.find("./float[@name='eastBoundCoord']")
        bound_south = doc.find("./float[@name='southBoundCoord']")
        bound_west = doc.find("./float[@name='westBoundCoord']")

        if all(ele is not None for ele in [bound_north, bound_east, bound_south, bound_west]):
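            # WKT coordinate order is (x y), i.e. (longitude latitude). The
            # polygon below traces the bounding box NW -> NE -> SE -> SW and
            # closes the ring back at NW, as WKT polygon rings must be closed.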
            if bound_north.text == bound_south.text and bound_west.text == bound_east.text:
                wktliteral = "POINT (%s %s)" % (bound_north.text, bound_east.text)
            else:
                wktliteral = "POLYGON ((%s %s, %s %s, %s %s, %s, %s))" % (bound_west.text, bound_north.text, bound_east.text, bound_north.text, bound_east.text, bound_south.text, bound_west.text, bound_south.text)

            self.add(dataset_node, 'geolink:hasGeometryAsWktLiteral', RDF.Node(wktliteral))

        # Temporal Coverage
        begin_date = doc.find("./date[@name='beginDate']")
        end_date = doc.find("./date[@name='endDate']")

        if begin_date is not None:
            self.add(dataset_node, 'geolink:hasStartDate', RDF.Node(begin_date.text))

        if end_date is not None:
            self.add(dataset_node, 'geolink:hasEndDate', RDF.Node(end_date.text))

        # Obsoletes as PROV#wasRevisionOf
        obsoletes_node = doc.find("./str[@name='obsoletes']")

        if obsoletes_node is not None:
            other_document_esc = urllib.quote_plus(obsoletes_node.text)
            self.add(dataset_node, 'prov:wasRevisionOf', RDF.Uri(self.repository.ns['d1dataset'] + other_document_esc))

        # Landing page
        self.add(dataset_node, 'geolink:hasLandingPage', RDF.Uri("https://search.dataone.org/#view/" + identifier_esc))


        # Digital Objects
        # If this document has a resource map, get digital objects from there
        # Otherwise, use the cito:documents field in Solr

        resource_map_identifiers = doc.findall("./arr[@name='resourceMap']/str")

        if len(resource_map_identifiers) > 0:
            for resource_map_node in resource_map_identifiers:
                resource_map_identifier = resource_map_node.text

                digital_objects = dataone.getAggregatedIdentifiers(resource_map_identifier)

                for digital_object in digital_objects:
                    digital_object_identifier = urllib.unquote_plus(digital_object)
                    self.addDigitalObject(identifier, digital_object_identifier)
        else:
            # If no resourceMap or documents field, at least add the metadata
            # file as a digital object
            # dataUrl e.g. https://cn.dataone.org/cn/v1/resolve/doi%3A10.6073%2FAA%2Fknb-lter-cdr.70061.123

            data_url_node = doc.find("./str[@name='dataUrl']")

            if data_url_node is not None:
                data_url = data_url_node.text
                digital_object = dataone.extractIdentifierFromFullURL(data_url)
                digital_object = urllib.unquote_plus(digital_object)

                self.addDigitalObject(identifier, digital_object)
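
For reference, the find()/findall() lookups above assume a Solr result element shaped roughly like the sketch below. The field names are taken from the code in these examples; the sample values are invented placeholders.

import xml.etree.ElementTree as ET

# Invented sample values; only the element/attribute structure matters here.
SAMPLE_DOC = """
<doc>
  <str name="identifier">doi:10.0000/example</str>
  <str name="title">Example dataset title</str>
  <str name="abstract">Example abstract text.</str>
  <float name="northBoundCoord">45.0</float>
  <float name="eastBoundCoord">-120.0</float>
  <float name="southBoundCoord">44.0</float>
  <float name="westBoundCoord">-121.0</float>
  <date name="beginDate">2000-01-01T00:00:00Z</date>
  <date name="endDate">2001-01-01T00:00:00Z</date>
  <str name="dataUrl">https://cn.dataone.org/cn/v1/resolve/doi%3A10.0000%2Fexample</str>
  <arr name="resourceMap">
    <str>resource_map_doi:10.0000/example</str>
  </arr>
</doc>
"""

doc = ET.fromstring(SAMPLE_DOC)
print(doc.find("./str[@name='title']").text)
print([node.text for node in doc.findall("./arr[@name='resourceMap']/str")])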
Example #3
def update_graph():
    """Update the graph with datasets that have been modified since the last
    time the job was run. This job updates in chunks of UPDATE_CHUNK_SIZE. The
    reason for this is to avoid long-running jobs.
    """
    JOB_NAME = "JOB_UPDATE"
    logging.info("[%s] Job started.", JOB_NAME)
    """Determine the time period over which to get datasets.

    If the Redis database is fresh and does not have a value set for
    REDIS_LAST_RUN_KEY, we initialize the key with the datetime string that is
    earlier than the first uploaded dataset.
    """

    from_string = getLastRun()

    if from_string is None:
        from_string = setLastRun()
    """Adjust from_string to be one millisecond later than what was stored
    This is done because Solr's range query criteria are range-inclusive and
    not adding a millisecond to this value would make the result set include
    the last document from the previous update job which would double-add
    the dataset.
    """

    try:
        from_string_dt = parse(from_string) + datetime.timedelta(
            milliseconds=1)
    except Exception:
        raise Exception(
            "Failed to parse and add timedelta to from_string of %s." %
            from_string)

    from_string = from_string_dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    to_string = getNowString()  # Always just get all datasets since from_string
    logging.info("[%s] Running update job: from_string=%s to_string=%s",
                 JOB_NAME, from_string, to_string)

    # Return now if the queue is too large
    if len(queues['dataset']) > QUEUE_MAX_SIZE:
        logging.info(
            "[%s] Ending update job early because dataset queue is too large (%d).",
            JOB_NAME, len(queues['dataset']))
        return

    # Create the Solr query to grab the datasets
    query_string = dataone.createSinceQueryURL(from_string, to_string, None, 0)
    num_results = dataone.getNumResults(query_string)
    logging.info("[%s] num_results=%d", JOB_NAME, num_results)

    # Set up Graph/Interface once per update job
    # This ensures that all add_dataset jobs use the same instance of each,
    # which reduces unnecessary overhead

    graph = Graph(host=VIRTUOSO_HOST,
                  port=VIRTUOSO_PORT,
                  name=VIRTUOSO_GRAPH,
                  ns=NAMESPACES)
    interface = Interface(graph)

    # Get first page of size UPDATE_CHUNK_SIZE
    page_xml = dataone.getSincePage(from_string, to_string, 1,
                                    UPDATE_CHUNK_SIZE)
    docs = page_xml.findall(".//doc")

    if not docs:
        logging.info("[%s] No datasets added since last update.", JOB_NAME)
        return

    for doc in docs:
        identifier = dataone.extractDocumentIdentifier(doc)
        logging.info("[%s] Queueing job add_dataset with identifier='%s'",
                     JOB_NAME, identifier)
        queues['dataset'].enqueue(add_dataset, graph, interface, identifier,
                                  doc)

    logging.info("[%s] Done queueing datasets.", JOB_NAME)

    # Get sysmeta modified string for the last document in the sorted list
    last_modified = docs[-1]
    last_modified_ele = last_modified.find("./date[@name='dateModified']")

    if last_modified_ele is None:
        raise Exception("Solr result did not contain a dateModified element.")

    last_modified_value = last_modified_ele.text

    if last_modified_value is None or len(last_modified_value) <= 0:
        raise Exception(
            "Last document's dateModified value was None or length zero.")

    logging.info('[%s] Setting lastrun key to %s.', JOB_NAME,
                 last_modified_value)
    setLastRun(last_modified_value)

    # Update the void file if we updated the graph
    if docs is not None and len(docs) > 0:
        logging.info(
            "[%s] Updating VoID file located at VOID_FILEPATH='%s' with new modified value of to_string='%s'.",
            JOB_NAME, VOID_FILEPATH, last_modified_value)
        updateVoIDFile(last_modified_value)
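
The helpers getLastRun, setLastRun, and getNowString used above are not shown in these examples. A minimal sketch of what they might look like, assuming a redis-py client and the last-run key described in the docstring; the key name, connection details, and the fresh-database default are placeholders rather than values from the source.

import datetime

import redis

REDIS_LAST_RUN_KEY = "lastrun"  # placeholder key name
redis_conn = redis.StrictRedis(host="localhost", port=6379)  # assumed connection


def getNowString():
    """Return the current UTC time in the Solr datetime format used above."""
    return datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ")


def getLastRun():
    """Return the stored last-run datetime string, or None if the key is unset."""
    value = redis_conn.get(REDIS_LAST_RUN_KEY)
    return value if value is None else value.decode("utf-8")


def setLastRun(to_string=None):
    """Store and return the last-run datetime string.

    On a fresh database the docstring above calls for a value earlier than
    the first uploaded dataset; the default below is only a placeholder.
    """
    if to_string is None:
        to_string = "2000-01-01T00:00:00.000Z"
    redis_conn.set(REDIS_LAST_RUN_KEY, to_string)
    return to_string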