def __do_three_step_update(self, row, column_name, uri, path, data_update): """ Given the current state in the update, and a path length three column_def, add, change or delete intermediate and end objects as necessary to perform the requested update :param row: row number of the update. For printing :param column_name: column_name of the update. For printing :param uri: uri of the entity at the head of the path :param path: the column definition :param data_update: the data provided for the update :return: Changes in the update_graph """ from rdflib import RDF, RDFS, Literal, URIRef from vivopump import new_uri, get_step_triples step_def = path[0] step_uris = [ o for s, p, o in get_step_triples(self.update_graph, uri, column_name, step_def, self.query_parms, self.verbose) ] if len(step_uris) == 0: # VIVO has no values for first intermediate, so add new intermediate and do a two step update on it step_uri = URIRef(new_uri(self.query_parms)) self.update_graph.add( (uri, step_def['predicate']['ref'], step_uri)) self.update_graph.add( (step_uri, RDF.type, step_def['object']['type'])) if 'label' in step_def['object']: self.update_graph.add( (step_uri, RDFS.label, Literal(step_def['object']['label'], datatype=step_def['object'].get('datatype', None), lang=step_def['object'].get('lang', None)))) self.__do_two_step_update(row, column_name, step_uri, path[1:], data_update) elif step_def['predicate']['single'] == True: # VIVO has 1 or more values for first intermediate, so we need to see if the predicate # is expected to be single step_uri = step_uris[0] if len(step_uris) > 1: print "WARNING: Single predicate", path[0]['object']['name'], "has", len(step_uris), "values: ", \ step_uris, "using", step_uri self.__do_two_step_update(row, column_name, step_uri, path[1:], data_update) return None
def __do_three_step_update(self, row, column_name, uri, path, data_update): """ Given the current state in the update, and a path length three column_def, add, change or delete intermediate and end objects as necessary to perform the requested update :param row: row number of the update. For printing :param column_name: column_name of the update. For printing :param uri: uri of the entity at the head of the path :param path: the column definition :param data_update: the data provided for the update :return: Changes in the update_graph """ from rdflib import RDF, RDFS, Literal, URIRef from vivopump import new_uri, get_step_triples step_def = path[0] step_uris = [o for s, p, o in get_step_triples(self.update_graph, uri, column_name, step_def, self.query_parms, self.verbose)] if len(step_uris) == 0: # VIVO has no values for first intermediate, so add new intermediate and do a two step update on it step_uri = URIRef(new_uri(self.query_parms)) self.update_graph.add((uri, step_def['predicate']['ref'], step_uri)) self.update_graph.add((step_uri, RDF.type, step_def['object']['type'])) if 'label' in step_def['object']: self.update_graph.add((step_uri, RDFS.label, Literal(step_def['object']['label'], datatype=step_def['object'].get('datatype', None), lang=step_def['object'].get('lang', None)))) self.__do_two_step_update(row, column_name, step_uri, path[1:], data_update) elif step_def['predicate']['single'] == True: # VIVO has 1 or more values for first intermediate, so we need to see if the predicate # is expected to be single step_uri = step_uris[0] if len(step_uris) > 1: print "WARNING: Single predicate", path[0]['object']['name'], "has", len(step_uris), "values: ", \ step_uris, "using", step_uri self.__do_two_step_update(row, column_name, step_uri, path[1:], data_update) return None
def __do_two_step_update(self, row, column_name, uri, column_def, data_update): """ In a two step update, identify intermediate entity that might need to be created, and end path objects that might not yet exist or might need to be created. Cases are: Predicate Single Predicate Multiple VIVO has 0 values Add, do_the Add intermediate, do_the VIVO has 1 value do_the Set compare through intermediate VIVO has >1 value WARNING, do_the Set compare through intermediate :param: row: current row in spreadsheet :param: column_name: name of current column in spreadsheet :param: uri: uri in VIVO of the current entity :param: column_def: the column def for the current column :param: data_update: the column_value :return: alterations in update graph """ from rdflib import RDF, RDFS, Literal, URIRef from vivopump import new_uri, get_step_triples, prepare_column_values step_def = column_def[0] # Determine the add set (which intermediates point to column values that are not yet in VIVO # For each element in the add set, construct the intermediate and call __do_the_update to # construct the leaf # Determine the sub set (which intermediates point to column values that are in VIVO and are # not in the column values # For each element in the sub set, remove the leaf and the intermediate # # This framework should also handle single valued predicates, and cases where there are no step_uris. # That is, it should handle everything. All the code below should be replaced. step_uris = [ o for s, p, o in get_step_triples(self.update_graph, uri, column_name, column_def[0], self.query_parms, self.verbose) ] vivo_objs = {} for step_uri in step_uris: for s, p, o in get_step_triples(self.update_graph, step_uri, column_name, column_def[1], self.query_parms, self.verbose): vivo_objs[unicode(o)] = [o, step_uri] # Nasty hack below. The predicate property "single" appears to have two meanings. One has to do # with the semantic graph and one has to do with the cardinality of the data column. 
These are not # the same. When the first step is multiple and the second single, the "second single" is not the # cardinality of the data column. The cardinality of the data column is the multiple if any of the # predicates in the path are multiple. Here we set the cardinality of the leaf to be used by # prepare_column_values and then set it back. Nasty. Create a property for the leaf cardinality. predicate2_cardinality = column_def[1]['predicate']['single'] if column_def[0]['predicate']['single'] == False: column_def[1]['predicate']['single'] = False column_values = prepare_column_values(data_update[column_name], self.intra, column_def[1], self.enum, row, column_name) column_def[1]['predicate']['single'] = predicate2_cardinality vivo_values = [vivo_objs[x][0] for x in vivo_objs.keys()] if unicode(column_values[0]).lower() == 'none': add_values = set() sub_values = set(vivo_values) else: add_values = set(column_values) - set(vivo_values) sub_values = set(vivo_values) - set(column_values) if self.verbose: print 'Two step SET COMPARE', '\n\tRow', row, '\n\tColumn', column_name, '\n\tSource', column_values, \ '\n\tVIVO', vivo_values, '\n\tAdd:', add_values, '\n\tSub:', sub_values, '\n\tStep_uris', step_uris # Process the adds if len(add_values) > 0: if column_def[0]['predicate']['single'] == False: # Multiple intermediaries, single valued-leaves for leaf_value in add_values: step_uri = URIRef(new_uri(self.query_parms)) self.update_graph.add( (uri, step_def['predicate']['ref'], step_uri)) if 'type' in step_def['object']: self.update_graph.add( (step_uri, RDF.type, step_def['object']['type'])) if 'label' in step_def['object']: self.update_graph.add( (step_uri, RDFS.label, Literal(step_def['object']['label'], datatype=step_def['object'].get( 'datatype', None), lang=step_def['object'].get('lang', None)))) self.__do_the_update(row, column_name, step_uri, column_def[1], [leaf_value], {}) else: # Multiple values on the single leaf if len(step_uris) == 0: step_uri = 
URIRef(new_uri(self.query_parms)) self.update_graph.add( (uri, step_def['predicate']['ref'], step_uri)) if 'type' in step_def['object']: self.update_graph.add( (step_uri, RDF.type, step_def['object']['type'])) if 'label' in step_def['object']: self.update_graph.add( (step_uri, RDFS.label, Literal(step_def['object']['label'], datatype=step_def['object'].get( 'datatype', None), lang=step_def['object'].get('lang', None)))) else: step_uri = step_uris[0] self.__do_the_update(row, column_name, step_uri, column_def[1], column_values, {}) # Process the subs if len(sub_values) > 0: if column_def[0]['predicate']['single'] == False: # Handle multiple intermediaries, single leaves, by removing each intermediary and all its # assertions for leaf_value in sub_values: step_uri = vivo_objs[unicode(leaf_value)][1] self.update_graph.remove( (uri, step_def['predicate']['ref'], step_uri)) self.update_graph.remove((step_uri, None, None)) else: # Handle single intermediary, possibly multiple leaves, by removing each leaf from the intermediary # Then check to see if the intermediary has any remaining leaf assertions and remove if empty step_uri = vivo_objs[unicode(next(iter(sub_values)))][1] for leaf_value in sub_values: self.update_graph.remove((step_uri, None, leaf_value)) g = self.update_graph.triples( (step_uri, column_def[1]['predicate']['ref'], None)) if g == set(): self.update_graph.remove( (uri, step_def['predicate']['ref'], step_uri)) self.update_graph.remove((step_uri, None, None)) # if len(step_uris) == 0: # # # VIVO has no values for intermediate, so add a new intermediate and __do_the_update on the leaf # # step_uri = URIRef(new_uri(self.query_parms)) # self.update_graph.add((uri, step_def['predicate']['ref'], step_uri)) # self.update_graph.add((step_uri, RDF.type, step_def['object']['type'])) # if 'label' in step_def['object']: # self.update_graph.add((step_uri, RDFS.label, Literal(step_def['object']['label'], # datatype=step_def['object'].get('datatype', None), # 
lang=step_def['object'].get('lang', None)))) # uri = step_uri # step_def = column_def[1] # vivo_objs = {unicode(o): o for s, p, o in # get_step_triples(self.update_graph, uri, column_name, step_def, self.query_parms, # self.verbose)} # column_values = prepare_column_values(data_update[column_name], self.intra, step_def, self.enum, row, # column_name) # self.__do_the_update(row, column_name, uri, step_def, column_values, vivo_objs) # # elif step_def['predicate']['single'] == True: # # # VIVO has 1 or more values, so we need to see if the predicate is expected to be single # # step_uri = step_uris[0] # if len(step_uris) > 1: # print "WARNING: Single predicate", column_name, "has", len(step_uris), "values: ", \ # step_uris, "using", step_uri # uri = step_uri # step_def = column_def[1] # vivo_objs = {unicode(o): o for s, p, o in # get_step_triples(self.update_graph, uri, column_name, step_def, self.query_parms, # self.verbose)} # column_values = prepare_column_values(data_update[column_name], self.intra, step_def, self.enum, row, # column_name) # self.__do_the_update(row, column_name, uri, step_def, column_values, vivo_objs) # # else: # print "WARNING: Updating multi-valued multi-step predicates such as ", column_name, " not yet implemented" return None
def __do_update(self): """ For each row, process each column. Compare to data in VIVO. Generate add and sub rdf as necessary to process requested add, change, delete """ from rdflib import URIRef, RDF from vivopump import new_uri, prepare_column_values, get_step_triples, PathLengthException merges = {} for row, data_update in self.update_data.items(): # Create a URI if empty if data_update['uri'].strip() == '': # If the source uri is empty, create one. Remaining processing is unchanged. # Since the new uri does not have triples for the columns in the spreadsheet, each will be added uri_string = new_uri(self.query_parms) if self.verbose: print "Adding an entity for row", row, ". Will be added at", uri_string uri = URIRef(uri_string) self.update_graph.add( (uri, RDF.type, self.update_def['entity_def']['type'])) # Create a URI entity if not found else: uri = URIRef(data_update['uri'].strip()) if (uri, None, None) not in self.update_graph: if self.verbose: print "Adding an entity for row", row, ". Will be added at", str( uri) self.update_graph.add( (uri, RDF.type, self.update_def['entity_def']['type'])) entity_uri = uri action = data_update.get('action', '').lower() # Process remove action if any if action == 'remove': self.__do_remove(row, uri) continue # Collect merge info if any if action != '': k = action.find('1') if k > -1: key = action[0:k] if key not in merges: merges[key] = {} merges[key]['primary'] = None merges[key]['secondary'] = [uri] else: merges[key]['secondary'].append(uri) else: if action not in merges: merges[action] = {} merges[action]['primary'] = uri if 'secondary' not in merges[action]: merges[action]['secondary'] = [] # For this row, process all the column_defs and then process closure defs if any. Closures allow # columns to be "reused" providing additional paths from the row entity to entities in the paths. 
for column_name, column_def in self.update_def['column_defs'].items() + \ self.update_def.get('closure_defs', {}).items(): if column_name not in data_update: continue # extra column names are allowed in the spreadsheet for annotation uri = entity_uri if data_update[column_name] == '': continue if len(column_def) > 3: raise PathLengthException( "ERROR: Path lengths > 3 not supported. Path length for " + column_name + " is " + str(len(column_def))) elif len(column_def) == 3: self.__do_three_step_update(row, column_name, uri, column_def, data_update) elif len(column_def) == 2: self.__do_two_step_update(row, column_name, uri, column_def, data_update) elif len(column_def) == 1: step_def = column_def[0] vivo_objs = { unicode(o): o for s, p, o in get_step_triples( self.update_graph, uri, column_name, step_def, self.query_parms, self.verbose) } column_values = prepare_column_values( data_update[column_name], self.intra, step_def, self.enum, row, column_name) if self.verbose: print row, column_name, column_values, uri, vivo_objs self.__do_the_update(row, column_name, uri, step_def, column_values, vivo_objs) if any(merges): self.__do_merges(merges) # Return the add and sub graphs representing the changes that need to be made to the original add = self.update_graph - self.original_graph # Triples in update that are not in original if self.verbose: print "Triples to add" print add.serialize(format='nt') sub = self.original_graph - self.update_graph # Triples in original that are not in update if self.verbose: print "Triples to sub" print sub.serialize(format='nt') return [add, sub]
def __do_two_step_update(self, row, column_name, uri, column_def, data_update):
    """
    Apply a column update whose path has two steps: entity -> intermediate -> leaf.

    The leaf values currently reachable through the intermediates are set-compared with the values
    supplied in the column.  Values only in the column are added (creating intermediates as needed);
    values only in VIVO are removed (together with intermediates that are no longer needed).

    Handling depends on whether the first predicate is marked multiple ('single' == False):

        First predicate multiple:  one intermediate per leaf value.  Each added value gets a new
                                   intermediate; each removed value takes its intermediate with it.
        Otherwise:                 one intermediate carries all leaf values.  It is created if VIVO
                                   has none, and removed once it has no leaf assertions left.

    :param row: current row in spreadsheet
    :param column_name: name of current column in spreadsheet
    :param uri: uri in VIVO of the current entity
    :param column_def: the column def for the current column (two step definitions)
    :param data_update: the row data, keyed by column name
    :return: None.  Alterations are made in place in self.update_graph
    """
    from rdflib import RDF, RDFS, Literal, URIRef
    from vivopump import new_uri, get_step_triples, prepare_column_values

    step_def = column_def[0]
    leaf_def = column_def[1]
    first_is_multiple = step_def['predicate']['single'] == False  # exact comparison kept from original

    def add_intermediate():
        # Create a new intermediate entity, link it to the head entity, add its optional type and label
        new_step_uri = URIRef(new_uri(self.query_parms))
        step_object = step_def['object']
        self.update_graph.add((uri, step_def['predicate']['ref'], new_step_uri))
        if 'type' in step_object:
            self.update_graph.add((new_step_uri, RDF.type, step_object['type']))
        if 'label' in step_object:
            self.update_graph.add((new_step_uri, RDFS.label,
                                   Literal(step_object['label'],
                                           datatype=step_object.get('datatype', None),
                                           lang=step_object.get('lang', None))))
        return new_step_uri

    # Collect the intermediates, and for each leaf value remember the intermediate it was reached through

    step_uris = [o for s, p, o in get_step_triples(self.update_graph, uri, step_def, self.query_parms)]
    vivo_objs = {}
    for step_uri in step_uris:
        for s, p, o in get_step_triples(self.update_graph, step_uri, leaf_def, self.query_parms):
            vivo_objs[unicode(o)] = [o, step_uri]

    # HACK: The predicate property "single" has two meanings.  One has to do with the semantic graph and
    # one has to do with the cardinality of the data column.  The cardinality of the data column is
    # multiple if any of the predicates in the path are multiple.  The leaf's "single" is overridden
    # while prepare_column_values runs and is always restored, even if prepare_column_values raises,
    # so that the shared column definition is never left altered.

    leaf_predicate = leaf_def['predicate']
    saved_single = leaf_predicate['single']
    if first_is_multiple:
        leaf_predicate['single'] = False
    try:
        column_values = prepare_column_values(data_update[column_name], self.intra, leaf_def, self.enum,
                                              row, column_name)
    finally:
        leaf_predicate['single'] = saved_single

    vivo_values = [found[0] for found in vivo_objs.values()]

    # A column value of 'None' (any case) requests removal of everything currently in VIVO
    # NOTE(review): assumes prepare_column_values returns at least one value -- confirm

    if unicode(column_values[0]).lower() == 'none':
        add_values = set()
        sub_values = set(vivo_values)
    else:
        add_values = set(column_values) - set(vivo_values)
        sub_values = set(vivo_values) - set(column_values)

    # The two literals are joined BEFORE .format is applied.  With "a" + "b".format(...) only the
    # second literal is formatted, which mislabelled every value in this message.
    logger.debug((u"Two step SET COMPARE\n\tRow {}\n\tColumn {}\n\tSource values {}\n\tVIVO values {}"
                  u"\n\tAdd values {}\n\tSub values {}\n\tStep_uris {}").
                 format(row, column_name, column_values, vivo_values, add_values, sub_values, step_uris))

    # Process the adds

    if add_values:
        if first_is_multiple:

            # Multiple intermediaries, single valued-leaves

            for leaf_value in add_values:
                step_uri = add_intermediate()
                self.__do_the_update(row, column_name, step_uri, leaf_def, [leaf_value], {})
        else:

            # Multiple values on the single leaf.  Reuse the first intermediate if VIVO has one

            if step_uris:
                step_uri = step_uris[0]
            else:
                step_uri = add_intermediate()
            self.__do_the_update(row, column_name, step_uri, leaf_def, column_values, {})

    # Process the subs

    if sub_values:
        if first_is_multiple:

            # Handle multiple intermediaries, single leaves, by removing each intermediary and all its
            # assertions

            for leaf_value in sub_values:
                step_uri = vivo_objs[unicode(leaf_value)][1]
                self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                self.update_graph.remove((step_uri, None, None))
        else:

            # Handle single intermediary, possibly multiple leaves, by removing each leaf from the
            # intermediary.  Then remove the intermediary if it has no remaining leaf assertions

            step_uri = vivo_objs[unicode(next(iter(sub_values)))][1]
            for leaf_value in sub_values:
                self.update_graph.remove((step_uri, None, leaf_value))

            # triples() yields matches lazily, so emptiness is tested by asking for the first match.
            # Comparing its result to set() is never true and left emptied intermediates behind.
            remaining = self.update_graph.triples((step_uri, leaf_def['predicate']['ref'], None))
            if next(iter(remaining), None) is None:
                self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                self.update_graph.remove((step_uri, None, None))

    return None
def __do_update(self):
    """
    For each row, process each column.  Compare to data in VIVO.  Generate add and sub rdf as necessary
    to process requested add, change, delete.

    Each row identifies (or creates) an entity.  Rows with no uri and no content are skipped.  A row
    may request removal of the entity or take part in a merge; otherwise each column of the row that
    has a column definition or closure definition is applied through the one, two or three step update
    that matches the length of its path.

    :return: [add, sub] -- the triples in the update graph that are not in the original graph, and the
        triples in the original graph that are not in the update graph
    :raises PathLengthException: if a column definition has more than three steps
    """
    import logging

    from rdflib import URIRef, RDF
    from vivopump import new_uri, prepare_column_values, get_step_triples, PathLengthException

    # Message templates are unicode throughout: in Python 2, formatting a non-ASCII unicode value into
    # a byte-string template raises UnicodeEncodeError

    merges = {}
    for row, data_update in self.update_data.items():

        logger.debug(u"data_update[uri] = {}".format(data_update['uri']))
        given_uri = data_update['uri'].strip()

        if given_uri == '':

            # skip blank lines in the input file

            if not any(len(item) != 0 for item in data_update.values()):
                continue

            # If the source uri is empty, create one.  Remaining processing is unchanged.
            # Since the new uri does not have triples for the columns in the spreadsheet, each will be added

            uri_string = new_uri(self.query_parms)
            logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, uri_string))
            uri = URIRef(uri_string)
            self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type']))

        else:

            # Create a URI entity if not found

            uri = URIRef(given_uri)
            if (uri, None, None) not in self.update_graph:
                # uri is formatted directly: str(uri) would fail on a non-ASCII uri in Python 2
                logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, uri))
                self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type']))

        entity_uri = uri
        action = data_update.get('action', '').lower()

        # Process remove action if any

        if action == 'remove':
            self.__do_remove(row, uri)
            continue

        # Collect merge info if any.  An action containing '1' names a secondary of the group whose key
        # is the text before the '1'; any other action names the group and this row is its primary

        if action != '':
            k = action.find('1')
            if k > -1:
                key = action[0:k]
                if key not in merges:
                    merges[key] = {'primary': None, 'secondary': [uri]}
                else:
                    merges[key]['secondary'].append(uri)
            else:
                group = merges.setdefault(action, {})
                group['primary'] = uri
                group.setdefault('secondary', [])

        # For this row, process all the column_defs and then process closure defs if any.  Closures allow
        # columns to be "reused" providing additional paths from the row entity to entities in the paths.

        for column_name, column_def in self.update_def['column_defs'].items() + \
                self.update_def.get('closure_defs', {}).items():
            if column_name not in data_update:
                continue  # extra column names are allowed in the spreadsheet for annotation
            uri = entity_uri
            if data_update[column_name] == '':
                logger.debug(u"Skipping blank value. row {} column {}".format(row, column_name))
                continue

            column_def_len = len(column_def)
            logger.debug(u"column_def length is: {}".format(column_def_len))

            if column_def_len > 3:
                raise PathLengthException(
                    u"ERROR: Path lengths > 3 not supported. Path length for {} is {}"
                    .format(column_name, column_def_len))
            elif column_def_len == 3:
                self.__do_three_step_update(row, column_name, uri, column_def, data_update)
            elif column_def_len == 2:
                self.__do_two_step_update(row, column_name, uri, column_def, data_update)
            elif column_def_len == 1:
                step_def = column_def[0]
                vivo_objs = {unicode(o): o for s, p, o in
                             get_step_triples(self.update_graph, uri, step_def, self.query_parms)}
                column_values = prepare_column_values(data_update[column_name], self.intra, step_def,
                                                      self.enum, row, column_name)
                logger.debug(u"{} {} {} {} {}".format(row, column_name, column_values, uri, vivo_objs))
                self.__do_the_update(row, column_name, uri, step_def, column_values, vivo_objs)

    # NOTE(review): any() tests the truthiness of the merge keys, not whether merges is non-empty.  A
    # lone group keyed '' (action "1") would not trigger __do_merges -- confirm this is intended
    if any(merges):
        self.__do_merges(merges)

    # Return the add and sub graphs representing the changes that need to be made to the original.
    # Serializing the graphs is only worth doing when the INFO messages will actually be emitted.
    # NOTE(review): assumes the module-level logger is a standard logging.Logger -- confirm

    report = logger.isEnabledFor(logging.INFO)
    add = self.update_graph - self.original_graph  # Triples in update that are not in original
    if report:
        logger.info(u"Triples to add\n{}".format(add.serialize(format='nt')))
    sub = self.original_graph - self.update_graph  # Triples in original that are not in update
    if report:
        logger.info(u"Triples to sub\n{}".format(sub.serialize(format='nt')))
    return [add, sub]