def remove_entitytypes_and_concepts(all_entitytypeids_to_remove, only_entitytypes=False):
    """Delete unmapped entity types and prune the concepts only they used.

    For every given entity type id that no ``Mappings`` row targets (via
    ``entitytypeidto``), the matching ``EntityTypes`` rows are deleted. The
    concept attached to each of those rows is deleted as well, unless more
    than one ``EntityTypes`` row references that concept.

    Args:
        all_entitytypeids_to_remove: a single entity type id, or a list of them.
        only_entitytypes: accepted for callers, but not read by this function.
    """
    if isinstance(all_entitytypeids_to_remove, list):
        candidate_ids = all_entitytypeids_to_remove
    else:
        candidate_ids = [all_entitytypeids_to_remove]

    for candidate_id in candidate_ids:
        # An entity type that is still the target of a mapping is left alone.
        if models.Mappings.objects.filter(entitytypeidto=candidate_id):
            continue

        doomed_types = models.EntityTypes.objects.filter(
            entitytypeid=candidate_id)

        # Work out which concepts can go *before* the entity types are
        # removed, while the rows that reference them can still be counted.
        prunable_concepts = []
        for doomed_type in doomed_types:
            root_concept = doomed_type.conceptid
            sharing_types = models.EntityTypes.objects.filter(
                conceptid=root_concept.pk)
            if len(sharing_types) > 1:
                # Another entity type still uses this concept; keep it.
                logging.warning(
                    "Concept type for entity in use (perhaps because this node was mapped to a new one). Not deleting. %s",
                    doomed_type)
            else:
                prunable_concepts.append(doomed_type.conceptid)

        # Entity types first, then the concepts they pointed at.
        doomed_types.delete()

        for concept_row in prunable_concepts:
            logging.warning(
                "Removing concept and children/values/relationships for %s",
                concept_row.legacyoid)
            doomed_concept = Concept()
            doomed_concept.get(concept_row.pk,
                               semantic=False,
                               include_subconcepts=False)
            doomed_concept.delete(delete_self=True)
def remove_concept_list(concepts):
    """Delete the concepts whose ids are listed in the first column of a CSV file.

    A concept is deleted only when at most one ``EntityTypes`` row references
    it; otherwise a warning is logged and the concept is kept.

    Args:
        concepts: path of the CSV file. The first field of every non-empty
            row is used as a concept id.
    """
    with open(concepts, 'rb') as csvfile:
        try:
            csv.Sniffer().sniff(csvfile.read(1024))
        except csv.Error:
            # Best effort: the file is still read below.
            print("The source data is not a CSV file")
        finally:
            # Rewind whether or not sniffing worked. Previously a failed
            # sniff left the reader positioned 1024 bytes into the file.
            csvfile.seek(0)
        # Blank lines come back as empty rows; skip them instead of failing
        # on row[0].
        concept_ids = [
            row[0] for row in csv.reader(csvfile, delimiter=',') if row
        ]

    # Same figure as before (number of lines in the file), but the handle
    # used for counting is now closed.
    with open(concepts) as counted_file:
        line_count = sum(1 for _ in counted_file)
    print("There are %s  concepts that will be deleted" % line_count)

    concepts_to_delete = []
    for concept_id in concept_ids:
        relations = models.EntityTypes.objects.filter(conceptid=concept_id)
        if len(relations) <= 1:
            concepts_to_delete.append(concept_id)
        else:
            logging.warning(
                "Concept type for entity in use (perhaps because this node was mapped to a new one). Not deleting. %s",
                concept_id)

    for concept_id in concepts_to_delete:
        # remove it and all of its relations and their values
        logging.warning(
            "Removing concept and children/values/relationships for %s",
            concept_id)
        concept = Concept()
        try:
            concept.get(concept_id, semantic=False, include_subconcepts=True)
            concept.delete(delete_self=True)
            # The id is a plain string from the CSV; the old extra
            # ``.delete()`` call on it always raised, so every successful
            # removal was reported as "does not exist".
        except Exception as e:
            print("Conceptid %s does not exist" % concept_id)
            logging.warning("Unable to remove concept %s: %s", concept_id, e)
def validate_values(settings=None):
    """Check that every domain value belongs to its entity type's concept tree.

    For each ``Domains`` row the concept of the chosen value must either be
    directly related (in either direction) to the concept of the row's entity
    type, or appear somewhere in that concept's flattened tree. Offending
    rows are logged and, if any were found, written to
    ``logs/concept_value_errors.txt`` under ``settings.PACKAGE_ROOT``.

    Args:
        settings: object providing ``PACKAGE_ROOT``; ``django.conf.settings``
            is used when this is falsy.
    """
    if not settings:
        from django.conf import settings

    # Set to an integer (e.g. 10000) to validate only the first N rows.
    MAX_TO_CHECK = False
    domains = models.Domains.objects.all().order_by('pk')
    if MAX_TO_CHECK:
        domains = domains[:MAX_TO_CHECK]
    logging.warning("validating %s values", len(domains))

    invalid_values = []
    # expected concept pk -> ids of every concept in its tree. Many domain
    # rows share an expected concept, so each tree is built only once.
    tree_ids_by_concept = {}

    for domain in domains:
        # domain -> entity -> entity_type -> concept
        expected_concept = domain.entityid.entitytypeid.conceptid
        # domain -> value -> concept
        selected_concept = domain.val.conceptid

        direct_relations = models.ConceptRelations.objects.filter(
            (Q(conceptidfrom=expected_concept.pk)
             & Q(conceptidto=selected_concept.pk))
            | (Q(conceptidfrom=selected_concept.pk)
               & Q(conceptidto=expected_concept.pk)))
        if len(direct_relations) != 0:
            continue

        # Not a direct relative; it may still be a more distant descendant,
        # so check the whole tree of the expected concept.
        if expected_concept.pk not in tree_ids_by_concept:
            full_concept = Concept()
            full_concept.get(expected_concept.pk,
                             include_subconcepts=True,
                             semantic=False)
            tree_ids_by_concept[expected_concept.pk] = set(
                x.id for x in full_concept.flatten())

        if selected_concept.pk not in tree_ids_by_concept[expected_concept.pk]:
            entry = (domain.entityid.pk, domain.entityid.entitytypeid.pk,
                     expected_concept.legacyoid,
                     domain.val.conceptid.legacyoid, domain.val.value)
            invalid_values.append(entry)
            logging.warning("chosen value not related to parent concept.")
            logging.warning(",".join(entry))

    if invalid_values:
        print("invalid values were found. See logs/concept_value_errors.txt")
        report_path = os.path.join(settings.PACKAGE_ROOT, 'logs',
                                   'concept_value_errors.txt')
        report_lines = [
            'entity id, entity type, expected parent concept (collection), selected value concept, value'
        ]
        report_lines.extend(','.join(v) for v in invalid_values)
        # One write with the header on its own line. The separate header and
        # row writes used before had no newline between them, so the header
        # was either overwritten or fused with the first row.
        utils.write_to_file(report_path, '\n'.join(report_lines))
    else:
        print("All values were found to be valid")
def load_authority_file(cursor, path_to_authority_files, filename, auth_file_to_entity_concept_mapping):
    """Load one authority-document CSV, and its optional values file, as concepts.

    NOTE(review): this file defines ``load_authority_file`` a second time
    further down; that later definition is the one bound to the name once
    the module has been imported.

    Steps:
      1. Create (or, for the cross-reference relationship types file, fetch)
         the top concept for the file and register it with the lookups.
      2. Read ``filename`` row by row, building one concept per row and
         recording its relationships.
      3. If ``<filename minus .csv>.values.csv`` exists, attach the extra
         values it lists to the concepts built in step 2.
      4. Save and index every registered concept, then insert the recorded
         relationships through ``cursor``.

    Args:
        cursor: database cursor used to insert rows into ``relations``.
        path_to_authority_files: directory that contains ``filename``.
        filename: name of the authority document CSV file.
        auth_file_to_entity_concept_mapping: not read by this function.

    Returns:
        A list of error message strings; empty when nothing went wrong.
    """
    print(filename.upper())

    value_types = models.DValueType.objects.all()
    filepath = os.path.join(path_to_authority_files, filename)
    unicodecsv.field_size_limit(sys.maxint)
    errors = []
    lookups = Lookups()

    # create nodes for each authority document file and relate them to the
    # authority document node in the concept schema
    auth_doc_file_name = str(filename)
    display_file_name = string.capwords(
        auth_doc_file_name.replace('_', ' ').replace('AUTHORITY DOCUMENT.csv', '').strip())

    cross_reference_file = 'ARCHES RESOURCE CROSS-REFERENCE RELATIONSHIP TYPES.E32.csv'
    # Decided once, case-insensitively, and reused for every row so the two
    # places that special-case this file cannot disagree.
    is_cross_reference_file = auth_doc_file_name.upper() == cross_reference_file.upper()

    if not is_cross_reference_file:
        top_concept = Concept()
        top_concept.id = str(uuid.uuid4())
        top_concept.nodetype = 'Concept'
        top_concept.legacyoid = auth_doc_file_name
        top_concept.addvalue({'value': display_file_name, 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel', 'category': 'label'})
        lookups.add_relationship(source='00000000-0000-0000-0000-000000000001', type='hasTopConcept', target=top_concept.id)

        collector_concept = Concept()
        collector_concept.id = str(uuid.uuid4())
        collector_concept.nodetype = 'Collection'
        collector_concept.legacyoid = auth_doc_file_name.split('.')[0]
        collector_concept.addvalue({'value': display_file_name, 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel', 'category': 'label'})
        collector_concept.save()
        lookups.add_relationship(source='00000000-0000-0000-0000-000000000003', type='hasCollection', target=collector_concept.id)
    else:
        top_concept = Concept().get(id='00000000-0000-0000-0000-000000000005')
        top_concept.legacyoid = cross_reference_file

    lookups.add_lookup(concept=top_concept, rownum=0)

    try:
        with open(filepath, 'rU') as f:
            rows = unicodecsv.DictReader(
                f,
                fieldnames=['CONCEPTID', 'PREFLABEL', 'ALTLABELS', 'PARENTCONCEPTID', 'CONCEPTTYPE', 'PROVIDER'],
                encoding='utf-8-sig', delimiter=',', restkey='ADDITIONAL', restval='MISSING')
            rows.next()  # skip header row
            for row in rows:
                try:
                    # NOTE(review): restval is the *value* given to absent
                    # fields, so a key named 'MISSING' is not expected in
                    # row and this check may never fire. Kept as written;
                    # confirm the intent before tightening it.
                    if 'MISSING' in row:
                        raise Exception('The row wasn\'t parsed properly. Missing %s' % (row['MISSING']))

                    legacyoid = row[u'CONCEPTID']
                    concept = Concept()
                    concept.id = legacyoid if is_uuid(legacyoid) == True else str(uuid.uuid4())
                    concept.nodetype = 'Concept'
                    concept.legacyoid = row[u'CONCEPTID']
                    concept.addvalue({'value': row[u'PREFLABEL'], 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel', 'category': 'label'})
                    if row['CONCEPTTYPE'].lower() == 'collector':
                        concept.addvalue({'value': row[u'PREFLABEL'], 'language': settings.LANGUAGE_CODE, 'type': 'collector', 'category': 'label'})
                    if row[u'ALTLABELS'] != '':
                        for altlabel in row[u'ALTLABELS'].split(';'):
                            concept.addvalue({'value': altlabel, 'language': settings.LANGUAGE_CODE, 'type': 'altLabel', 'category': 'label'})

                    parent_concept_id = lookups.get_lookup(legacyoid=row[u'PARENTCONCEPTID']).id
                    lookups.add_relationship(source=parent_concept_id, type='narrower', target=concept.id, rownum=rows.line_num)
                    # NOTE(review): the guard that skipped this for children
                    # of the top concept (parent_concept_id != top_concept.id)
                    # appears to be commented out, so the member relationship
                    # is reconstructed here as unconditional; confirm.
                    lookups.add_relationship(source=parent_concept_id, type='member', target=concept.id, rownum=rows.line_num)

                    # add the member relationship from the authority document collector concept
                    if row[u'PARENTCONCEPTID'] == auth_doc_file_name and not is_cross_reference_file:
                        authdoc_concept = Concept()
                        authdoc_concept.get(legacyoid=auth_doc_file_name.split('.')[0])
                        lookups.add_relationship(source=authdoc_concept.id, type='member', target=concept.id, rownum=rows.line_num)

                    if row[u'PARENTCONCEPTID'] == '' or (row[u'CONCEPTTYPE'].upper() != 'INDEX' and row[u'CONCEPTTYPE'].upper() != 'COLLECTOR'):
                        raise Exception('The row has invalid values.')

                    lookups.add_lookup(concept=concept, rownum=rows.line_num)
                except Exception as e:
                    errors.append('ERROR in row %s: %s' % (rows.line_num, str(e)))
    except UnicodeDecodeError as e:
        errors.append('ERROR: Make sure the file is saved with UTF-8 encoding\n%s\n%s' % (str(e), traceback.format_exc()))
    except Exception as e:
        errors.append('ERROR: %s\n%s' % (str(e), traceback.format_exc()))

    if len(errors) > 0:
        errors.insert(0, 'ERRORS IN FILE: %s\n' % (filename))
        errors.append('\n\n\n\n')

    values_filepath = filepath.replace('.csv', '.values.csv')
    try:
        # try and open the values file if it exists
        if exists(values_filepath):
            with open(values_filepath, 'rU') as f:
                rows = unicodecsv.DictReader(
                    f,
                    fieldnames=['CONCEPTID', 'VALUE', 'VALUETYPE', 'PROVIDER'],
                    encoding='utf-8-sig', delimiter=',', restkey='ADDITIONAL', restval='MISSING')
                rows.next()  # skip header row
                for row in rows:
                    try:
                        if 'ADDITIONAL' in row:
                            raise Exception('The row wasn\'t parsed properly. Additional fields found %s.  Add quotes to values that have commas in them.' % (row['ADDITIONAL']))

                        row_valuetype = row[u'VALUETYPE'].strip()
                        if row_valuetype not in value_types.values_list('valuetype', flat=True):
                            # Unknown value type: register it, then refresh
                            # the queryset so the lookup below can see it.
                            valuetype = models.DValueType()
                            valuetype.valuetype = row_valuetype
                            valuetype.category = 'undefined'
                            valuetype.namespace = 'arches'
                            valuetype.save()
                            value_types = models.DValueType.objects.all()

                        concept = lookups.get_lookup(legacyoid=row[u'CONCEPTID'])
                        category = value_types.get(valuetype=row_valuetype).category
                        # Store the stripped type, i.e. the same string that
                        # was looked up / registered just above.
                        concept.addvalue({'value': row[u'VALUE'], 'type': row_valuetype, 'category': category})
                    except Exception as e:
                        errors.append('ERROR in row %s (%s): %s' % (rows.line_num, str(e), row))
    except UnicodeDecodeError as e:
        errors.append('ERROR: Make sure the file is saved with UTF-8 encoding\n%s\n%s' % (str(e), traceback.format_exc()))
    except Exception as e:
        errors.append('ERROR: %s\n%s' % (str(e), traceback.format_exc()))

    if len(errors) > 0:
        errors.insert(0, 'ERRORS IN FILE: %s\n' % (values_filepath and filename.replace('.csv', '.values.csv')))
        errors.append('\n\n\n\n')

    # insert and index the concepts
    for key in lookups.lookup:
        registered = lookups.lookup[key]
        try:
            registered['concept'].save()
        except Exception as e:
            errors.append('ERROR in row %s (%s):\n%s\n' % (registered['rownum'], str(e), traceback.format_exc()))
        # Indexing is attempted even when the save above failed.
        registered['concept'].index(scheme=top_concept)

    # insert the concept relations; values are bound as parameters rather
    # than interpolated into the statement text
    insert_relation_sql = """
        INSERT INTO relations(relationid, conceptidfrom, conceptidto, relationtype)
        VALUES (public.uuid_generate_v1mc(), %s, %s, %s);
    """
    for relation in lookups.concept_relationships:
        # '%s' % value mirrors how the values used to be rendered as text.
        params = ['%s' % relation[field] for field in ('source', 'target', 'type')]
        try:
            cursor.execute(insert_relation_sql, params)
        except Exception as e:
            errors.append('ERROR in row %s (%s):\n%s\n' % (relation['rownum'], str(e), traceback.format_exc()))

    if len(errors) > 0:
        errors.insert(0, 'ERRORS IN FILE: %s\n' % (filename))
        errors.append('\n\n\n\n')

    return errors
def update_reference_data(altered_node_list):
    """Point migrated entity types at the concepts of the nodes they replace.

    Meant to run after the new part of the ontology has been inserted. Each
    item of ``altered_node_list`` is read through the keys
    ``OLDENTITYTYPEID`` and ``NEWENTITYTYPEID``; items whose old id is the
    empty string are skipped. Two independent steps run per item, and an
    exception inside either step is logged as a warning rather than raised:

      1. The new entity type takes over the old entity type's concept, and
         that concept is traversed with ``rewrite_concept``.
      2. The same hand-over is done for the entity types named by the
         ``entitytyperange`` of each id's ``-P71`` rule, and the ``en-US``
         value of that concept is set to the new entity type id.
    """
    for node in altered_node_list:
        old_id = node['OLDENTITYTYPEID']
        if old_id == '':
            # ignore empty rows
            continue

        # Step 1: hand the root concept over to the new node.
        try:
            logging.warning("---")
            logging.warning("UPDATING REFERENCE DATA FOR %s",
                            node['NEWENTITYTYPEID'])

            source_type = models.EntityTypes.objects.all().get(
                entitytypeid=old_id)
            target_type = models.EntityTypes.objects.all().get(
                entitytypeid=node['NEWENTITYTYPEID'])

            target_type.conceptid = source_type.conceptid
            target_type.save()

            # Walk the concept, letting rewrite_concept update what it needs
            # to; a failure here is reported separately from the hand-over.
            try:
                walker = Concept()
                walker.get(target_type.conceptid)
                walker.traverse(rewrite_concept, scope=target_type.pk)
            except Exception as e:
                logging.warning(
                    "\n\nUnable to update concepts for migration mapping: %s \n%s",
                    node, e)
        except Exception as e:
            logging.warning(
                "\n\nUnable to update reference data for migration mapping: %s \n%s",
                node, e)

        # Step 2: migrate the authority document concepts as well.
        try:
            logging.warning("Updating auth doc concept for %s", node)

            # Resolve the authority document entity type ids through the
            # '-P71' rule of the old and of the new entity type.
            source_authdoc_id = models.Rules.objects.get(
                entitytypedomain=old_id,
                propertyid='-P71').entitytyperange
            target_authdoc_id = models.Rules.objects.get(
                entitytypedomain=node['NEWENTITYTYPEID'],
                propertyid='-P71').entitytyperange

            source_authdoc_type = models.EntityTypes.objects.get(
                entitytypeid=source_authdoc_id)
            target_authdoc_type = models.EntityTypes.objects.get(
                entitytypeid=target_authdoc_id)

            # point the new authority document entitytype at the concept of
            # the old one
            target_authdoc_type.conceptid = source_authdoc_type.conceptid
            target_authdoc_type.save()

            # update the value
            label_value = models.Values.objects.get(
                conceptid=source_authdoc_type.conceptid,
                languageid='en-US')
            label_value.value = node['NEWENTITYTYPEID']
            label_value.save()
        except Exception as e:
            logging.warning("\n\nUnable to update Entity %s", e)
def load_authority_file(cursor, path_to_authority_files, filename, auth_file_to_entity_concept_mapping):
    """Load one authority-document CSV, and its optional values file, as concepts.

    NOTE(review): an earlier definition with the same name exists in this
    file; being the later one, this definition is what the name refers to
    once the module has been imported.

    Steps:
      1. Create (or, for the cross-reference relationship types file, fetch)
         the top concept for the file and register it with the lookups.
      2. Read ``filename`` row by row, building one concept per row and
         recording its relationships.
      3. If ``<filename minus .csv>.values.csv`` exists, attach the extra
         values it lists to the concepts built in step 2.
      4. Save and index every registered concept, then insert the recorded
         relationships through ``cursor``.

    Args:
        cursor: database cursor used to insert rows into ``relations``.
        path_to_authority_files: directory that contains ``filename``.
        filename: name of the authority document CSV file.
        auth_file_to_entity_concept_mapping: not read by this function.

    Returns:
        A list of error message strings; empty when nothing went wrong.
    """
    print(filename.upper())

    value_types = models.DValueType.objects.all()
    filepath = os.path.join(path_to_authority_files, filename)
    unicodecsv.field_size_limit(sys.maxint)
    errors = []
    lookups = Lookups()

    # create nodes for each authority document file and relate them to the
    # authority document node in the concept schema
    auth_doc_file_name = str(filename)
    display_file_name = string.capwords(
        auth_doc_file_name.replace('_', ' ').replace('AUTHORITY DOCUMENT.csv', '').strip())

    cross_reference_file = 'ARCHES RESOURCE CROSS-REFERENCE RELATIONSHIP TYPES.E32.csv'
    # Decided once, case-insensitively, and reused for every row so the two
    # places that special-case this file cannot disagree.
    is_cross_reference_file = auth_doc_file_name.upper() == cross_reference_file.upper()

    if not is_cross_reference_file:
        top_concept = Concept()
        top_concept.id = str(uuid.uuid4())
        top_concept.nodetype = 'Concept'
        top_concept.legacyoid = auth_doc_file_name
        top_concept.addvalue({
            'value': display_file_name,
            'language': settings.LANGUAGE_CODE,
            'type': 'prefLabel',
            'category': 'label'
        })
        lookups.add_relationship(source='00000000-0000-0000-0000-000000000001',
                                 type='hasTopConcept',
                                 target=top_concept.id)

        collector_concept = Concept()
        collector_concept.id = str(uuid.uuid4())
        collector_concept.nodetype = 'Collection'
        collector_concept.legacyoid = auth_doc_file_name.split('.')[0]
        collector_concept.addvalue({
            'value': display_file_name,
            'language': settings.LANGUAGE_CODE,
            'type': 'prefLabel',
            'category': 'label'
        })
        collector_concept.save()
        lookups.add_relationship(source='00000000-0000-0000-0000-000000000003',
                                 type='hasCollection',
                                 target=collector_concept.id)
    else:
        top_concept = Concept().get(id='00000000-0000-0000-0000-000000000005')
        top_concept.legacyoid = cross_reference_file

    lookups.add_lookup(concept=top_concept, rownum=0)

    try:
        with open(filepath, 'rU') as f:
            rows = unicodecsv.DictReader(f,
                                         fieldnames=[
                                             'CONCEPTID', 'PREFLABEL',
                                             'ALTLABELS', 'PARENTCONCEPTID',
                                             'CONCEPTTYPE', 'PROVIDER'
                                         ],
                                         encoding='utf-8-sig',
                                         delimiter=',',
                                         restkey='ADDITIONAL',
                                         restval='MISSING')
            rows.next()  # skip header row
            for row in rows:
                try:
                    # NOTE(review): restval is the *value* given to absent
                    # fields, so a key named 'MISSING' is not expected in
                    # row and this check may never fire. Kept as written;
                    # confirm the intent before tightening it.
                    if 'MISSING' in row:
                        raise Exception(
                            'The row wasn\'t parsed properly. Missing %s' %
                            (row['MISSING']))

                    legacyoid = row[u'CONCEPTID']
                    concept = Concept()
                    concept.id = legacyoid if is_uuid(
                        legacyoid) == True else str(uuid.uuid4())
                    concept.nodetype = 'Concept'
                    concept.legacyoid = row[u'CONCEPTID']
                    concept.addvalue({
                        'value': row[u'PREFLABEL'],
                        'language': settings.LANGUAGE_CODE,
                        'type': 'prefLabel',
                        'category': 'label'
                    })
                    if row['CONCEPTTYPE'].lower() == 'collector':
                        concept.addvalue({
                            'value': row[u'PREFLABEL'],
                            'language': settings.LANGUAGE_CODE,
                            'type': 'collector',
                            'category': 'label'
                        })
                    if row[u'ALTLABELS'] != '':
                        for altlabel in row[u'ALTLABELS'].split(';'):
                            concept.addvalue({
                                'value': altlabel,
                                'language': settings.LANGUAGE_CODE,
                                'type': 'altLabel',
                                'category': 'label'
                            })

                    parent_concept_id = lookups.get_lookup(
                        legacyoid=row[u'PARENTCONCEPTID']).id
                    lookups.add_relationship(source=parent_concept_id,
                                             type='narrower',
                                             target=concept.id,
                                             rownum=rows.line_num)
                    # NOTE(review): the guard that skipped this for children
                    # of the top concept (parent_concept_id != top_concept.id)
                    # appears to be commented out, so the member relationship
                    # is reconstructed here as unconditional; confirm.
                    lookups.add_relationship(source=parent_concept_id,
                                             type='member',
                                             target=concept.id,
                                             rownum=rows.line_num)

                    # add the member relationship from the authority document collector concept
                    if row[u'PARENTCONCEPTID'] == auth_doc_file_name and not is_cross_reference_file:
                        authdoc_concept = Concept()
                        authdoc_concept.get(
                            legacyoid=auth_doc_file_name.split('.')[0])
                        lookups.add_relationship(source=authdoc_concept.id,
                                                 type='member',
                                                 target=concept.id,
                                                 rownum=rows.line_num)

                    if row[u'PARENTCONCEPTID'] == '' or (
                            row[u'CONCEPTTYPE'].upper() != 'INDEX'
                            and row[u'CONCEPTTYPE'].upper() != 'COLLECTOR'):
                        raise Exception('The row has invalid values.')

                    lookups.add_lookup(concept=concept, rownum=rows.line_num)
                except Exception as e:
                    errors.append('ERROR in row %s: %s' %
                                  (rows.line_num, str(e)))
    except UnicodeDecodeError as e:
        errors.append(
            'ERROR: Make sure the file is saved with UTF-8 encoding\n%s\n%s' %
            (str(e), traceback.format_exc()))
    except Exception as e:
        errors.append('ERROR: %s\n%s' % (str(e), traceback.format_exc()))

    if len(errors) > 0:
        errors.insert(0, 'ERRORS IN FILE: %s\n' % (filename))
        errors.append('\n\n\n\n')

    values_filepath = filepath.replace('.csv', '.values.csv')
    try:
        # try and open the values file if it exists
        if exists(values_filepath):
            with open(values_filepath, 'rU') as f:
                rows = unicodecsv.DictReader(
                    f,
                    fieldnames=['CONCEPTID', 'VALUE', 'VALUETYPE', 'PROVIDER'],
                    encoding='utf-8-sig',
                    delimiter=',',
                    restkey='ADDITIONAL',
                    restval='MISSING')
                rows.next()  # skip header row
                for row in rows:
                    try:
                        if 'ADDITIONAL' in row:
                            raise Exception(
                                'The row wasn\'t parsed properly. Additional fields found %s.  Add quotes to values that have commas in them.'
                                % (row['ADDITIONAL']))

                        row_valuetype = row[u'VALUETYPE'].strip()
                        if row_valuetype not in value_types.values_list(
                                'valuetype', flat=True):
                            # Unknown value type: register it, then refresh
                            # the queryset so the lookup below can see it.
                            valuetype = models.DValueType()
                            valuetype.valuetype = row_valuetype
                            valuetype.category = 'undefined'
                            valuetype.namespace = 'arches'
                            valuetype.save()
                            value_types = models.DValueType.objects.all()

                        concept = lookups.get_lookup(
                            legacyoid=row[u'CONCEPTID'])
                        category = value_types.get(
                            valuetype=row_valuetype).category
                        # Store the stripped type, i.e. the same string that
                        # was looked up / registered just above.
                        concept.addvalue({
                            'value': row[u'VALUE'],
                            'type': row_valuetype,
                            'category': category
                        })
                    except Exception as e:
                        errors.append('ERROR in row %s (%s): %s' %
                                      (rows.line_num, str(e), row))
    except UnicodeDecodeError as e:
        errors.append(
            'ERROR: Make sure the file is saved with UTF-8 encoding\n%s\n%s' %
            (str(e), traceback.format_exc()))
    except Exception as e:
        errors.append('ERROR: %s\n%s' % (str(e), traceback.format_exc()))

    if len(errors) > 0:
        errors.insert(
            0, 'ERRORS IN FILE: %s\n' %
            (filename.replace('.csv', '.values.csv')))
        errors.append('\n\n\n\n')

    # insert and index the concepts
    for key in lookups.lookup:
        registered = lookups.lookup[key]
        try:
            registered['concept'].save()
        except Exception as e:
            errors.append('ERROR in row %s (%s):\n%s\n' %
                          (registered['rownum'], str(e),
                           traceback.format_exc()))
        # Indexing is attempted even when the save above failed.
        registered['concept'].index(scheme=top_concept)

    # insert the concept relations; values are bound as parameters rather
    # than interpolated into the statement text
    insert_relation_sql = """
        INSERT INTO relations(relationid, conceptidfrom, conceptidto, relationtype)
        VALUES (public.uuid_generate_v1mc(), %s, %s, %s);
    """
    for relation in lookups.concept_relationships:
        # '%s' % value mirrors how the values used to be rendered as text.
        params = [
            '%s' % relation[field] for field in ('source', 'target', 'type')
        ]
        try:
            cursor.execute(insert_relation_sql, params)
        except Exception as e:
            errors.append(
                'ERROR in row %s (%s):\n%s\n' %
                (relation['rownum'], str(e), traceback.format_exc()))

    if len(errors) > 0:
        errors.insert(0, 'ERRORS IN FILE: %s\n' % (filename))
        errors.append('\n\n\n\n')

    return errors