Пример #1
0
def import_concepts(reference_data):
    """
    Persist concepts, their values and their relations from ``reference_data``.

    ``reference_data`` is indexed positionally: item 0 supplies the
    'concepts' list, item 1 the 'values' list and item 2 the 'relations'
    list.  Every concept and value is saved as it is read; value types that
    are not yet known are created with category 'undefined' and namespace
    'arches'.  A relation is only added when both of its endpoints were
    among the concepts saved by this call.
    """
    concept_rows = reference_data[0]['concepts']
    value_rows = reference_data[1]['values']
    relation_rows = reference_data[2]['relations']

    # concepts saved during this call, keyed by the id they carry after save()
    saved_concepts = {}
    for row in concept_rows:
        new_concept = Concept()
        new_concept.id = row['conceptid']
        new_concept.nodetype = row['nodetype']
        new_concept.legacyoid = row['legacyoid']
        new_concept.save()
        saved_concepts[new_concept.id] = new_concept

    # value type names already stored, extended as new ones get created below
    known_valuetypes = [vt.valuetype for vt in models.DValueType.objects.all()]
    for row in value_rows:
        valuetype_name = row['valuetype']
        if valuetype_name not in known_valuetypes:
            models.DValueType.objects.create(valuetype=valuetype_name, category='undefined', namespace='arches')
            known_valuetypes.append(valuetype_name)

        new_value = ConceptValue()
        new_value.id = row['valueid']
        new_value.conceptid = row['conceptid']
        new_value.type = row['valuetype']
        new_value.value = row['value']
        new_value.language = row['languageid']
        new_value.save()

    for row in relation_rows:
        # skip relations that point at a concept this call did not save
        source_id = row['conceptidfrom']
        if source_id not in saved_concepts:
            continue
        target_id = row['conceptidto']
        if target_id not in saved_concepts:
            continue

        source = saved_concepts[source_id]
        target = saved_concepts[target_id]
        source.add_relation(target, row['relationtype'])
Пример #2
0
def import_concepts(reference_data):
    """
    Save the concepts, concept values and concept relations found in
    ``reference_data``.

    The first three items of ``reference_data`` are read by position and
    must provide the keys 'concepts', 'values' and 'relations' in that
    order.  Unknown value types are registered on the fly (category
    'undefined', namespace 'arches').  Relations whose source or target
    concept was not saved by this call are ignored.
    """
    concepts, values, relations = (
        reference_data[0]['concepts'],
        reference_data[1]['values'],
        reference_data[2]['relations'],
    )

    # attribute name on the model object -> key in the incoming record
    concept_fields = (
        ('id', 'conceptid'),
        ('nodetype', 'nodetype'),
        ('legacyoid', 'legacyoid'),
    )
    value_fields = (
        ('id', 'valueid'),
        ('conceptid', 'conceptid'),
        ('type', 'valuetype'),
        ('value', 'value'),
        ('language', 'languageid'),
    )

    by_id = {}
    for record in concepts:
        node = Concept()
        for attr, key in concept_fields:
            setattr(node, attr, record[key])
        node.save()
        by_id[node.id] = node

    seen_types = [
        entry.valuetype for entry in models.DValueType.objects.all()
    ]
    for record in values:
        # make sure the value type exists before a value referencing it is saved
        if record['valuetype'] not in seen_types:
            models.DValueType.objects.create(
                valuetype=record['valuetype'],
                category='undefined',
                namespace='arches')
            seen_types.append(record['valuetype'])

        item = ConceptValue()
        for attr, key in value_fields:
            setattr(item, attr, record[key])
        item.save()

    for edge in relations:
        endpoints_known = (edge['conceptidfrom'] in by_id
                           and edge['conceptidto'] in by_id)
        if not endpoints_known:
            continue
        origin = by_id[edge['conceptidfrom']]
        destination = by_id[edge['conceptidto']]
        origin.add_relation(destination, edge['relationtype'])
Пример #3
0
def load_authority_file(cursor, path_to_authority_files, filename, auth_file_to_entity_concept_mapping):
    """
    Load one authority document CSV, plus its optional ``.values.csv``
    companion, and store the resulting concepts, values and relations.

    Arguments:
    cursor -- database cursor used to insert rows into ``concepts.relations``
    path_to_authority_files -- directory that contains ``filename``
    filename -- name of the authority document CSV file
    auth_file_to_entity_concept_mapping -- mapping keyed by authority file name;
        for a matching file, each item's 'ENTITYTYPE_CONCEPTID' receives a
        'member' relationship to every row whose parent is the file itself

    Returns a list of error strings, empty when nothing failed.  Each of the
    three phases (concept file, values file, save/insert) contributes its own
    'ERRORS IN FILE: ...' header followed by its errors and a blank-line
    separator, but only when that phase actually produced errors.

    Row-level problems are collected rather than raised; an exception from
    ``index()`` or from building a relation's parameters still propagates.
    """
    print(filename.upper())

    value_types = models.ValueTypes.objects.all()
    filepath = os.path.join(path_to_authority_files, filename)
    values_filepath = filepath.replace('.csv', '.values.csv')
    unicodecsv.field_size_limit(sys.maxint)
    errors = []
    lookups = Lookups()

    def report(label, phase_errors):
        # wrap the errors of one phase with a header naming the file they belong to
        if len(phase_errors) > 0:
            errors.append('ERRORS IN FILE: %s\n' % (label))
            errors.extend(phase_errors)
            errors.append('\n\n\n\n')

    #create nodes for each authority document file and relate them to the authority document node in the concept schema
    auth_doc_file_name = str(filename)
    display_file_name = string.capwords(auth_doc_file_name.replace('_',' ').replace('AUTHORITY DOCUMENT.csv', '').strip())
    if auth_doc_file_name.upper() != 'ARCHES RESOURCE CROSS-REFERENCE RELATIONSHIP TYPES.E32.CSV':
        top_concept = Concept()
        top_concept.id = str(uuid.uuid4())
        top_concept.nodetype = 'Concept'
        top_concept.legacyoid = auth_doc_file_name
        top_concept.addvalue({'value':display_file_name, 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel', 'category': 'label'})
        lookups.add_relationship(source='00000000-0000-0000-0000-000000000001', type='hasTopConcept', target=top_concept.id)

    else:
        # the cross-reference relationship types file hangs off an existing, fixed concept
        top_concept = Concept().get(id = '00000000-0000-0000-0000-000000000005')
        top_concept.legacyoid = 'ARCHES RESOURCE CROSS-REFERENCE RELATIONSHIP TYPES.E32.csv'

    lookups.add_lookup(concept=top_concept, rownum=0)

    # ---- phase 1: the concept file ----
    concept_errors = []
    try:
        with open(filepath, 'rU') as f:
            rows = unicodecsv.DictReader(f, fieldnames=['CONCEPTID','PREFLABEL','ALTLABELS','PARENTCONCEPTID','CONCEPTTYPE','PROVIDER'],
                encoding='utf-8-sig', delimiter=',', restkey='ADDITIONAL', restval='MISSING')
            next(rows) # skip header row
            for row in rows:
                try:
                    # NOTE(review): restval fills the *values* of short rows, while this
                    # tests the dict *keys*, so it probably never fires -- confirm before
                    # tightening, since stricter checking could reject files that
                    # currently load.
                    if 'MISSING' in row:
                        raise Exception('The row wasn\'t parsed properly. Missing %s' % (row['MISSING']))

                    # validate before anything is registered, so a rejected row
                    # cannot leave relationships that point at an unsaved concept
                    concept_type = row[u'CONCEPTTYPE'].upper()
                    if row[u'PARENTCONCEPTID'] == '' or (concept_type != 'INDEX' and concept_type != 'COLLECTOR'):
                        raise Exception('The row has invalid values.')

                    legacyoid = row[u'CONCEPTID']
                    concept = Concept()
                    # reuse the legacy id when it already is a uuid, otherwise mint one
                    concept.id = legacyoid if is_uuid(legacyoid) == True else str(uuid.uuid4())
                    concept.nodetype = 'Concept'
                    concept.legacyoid = row[u'CONCEPTID']
                    concept.addvalue({'value':row[u'PREFLABEL'], 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel', 'category': 'label'})
                    if row['CONCEPTTYPE'].lower() == 'collector':
                        concept.addvalue({'value':row[u'PREFLABEL'], 'language': settings.LANGUAGE_CODE, 'type': 'collector', 'category': 'label'})
                    if row[u'ALTLABELS'] != '':
                        for altlabel in row[u'ALTLABELS'].split(';'):
                            concept.addvalue({'value':altlabel, 'language': settings.LANGUAGE_CODE, 'type': 'altLabel', 'category': 'label'})

                    parent_concept_id = lookups.get_lookup(legacyoid=row[u'PARENTCONCEPTID']).id
                    lookups.add_relationship(source=parent_concept_id, type='narrower', target=concept.id, rownum=rows.line_num)
                    # don't add a member relationship between a top concept and its children
                    if parent_concept_id != top_concept.id:
                        lookups.add_relationship(source=parent_concept_id, type='member', target=concept.id, rownum=rows.line_num)

                    # add the member relationship from the E55 type (typically) to their top members
                    if auth_doc_file_name in auth_file_to_entity_concept_mapping and row[u'PARENTCONCEPTID'] == auth_doc_file_name:
                        for entitytype_info in auth_file_to_entity_concept_mapping[auth_doc_file_name]:
                            lookups.add_relationship(source=entitytype_info['ENTITYTYPE_CONCEPTID'], type='member', target=concept.id, rownum=rows.line_num)

                    lookups.add_lookup(concept=concept, rownum=rows.line_num)

                except Exception as e:
                    concept_errors.append('ERROR in row %s: %s' % (rows.line_num, str(e)))

    except UnicodeDecodeError as e:
        concept_errors.append('ERROR: Make sure the file is saved with UTF-8 encoding\n%s\n%s' % (str(e), traceback.format_exc()))
    except Exception as e:
        concept_errors.append('ERROR: %s\n%s' % (str(e), traceback.format_exc()))

    report(filename, concept_errors)

    # ---- phase 2: the optional values file ----
    value_errors = []
    try:
        # try and open the values file if it exists
        if exists(values_filepath):
            with open(values_filepath, 'rU') as f:
                rows = unicodecsv.DictReader(f, fieldnames=['CONCEPTID','VALUE','VALUETYPE','PROVIDER'],
                    encoding='utf-8-sig', delimiter=',', restkey='ADDITIONAL', restval='MISSING')
                next(rows) # skip header row
                for row in rows:
                    try:
                        # surplus columns land under the restkey, usually an unquoted comma
                        if 'ADDITIONAL' in row:
                            raise Exception('The row wasn\'t parsed properly. Additional fields found %s.  Add quotes to values that have commas in them.' % (row['ADDITIONAL']))

                        row_valuetype = row[u'VALUETYPE'].strip()
                        if row_valuetype not in value_types.values_list('valuetype', flat=True):
                            valuetype = models.ValueTypes()
                            valuetype.valuetype = row_valuetype
                            valuetype.category = 'undefined'
                            valuetype.namespace = 'arches'
                            valuetype.save()

                        value_types = models.ValueTypes.objects.all()
                        concept = lookups.get_lookup(legacyoid=row[u'CONCEPTID'])
                        category = value_types.get(valuetype=row_valuetype).category
                        # store the stripped type, i.e. the same text that was looked up/created above
                        concept.addvalue({'value':row[u'VALUE'], 'type': row_valuetype, 'category': category})

                    except Exception as e:
                        value_errors.append('ERROR in row %s (%s): %s' % (rows.line_num, str(e), row))

    except UnicodeDecodeError as e:
        value_errors.append('ERROR: Make sure the file is saved with UTF-8 encoding\n%s\n%s' % (str(e), traceback.format_exc()))
    except Exception as e:
        value_errors.append('ERROR: %s\n%s' % (str(e), traceback.format_exc()))

    report(filename.replace('.csv', '.values.csv'), value_errors)

    # ---- phase 3: persist everything that was collected ----
    save_errors = []

    # insert and index the concepts
    for key in lookups.lookup:
        entry = lookups.lookup[key]
        try:
            entry['concept'].save()
        except Exception as e:
            save_errors.append('ERROR in row %s (%s):\n%s\n' % (entry['rownum'], str(e), traceback.format_exc()))

        # NOTE(review): index() also runs when save() failed, and an exception
        # raised here aborts the whole load -- confirm that this is intended.
        entry['concept'].index(scheme=top_concept)

    # insert the concept relations; values are bound by the driver rather than
    # pasted into the statement, so quotes in an id or type cannot break it
    sql = """
        INSERT INTO concepts.relations(conceptidfrom, conceptidto, relationtype)
        VALUES (%s, %s, %s);
    """
    for relation in lookups.concept_relationships:
        # stringify exactly as the former string-built statement did
        params = ['%s' % relation[field] for field in ('source', 'target', 'type')]
        try:
            cursor.execute(sql, params)
        except Exception as e:
            save_errors.append('ERROR in row %s (%s):\n%s\n' % (relation['rownum'], str(e), traceback.format_exc()))

    report(filename, save_errors)

    return errors