def save_resource(self, populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count): # create a resource instance only if there are populated_tiles errors = [] if len(populated_tiles) > 0: newresourceinstance = Resource( resourceinstanceid=resourceinstanceid, graph_id=target_resource_model, legacyid=legacyid, createdtime=datetime.datetime.now()) # add the tiles to the resource instance newresourceinstance.tiles = populated_tiles # if bulk saving then append the resources to a list otherwise just save the resource if bulk: resources.append(newresourceinstance) if len(resources) >= settings.BULK_IMPORT_BATCH_SIZE: Resource.bulk_save(resources=resources) del resources[:] #clear out the array else: newresourceinstance.save() else: errors.append({ 'type': 'WARNING', 'message': 'No resource created for legacyid: {0}. Make sure there is data to be imported for this resource and it is mapped properly in your mapping file.' .format(legacyid) }) if len(errors) > 0: self.errors += errors if save_count % (settings.BULK_IMPORT_BATCH_SIZE / 4) == 0: print '%s resources processed' % str(save_count)
def save_resource(self, populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count): # create a resource instance only if there are populated_tiles errors = [] if len(populated_tiles) > 0: newresourceinstance = Resource( resourceinstanceid=resourceinstanceid, graph_id=target_resource_model, legacyid=legacyid, createdtime=datetime.datetime.now() ) # add the tiles to the resource instance newresourceinstance.tiles = populated_tiles # if bulk saving then append the resources to a list otherwise just save the resource if bulk: resources.append(newresourceinstance) if len(resources) == settings.BULK_IMPORT_BATCH_SIZE: Resource.bulk_save(resources=resources) del resources[:] #clear out the array else: newresourceinstance.save() else: errors.append({'type': 'WARNING', 'message': 'No resource created for legacyid: {0}. Make sure there is data to be imported for this resource and it is mapped properly in your mapping file.'.format(legacyid)}) if len(errors) > 0: self.errors += errors if save_count % (settings.BULK_IMPORT_BATCH_SIZE/4) == 0: print '%s resources processed' % str(save_count)
def save_resource(self, populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count):
    """Persist a single resource instance built from populated_tiles.

    In bulk mode the instance is queued on the caller-owned ``resources``
    list and flushed in batches; otherwise it is saved immediately, and any
    indexing failure (Elasticsearch TransportError or other exception) is
    recorded as a WARNING and the half-saved instance is deleted again.
    """
    # create a resource instance only if there are populated_tiles
    errors = []
    if len(populated_tiles) > 0:
        newresourceinstance = Resource(
            resourceinstanceid=resourceinstanceid,
            graph_id=target_resource_model,
            legacyid=legacyid,
            createdtime=datetime.datetime.now()
        )
        # add the tiles to the resource instance
        newresourceinstance.tiles = populated_tiles
        # if bulk saving then append the resources to a list otherwise just save the resource
        if bulk:
            resources.append(newresourceinstance)
            if len(resources) >= settings.BULK_IMPORT_BATCH_SIZE:
                Resource.bulk_save(resources=resources)
                del resources[:] #clear out the array
        else:
            try:
                newresourceinstance.save()
            # TransportError must stay before the generic handler: it carries
            # structured failure detail in e.info that the broad clause lacks
            except TransportError as e:
                cause = json.dumps(e.info['error']['caused_by'],indent=1)
                msg = '%s: WARNING: failed to index document in resource: %s. Exception detail:\n%s\n' % (datetime.datetime.now(), resourceinstanceid, cause)
                errors.append({'type': 'WARNING', 'message': msg})
                # roll back the partially-saved instance
                newresourceinstance.delete()
                # NOTE(review): this decrement only changes the local binding;
                # the caller's save_count is unaffected (ints are immutable and
                # nothing is returned) — confirm whether that is intended
                save_count=save_count-1
            except Exception as e:
                msg = '%s: WARNING: failed to index document in resource: %s. Exception detail:\n%s\n' % (datetime.datetime.now(), resourceinstanceid, e)
                errors.append({'type': 'WARNING', 'message': msg})
                newresourceinstance.delete()
                save_count=save_count-1
    else:
        errors.append({'type': 'WARNING', 'message': 'No resource created for legacyid: {0}. Make sure there is data to be imported for this resource and it is mapped properly in your mapping file.'.format(legacyid)})
    if len(errors) > 0:
        self.errors += errors
    # periodic progress report every quarter-batch
    if save_count % (settings.BULK_IMPORT_BATCH_SIZE/4) == 0:
        print '%s resources processed' % str(save_count)
def save_resource(self, populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count, row_number): # create a resource instance only if there are populated_tiles errors = [] if len(populated_tiles) > 0: newresourceinstance = Resource( resourceinstanceid=resourceinstanceid, graph_id=target_resource_model, legacyid=legacyid, createdtime=datetime.datetime.now() ) # add the tiles to the resource instance newresourceinstance.tiles = populated_tiles # if bulk saving then append the resources to a list otherwise just save the resource if bulk: resources.append(newresourceinstance) if len(resources) >= settings.BULK_IMPORT_BATCH_SIZE: Resource.bulk_save(resources=resources) del resources[:] #clear out the array else: try: newresourceinstance.save() except TransportError as e: cause = json.dumps(e.info['error']['caused_by'],indent=1) msg = '%s: WARNING: failed to index document in resource: %s %s. Exception detail:\n%s\n' % (datetime.datetime.now(), resourceinstanceid, row_number, cause) errors.append({'type': 'WARNING', 'message': msg}) newresourceinstance.delete() save_count=save_count-1 except Exception as e: msg = '%s: WARNING: failed to index document in resource: %s %s. Exception detail:\n%s\n' % (datetime.datetime.now(), resourceinstanceid, row_number, e) errors.append({'type': 'WARNING', 'message': msg}) newresourceinstance.delete() save_count=save_count-1 else: errors.append({'type': 'WARNING', 'message': 'No resource created for legacyid: {0}. Make sure there is data to be imported for this resource and it is mapped properly in your mapping file.'.format(legacyid)}) if len(errors) > 0: self.errors += errors if save_count % (settings.BULK_IMPORT_BATCH_SIZE/4) == 0: print '%s resources processed' % str(save_count)
def import_business_data(self, business_data=None, mapping=None, overwrite='append', bulk=False, create_concepts=False, create_collections=False):
    """Import rows of CSV business data as Arches resources/tiles.

    business_data: list of row dicts (must contain a 'ResourceID' column),
    sorted so rows of the same resource are contiguous. mapping: the mapping
    file dict (node targets, resource_model_id). overwrite='overwrite'
    deletes pre-existing resources with a matching id/legacyid.
    create_concepts/create_collections optionally build RDM reference data
    from the incoming values before importing. Errors and warnings accumulate
    on self.errors; the whole import runs inside one transaction.atomic block
    and any exception is printed rather than propagated.
    """
    # errors = businessDataValidator(self.business_data)

    def get_display_nodes(graphid):
        # Collect nodeids that participate in the primary-description display
        # function (function_id below is the well-known Arches display function).
        display_nodeids = []
        functions = FunctionXGraph.objects.filter(function_id='60000000-0000-0000-0000-000000000001', graph_id=graphid)
        for function in functions:
            f = function.config
            del f['triggering_nodegroups']
            for k,v in f.iteritems():
                v['node_ids'] = []
                # template looks like '<Name>, <Type>' — strip brackets, split on ', '
                v['string_template'] = v['string_template'].replace('<', '').replace('>', '').split(', ')
                if v['nodegroup_id'] != '':
                    nodes = Node.objects.filter(nodegroup_id=v['nodegroup_id'])
                    for node in nodes:
                        if node.name in v['string_template']:
                            display_nodeids.append(str(node.nodeid))
            for k,v in f.iteritems():
                if v['string_template'] != ['']:
                    print 'The {0} {1} in the {2} display function.'.format(', '.join(v['string_template']), 'nodes participate' if len(v['string_template']) > 1 else 'node participates', k)
                else:
                    print 'No nodes participate in the {0} display function.'.format(k)
        return display_nodeids

    def process_resourceid(resourceid, overwrite):
        # Resolve a source ResourceID (UUID or legacyid) to the instance UUID
        # to import under, honoring overwrite mode.
        # Test if resourceid is a UUID.
        try:
            resourceinstanceid = uuid.UUID(resourceid)
            # If resourceid is a UUID check if it is already an arches resource.
            try:
                ret = Resource.objects.filter(resourceinstanceid=resourceid)
                # If resourceid is an arches resource and overwrite is true, delete the existing arches resource.
                if overwrite == 'overwrite':
                    Resource.objects.get(pk=str(ret[0].resourceinstanceid)).delete()
                resourceinstanceid = resourceinstanceid
            # If resourceid is not a UUID create one.
            except:
                resourceinstanceid = resourceinstanceid
        except:
            # Get resources with the given legacyid
            ret = Resource.objects.filter(legacyid=resourceid)
            # If more than one resource is returned than make resource = None. This should never actually happen.
            if len(ret) > 1:
                resourceinstanceid = None
            # If no resource is returned with the given legacyid then create an archesid for the resource.
            elif len(ret) == 0:
                resourceinstanceid = uuid.uuid4()
            # If a resource is returned with the give legacyid then return its archesid
            else:
                if overwrite == 'overwrite':
                    Resource.objects.get(pk=str(ret[0].resourceinstanceid)).delete()
                resourceinstanceid = ret[0].resourceinstanceid
        return resourceinstanceid

    try:
        with transaction.atomic():
            save_count = 0
            try:
                resourceinstanceid = process_resourceid(business_data[0]['ResourceID'], overwrite)
            except KeyError:
                print '*'*80
                print 'ERROR: No column \'ResourceID\' found in business data file. Please add a \'ResourceID\' column with a unique resource identifier.'
                print '*'*80
                sys.exit()
            # per-run lookup state shared (via closure) by the nested helpers
            blanktilecache = {}                      # nodeid -> prototype blank Tile
            populated_nodegroups = {}                # resourceinstanceid -> nodegroupids already filled
            populated_nodegroups[resourceinstanceid] = []
            previous_row_resourceid = None
            populated_tiles = []                     # tiles accumulated for the current resource
            target_resource_model = None
            single_cardinality_nodegroups = [str(nodegroupid) for nodegroupid in NodeGroup.objects.values_list('nodegroupid', flat=True).filter(cardinality = '1')]
            node_datatypes = {str(nodeid): datatype for nodeid, datatype in Node.objects.values_list('nodeid', 'datatype').filter(~Q(datatype='semantic'), graph__isresource=True)}
            display_nodes = get_display_nodes(mapping['resource_model_id'])
            all_nodes = Node.objects.all()
            datatype_factory = DataTypeFactory()
            concepts_to_create = {}                  # arches_nodeid -> {conceptid: label}
            new_concepts = {}
            required_nodes = {}
            for node in Node.objects.filter(isrequired=True, graph_id=mapping['resource_model_id']).values_list('nodeid', 'name'):
                required_nodes[str(node[0])] = node[1]

            # This code can probably be moved into it's own module.
            # Pre-validation pass: rows of the same resource must be adjacent,
            # and (optionally) collect concept labels to create in the RDM.
            resourceids = []
            non_contiguous_resource_ids = []
            previous_row_for_validation = None
            for row_number, row in enumerate(business_data):
                # Check contiguousness of csv file.
                if row['ResourceID'] != previous_row_for_validation and row['ResourceID'] in resourceids:
                    non_contiguous_resource_ids.append(row['ResourceID'])
                else:
                    resourceids.append(row['ResourceID'])
                previous_row_for_validation = row['ResourceID']

                if create_concepts == True:
                    for node in mapping['nodes']:
                        if node['data_type'] in ['concept', 'concept-list', 'domain-value', 'domain-value-list'] and node['file_field_name'] in row.keys():
                            # print row[node['file_field_name']]
                            # csv.reader handles quoted, comma-separated multi-values in the cell
                            concept = []
                            for val in csv.reader([row[node['file_field_name']]], delimiter=',', quotechar='"'):
                                concept.append(val)
                            concept = concept[0]
                            # check if collection is in concepts_to_create, add collection to concepts_to_create if it's not and add first child concept
                            if node['arches_nodeid'] not in concepts_to_create:
                                concepts_to_create[node['arches_nodeid']] = {}
                                for concept_value in concept:
                                    concepts_to_create[node['arches_nodeid']][str(uuid.uuid4())] = concept_value
                            # if collection in concepts to create then add child concept to collection
                            elif row[node['file_field_name']] not in concepts_to_create[node['arches_nodeid']].values():
                                for concept_value in concept:
                                    concepts_to_create[node['arches_nodeid']][str(uuid.uuid4())] = concept_value

            if len(non_contiguous_resource_ids) > 0:
                print '*'*80
                for non_contiguous_resource_id in non_contiguous_resource_ids:
                    print 'ResourceID: ' + non_contiguous_resource_id
                print 'ERROR: The preceding ResourceIDs are non-contiguous in your csv file. Please sort your csv file by ResourceID and try import again.'
                print '*'*80
                sys.exit()

            def create_reference_data(new_concepts, create_collections):
                # Build RDM collections/concepts (or domain options) for the
                # labels gathered in concepts_to_create, one group per node.
                errors = []
                candidates = Concept().get(id='00000000-0000-0000-0000-000000000006')
                for arches_nodeid, concepts in new_concepts.iteritems():
                    collectionid = str(uuid.uuid4())
                    topconceptid = str(uuid.uuid4())
                    node = Node.objects.get(nodeid=arches_nodeid)

                    # if node.datatype is concept or concept-list create concepts and collections
                    if node.datatype in ['concept', 'concept-list']:
                        # create collection if create_collections = create, otherwise append to collection already assigned to node
                        if create_collections == True:
                            collection_legacyoid = node.name + '_' + str(node.graph_id) + '_import'
                            # check to see that there is not already a collection for this node
                            if node.config['rdmCollection'] != None:
                                errors.append({'type': 'WARNING', 'message': 'A collection already exists for the {0} node. Use the add option to add concepts to this collection.'.format(node.name)})
                                if len(errors) > 0:
                                    self.errors += errors
                                collection = None
                            else:
                                # if there is no collection assigned to this node, create one and assign it to the node
                                try:
                                    # check to see that a collection with this legacyid does not already exist
                                    collection = Concept().get(legacyoid=collection_legacyoid)
                                    errors.append({'type': 'WARNING', 'message': 'A collection with the legacyid {0} already exists.'.format(node.name + '_' + str(node.graph_id) + '_import')})
                                    if len(errors) > 0:
                                        self.errors += errors
                                except:
                                    collection = Concept({
                                        'id': collectionid,
                                        'legacyoid': collection_legacyoid,
                                        'nodetype': 'Collection'
                                    })
                                    collection.addvalue({'id': str(uuid.uuid4()), 'value': node.name + '_import', 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel'})
                                    node.config['rdmCollection'] = collectionid
                                    node.save()
                                    collection.save()
                        else:
                            # if create collection = add check that there is a collection associated with node, if no collection associated with node create a collection and associated with the node
                            try:
                                collection = Concept().get(id=node.config['rdmCollection'])
                            except:
                                collection = Concept({
                                    'id': collectionid,
                                    'legacyoid': node.name + '_' + str(node.graph_id) + '_import',
                                    'nodetype': 'Collection'
                                })
                                collection.addvalue({'id': str(uuid.uuid4()), 'value': node.name + '_import', 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel'})
                                node.config['rdmCollection'] = collectionid
                                node.save()
                                collection.save()

                        if collection != None:
                            topconcept_legacyoid = node.name + '_' + str(node.graph_id)
                            # Check if top concept already exists, if not create it and add to candidates scheme
                            try:
                                topconcept = Concept().get(legacyoid=topconcept_legacyoid)
                            except:
                                topconcept = Concept({
                                    'id': topconceptid,
                                    'legacyoid': topconcept_legacyoid,
                                    'nodetype': 'Concept'
                                })
                                topconcept.addvalue({'id': str(uuid.uuid4()), 'value': node.name + '_import', 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel'})
                                topconcept.save()
                            candidates.add_relation(topconcept, 'narrower')

                            # create child concepts and relate to top concept and collection accordingly
                            for conceptid, value in concepts.iteritems():
                                concept_legacyoid = value + '_' + node.name + '_' + str(node.graph_id)
                                # check if concept already exists, if not create and add to topconcept and collection
                                try:
                                    conceptid = [concept for concept in topconcept.get_child_concepts(topconcept.id) if concept[1] == value][0][0]
                                    concept = Concept().get(id=conceptid)
                                except:
                                    concept = Concept({
                                        'id': conceptid,
                                        'legacyoid': concept_legacyoid,
                                        'nodetype': 'Concept'
                                    })
                                    concept.addvalue({'id': str(uuid.uuid4()), 'value': value, 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel'})
                                    concept.save()
                                collection.add_relation(concept, 'member')
                                topconcept.add_relation(concept, 'narrower')

                    #if node.datatype is domain or domain-list create options array in node.config
                    elif node.datatype in ['domain-value', 'domain-value-list']:
                        for domainid, value in new_concepts[arches_nodeid].iteritems():
                            # check if value already exists in domain
                            if value not in [t['text'] for t in node.config['options']]:
                                domainvalue = {
                                    "text": value,
                                    "selected": False,
                                    "id": domainid
                                }
                                node.config['options'].append(domainvalue)
                                node.save()

            if create_concepts == True:
                create_reference_data(concepts_to_create, create_collections)
            # if concepts are created on import concept_lookup must be instatiated afterward
            concept_lookup = ConceptLookup()

            def cache(blank_tile):
                # Index a freshly built blank tile (or its child tiles) in
                # blanktilecache under every nodeid it can hold.
                if blank_tile.data != {}:
                    for key in blank_tile.data.keys():
                        if key not in blanktilecache:
                            blanktilecache[str(key)] = blank_tile
                else:
                    for nodegroup, tile in blank_tile.tiles.iteritems():
                        for key in tile[0].data.keys():
                            if key not in blanktilecache:
                                blanktilecache[str(key)] = blank_tile

            def column_names_to_targetids(row, mapping, row_number):
                # Convert a CSV row into [{arches_nodeid: value}, ...] using the
                # mapping file; warns (and still returns []) on ragged rows.
                errors = []
                new_row = []
                if 'ADDITIONAL' in row or 'MISSING' in row:
                    errors.append({'type': 'WARNING', 'message': 'No resource created for ResourceID {0}. Line {1} has additional or missing columns.'.format(row['ResourceID'], str(int(row_number.split('on line ')[1])))})
                    if len(errors) > 0:
                        self.errors += errors
                for key, value in row.iteritems():
                    if value != '':
                        # NOTE(review): the loop variable shadows the outer `row` dict here
                        for row in mapping['nodes']:
                            if key.upper() == row['file_field_name'].upper():
                                new_row.append({row['arches_nodeid']: value})
                return new_row

            def transform_value(datatype, value, source, nodeid):
                '''
                Transforms values from probably string/wkt representation to specified datatype in arches.
                This code could probably move to somehwere where it can be accessed by other importers.
                Returns {'value': transformed-or-None, 'request': ''}; validation
                ERRORs null the value and are appended to self.errors.
                '''
                request = ''
                if datatype != '':
                    errors = []
                    datatype_instance = datatype_factory.get_instance(datatype)
                    if datatype in ['concept', 'domain-value', 'concept-list', 'domain-value-list']:
                        # non-UUID values are labels — look up the labelid in the node's collection
                        try:
                            uuid.UUID(value)
                        except:
                            if datatype in ['domain-value', 'domain-value-list']:
                                collection_id = nodeid
                            else:
                                collection_id = Node.objects.get(nodeid=nodeid).config['rdmCollection']
                            if collection_id != None:
                                value = concept_lookup.lookup_labelid_from_label(value, collection_id)
                    try:
                        value = datatype_instance.transform_import_values(value, nodeid)
                        errors = datatype_instance.validate(value, row_number, source)
                    except Exception as e:
                        errors.append({'type': 'ERROR', 'message': 'datatype: {0} value: {1} {2} - {3}'.format(datatype_instance.datatype_model.classname, value, source, str(e) + ' or is not a prefLabel in the given collection.')})
                    if len(errors) > 0:
                        error_types = [error['type'] for error in errors]
                        if 'ERROR' in error_types:
                            value = None
                        self.errors += errors
                else:
                    print _('No datatype detected for {0}'.format(value))
                return {'value': value, 'request': request}

            def get_blank_tile(source_data):
                # Fetch (and cache) the prototype blank tile able to hold the
                # first nodeid in source_data; returns a fast deep copy.
                if len(source_data) > 0:
                    if source_data[0] != {}:
                        key = str(source_data[0].keys()[0])
                        if key not in blanktilecache:
                            blank_tile = Tile.get_blank_tile(key)
                            cache(blank_tile)
                        else:
                            blank_tile = blanktilecache[key]
                    else:
                        blank_tile = None
                else:
                    blank_tile = None
                # return deepcopy(blank_tile)
                # cPickle round-trip is a faster deep copy than copy.deepcopy
                return cPickle.loads(cPickle.dumps(blank_tile, -1))

            def check_required_nodes(tile, parent_tile, required_nodes, all_nodes):
                # Check that each required node in a tile is populated.
                # If one is empty, the whole parent_tile is removed from
                # populated_tiles and a WARNING is recorded; recurses into child tiles.
                errors = []
                if len(required_nodes) > 0:
                    if bool(tile.data):
                        for target_k, target_v in tile.data.iteritems():
                            if target_k in required_nodes.keys() and target_v is None:
                                populated_tiles.pop(populated_tiles.index(parent_tile))
                                # NOTE(review): message reads the enclosing loop's
                                # target_tile, not the `tile` parameter — confirm intended
                                errors.append({'type': 'WARNING', 'message': 'The {0} node is required and must be populated in order to populate the {1} nodes. This data was not imported.'.format(required_nodes[target_k], ', '.join(all_nodes.filter(nodegroup_id=str(target_tile.nodegroup_id)).values_list('name', flat=True)))})
                    elif bool(tile.tiles):
                        for tile_k, tile_v in tile.tiles.iteritems():
                            if len(tile_v) > 0:
                                for t in tile_v:
                                    check_required_nodes(t, parent_tile, required_nodes, all_nodes)
                if len(errors) > 0:
                    self.errors += errors

            # Main import loop: accumulate tiles per resource, saving each
            # resource when the ResourceID changes between rows.
            resources = []

            for row_number, row in enumerate(business_data):
                row_number = 'on line ' + unicode(row_number + 2) #to represent the row in a csv accounting for the header and 0 index
                if row['ResourceID'] != previous_row_resourceid and previous_row_resourceid is not None:
                    save_count = save_count + 1
                    self.save_resource(populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count)

                    # reset values for next resource instance
                    populated_tiles = []
                    resourceinstanceid = process_resourceid(row['ResourceID'], overwrite)
                    populated_nodegroups[resourceinstanceid] = []

                source_data = column_names_to_targetids(row, mapping, row_number)

                # warn when a node feeding the display function is absent from this row
                # NOTE(review): the [0] index raises IndexError when source_data is
                # empty (caught only by the outer try) — confirm acceptable
                missing_display_nodes = [n for n in display_nodes if n not in [list(b) for b in zip(*[a.keys() for a in source_data])][0]]
                if len(missing_display_nodes) > 0:
                    errors = []
                    for mdn in missing_display_nodes:
                        mdn_name = all_nodes.filter(nodeid=mdn).values_list('name', flat=True)[0]
                        errors.append({'type': 'WARNING', 'message': '{0} {1} is null or not mapped and participates in a {2} display value function.'.format(mdn_name, row_number, mapping['resource_model_name'])})
                    if len(errors) > 0:
                        self.errors += errors

                if len(source_data) > 0:
                    if source_data[0].keys():
                        try:
                            target_resource_model = all_nodes.get(nodeid=source_data[0].keys()[0]).graph_id
                        except:
                            print '*'*80
                            print 'ERROR: No resource model found. Please make sure the resource model this business data is mapped to has been imported into Arches.'
                            print '*'*80
                            sys.exit()

                    target_tile = get_blank_tile(source_data)
                    if 'TileID' in row and row['TileID'] is not None:
                        target_tile.tileid = row['TileID']
                    if 'NodeGroupID' in row and row['NodeGroupID'] is not None:
                        target_tile.nodegroupid = row['NodeGroupID']

                    def populate_tile(source_data, target_tile):
                        '''
                        source_data = [{nodeid:value},{nodeid:value},{nodeid:value} . . .]
                        All nodes in source_data belong to the same resource.
                        A dictionary of nodeids would not allow for multiple values for the same nodeid.
                        Grouping is enforced by having all grouped attributes in the same row.

                        Consumes matched entries out of source_data in place and
                        recurses on a fresh blank tile while data remains.
                        '''
                        need_new_tile = False
                        # Set target tileid to None because this will be a new tile, a new tileid will be created on save.
                        target_tile.tileid = uuid.uuid4()
                        if 'TileID' in row and row['TileID'] is not None:
                            target_tile.tileid = row['TileID']
                        target_tile.resourceinstance_id = resourceinstanceid
                        # Check the cardinality of the tile and check if it has been populated.
                        # If cardinality is one and the tile is populated the tile should not be populated again.
                        # NOTE(review): 'TileiD' looks like a typo for 'TileID' — confirm
                        if str(target_tile.nodegroup_id) in single_cardinality_nodegroups and 'TileiD' not in row:
                            target_tile_cardinality = '1'
                        else:
                            target_tile_cardinality = 'n'

                        if str(target_tile.nodegroup_id) not in populated_nodegroups[resourceinstanceid]:
                            # Check if we are populating a parent tile by inspecting the target_tile.data array.
                            if target_tile.data != {}:
                                # Iterate through the target_tile nodes and begin populating by iterating througth source_data array.
                                # The idea is to populate as much of the target_tile as possible, before moving on to the next target_tile.
                                for target_key in target_tile.data.keys():
                                    for source_tile in source_data:
                                        for source_key in source_tile.keys():
                                            # Check for source and target key match.
                                            if source_key == target_key:
                                                if target_tile.data[source_key] == None:
                                                    # If match populate target_tile node with transformed value.
                                                    value = transform_value(node_datatypes[source_key], source_tile[source_key], row_number, source_key)
                                                    target_tile.data[source_key] = value['value']
                                                    # target_tile.request = value['request']
                                                    # Delete key from source_tile so we do not populate another tile based on the same data.
                                                    del source_tile[source_key]
                                # Cleanup source_data array to remove source_tiles that are now '{}' from the code above.
                                source_data[:] = [item for item in source_data if item != {}]

                            # Check if we are populating a child tile(s) by inspecting the target_tiles.tiles array.
                            elif target_tile.tiles != None:
                                populated_child_nodegroups = []
                                for nodegroupid, childtile in target_tile.tiles.iteritems():
                                    prototype_tile = childtile.pop()
                                    if str(prototype_tile.nodegroup_id) in single_cardinality_nodegroups:
                                        child_tile_cardinality = '1'
                                    else:
                                        child_tile_cardinality = 'n'

                                    def populate_child_tiles(source_data):
                                        # Fill a deep copy of the prototype child tile from
                                        # source_data; recurses when a slot is already taken.
                                        prototype_tile_copy = cPickle.loads(cPickle.dumps(prototype_tile, -1))
                                        tileid = row['TileID'] if 'TileID' in row else uuid.uuid4()
                                        prototype_tile_copy.tileid = tileid
                                        prototype_tile_copy.parenttile = target_tile
                                        parenttileid = row['ParentTileID'] if 'ParentTileID' in row and row['ParentTileID'] is not None else None
                                        if parenttileid is not None:
                                            prototype_tile_copy.parenttile.tileid = parenttileid
                                        prototype_tile_copy.resourceinstance_id = resourceinstanceid
                                        if str(prototype_tile_copy.nodegroup_id) not in populated_child_nodegroups:
                                            for target_key in prototype_tile_copy.data.keys():
                                                for source_column in source_data:
                                                    for source_key in source_column.keys():
                                                        if source_key == target_key:
                                                            if prototype_tile_copy.data[source_key] == None:
                                                                value = transform_value(node_datatypes[source_key], source_column[source_key], row_number, source_key)
                                                                prototype_tile_copy.data[source_key] = value['value']
                                                                # print prototype_tile_copy.data[source_key]
                                                                # print '&'*80
                                                                # target_tile.request = value['request']
                                                                del source_column[source_key]
                                                            else:
                                                                populate_child_tiles(source_data)

                                        if prototype_tile_copy.data != {}:
                                            if len([item for item in prototype_tile_copy.data.values() if item != None]) > 0:
                                                if str(prototype_tile_copy.nodegroup_id) not in populated_child_nodegroups:
                                                    childtile.append(prototype_tile_copy)

                                        if prototype_tile_copy != None:
                                            if child_tile_cardinality == '1' and 'NodeGroupID' not in row:
                                                populated_child_nodegroups.append(str(prototype_tile_copy.nodegroup_id))

                                        source_data[:] = [item for item in source_data if item != {}]

                                    populate_child_tiles(source_data)

                            if not target_tile.is_blank():
                                populated_tiles.append(target_tile)

                            if len(source_data)>0:
                                need_new_tile = True

                            if target_tile_cardinality == '1' and 'NodeGroupID' not in row:
                                populated_nodegroups[resourceinstanceid].append(str(target_tile.nodegroup_id))

                            if need_new_tile:
                                new_tile = get_blank_tile(source_data)
                                if new_tile != None:
                                    populate_tile(source_data, new_tile)

                    # mock_request_object = HttpRequest()

                    if target_tile != None and len(source_data) > 0:
                        populate_tile(source_data, target_tile)
                        # Check that required nodes are populated. If not remove tile from populated_tiles array.
                        check_required_nodes(target_tile, target_tile, required_nodes, all_nodes)

                previous_row_resourceid = row['ResourceID']
                legacyid = row['ResourceID']

            # save the last resource (legacyid only exists if at least one row was processed)
            if 'legacyid' in locals():
                self.save_resource(populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count)

            if bulk:
                Resource.bulk_save(resources=resources)

            print _('%s total resource saved' % (save_count + 1))

    except Exception as e:
        # print the traceback rather than propagating; the atomic block has
        # already rolled the transaction back at this point
        exc_type, exc_value, exc_traceback = sys.exc_info()
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        if len(formatted):
            for message in formatted:
                print message
    finally:
        pass
def import_business_data(self, business_data=None, mapping=None, overwrite='append', bulk=False): # errors = businessDataValidator(self.business_data) def process_resourceid(resourceid, overwrite): # Test if resourceid is a UUID. try: resourceinstanceid = uuid.UUID(resourceid) # If resourceid is a UUID check if it is already an arches resource. try: ret = Resource.objects.filter( resourceinstanceid=resourceid) # If resourceid is an arches resource and overwrite is true, delete the existing arches resource. if overwrite == 'overwrite': Resource.objects.get( pk=str(ret[0].resourceinstanceid)).delete() resourceinstanceid = resourceinstanceid # If resourceid is not a UUID create one. except: resourceinstanceid = resourceinstanceid except: # Get resources with the given legacyid ret = Resource.objects.filter(legacyid=resourceid) # If more than one resource is returned than make resource = None. This should never actually happen. if len(ret) > 1: resourceinstanceid = None # If no resource is returned with the given legacyid then create an archesid for the resource. elif len(ret) == 0: resourceinstanceid = uuid.uuid4() # If a resource is returned with the give legacyid then return its archesid else: if overwrite == 'overwrite': Resource.objects.get( pk=str(ret[0].resourceinstanceid)).delete() resourceinstanceid = ret[0].resourceinstanceid return resourceinstanceid try: with transaction.atomic(): save_count = 0 try: resourceinstanceid = process_resourceid( business_data[0]['ResourceID'], overwrite) except KeyError: print '*' * 80 print 'ERROR: No column \'ResourceID\' found in business data file. Please add a \'ResourceID\' column with a unique resource identifier.' 
print '*' * 80 sys.exit() blanktilecache = {} populated_nodegroups = {} populated_nodegroups[resourceinstanceid] = [] previous_row_resourceid = None populated_tiles = [] target_resource_model = None single_cardinality_nodegroups = [ str(nodegroupid) for nodegroupid in NodeGroup.objects.values_list( 'nodegroupid', flat=True).filter(cardinality='1') ] node_datatypes = { str(nodeid): datatype for nodeid, datatype in Node.objects.values_list( 'nodeid', 'datatype').filter(~Q(datatype='semantic'), graph__isresource=True) } all_nodes = Node.objects.all() datatype_factory = DataTypeFactory() concept_lookup = ConceptLookup() new_concepts = {} required_nodes = {} for node in Node.objects.filter(isrequired=True).values_list( 'nodeid', 'name'): required_nodes[str(node[0])] = node[1] # This code can probably be moved into it's own module. resourceids = [] non_contiguous_resource_ids = [] previous_row_for_validation = None for row_number, row in enumerate(business_data): # Check contiguousness of csv file. if row['ResourceID'] != previous_row_for_validation and row[ 'ResourceID'] in resourceids: non_contiguous_resource_ids.append(row['ResourceID']) else: resourceids.append(row['ResourceID']) previous_row_for_validation = row['ResourceID'] if len(non_contiguous_resource_ids) > 0: print '*' * 80 for non_contiguous_resource_id in non_contiguous_resource_ids: print 'ResourceID: ' + non_contiguous_resource_id print 'ERROR: The preceding ResourceIDs are non-contiguous in your csv file. Please sort your csv file by ResourceID and try import again.' 
print '*' * 80 sys.exit() def cache(blank_tile): if blank_tile.data != {}: for key in blank_tile.data.keys(): if key not in blanktilecache: blanktilecache[str(key)] = blank_tile else: for nodegroup, tile in blank_tile.tiles.iteritems(): for key in tile[0].data.keys(): if key not in blanktilecache: blanktilecache[str(key)] = blank_tile def column_names_to_targetids(row, mapping, row_number): errors = [] new_row = [] if 'ADDITIONAL' in row or 'MISSING' in row: errors.append({ 'type': 'WARNING', 'message': 'No resource created for ResourceID {0}. Line {1} has additional or missing columns.' .format(row['ResourceID'], str(int(row_number.split('on line ')[1]))) }) if len(errors) > 0: self.errors += errors for key, value in row.iteritems(): if value != '': for row in mapping['nodes']: if key.upper() == row['file_field_name'].upper( ): new_row.append( {row['arches_nodeid']: value}) return new_row def transform_value(datatype, value, source, nodeid): ''' Transforms values from probably string/wkt representation to specified datatype in arches. This code could probably move to somehwere where it can be accessed by other importers. 
''' request = '' if datatype != '': errors = [] datatype_instance = datatype_factory.get_instance( datatype) if datatype in [ 'concept', 'domain-value', 'concept-list', 'domain-value-list' ]: try: uuid.UUID(value) except: if datatype in [ 'domain-value', 'domain-value-list' ]: collection_id = nodeid else: collection_id = Node.objects.get( nodeid=nodeid).config['rdmCollection'] if collection_id != None: value = concept_lookup.lookup_labelid_from_label( value, collection_id) try: value = datatype_instance.transform_import_values( value, nodeid) errors = datatype_instance.validate(value, source) except Exception as e: errors.append({ 'type': 'ERROR', 'message': 'datatype: {0} value: {1} {2} - {3}'.format( datatype_instance.datatype_model.classname, value, source, e) }) if len(errors) > 0: value = None self.errors += errors else: print _('No datatype detected for {0}'.format(value)) return {'value': value, 'request': request} def get_blank_tile(source_data): if len(source_data) > 0: if source_data[0] != {}: key = str(source_data[0].keys()[0]) if key not in blanktilecache: blank_tile = Tile.get_blank_tile(key) cache(blank_tile) else: blank_tile = blanktilecache[key] else: blank_tile = None else: blank_tile = None # return deepcopy(blank_tile) return cPickle.loads(cPickle.dumps(blank_tile, -1)) def check_required_nodes(tile, required_nodes, all_nodes): # Check that each required node in a tile is populated. errors = [] if len(required_nodes) > 0: if target_tile.data != {}: for target_k, target_v in target_tile.data.iteritems( ): if target_k in required_nodes.keys( ) and target_v is None: populated_tiles.pop( populated_tiles.index(target_tile)) errors.append({ 'type': 'WARNING', 'message': 'The {0} node is required and must be populated in order to populate the {1} nodes. This data was not imported.' .format( required_nodes[target_k], ', '.join( all_nodes. 
filter(nodegroup_id=str( target_tile.nodegroup_id )).values_list('name', flat=True))) }) elif target_tile.tiles != None: for tile in tiles: check_required_nodes(tile) if len(errors) > 0: self.errors += errors resources = [] for row_number, row in enumerate(business_data): row_number = 'on line ' + unicode( row_number + 2 ) #to represent the row in a csv accounting for the header and 0 index if row['ResourceID'] != previous_row_resourceid and previous_row_resourceid is not None: save_count = save_count + 1 self.save_resource(populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count) # reset values for next resource instance populated_tiles = [] resourceinstanceid = process_resourceid( row['ResourceID'], overwrite) populated_nodegroups[resourceinstanceid] = [] source_data = column_names_to_targetids( row, mapping, row_number) if len(source_data) > 0: if source_data[0].keys(): try: target_resource_model = all_nodes.get( nodeid=source_data[0].keys()[0]).graph_id except: print '*' * 80 print 'ERROR: No resource model found. Please make sure the resource model this business data is mapped to has been imported into Arches.' print '*' * 80 sys.exit() target_tile = get_blank_tile(source_data) def populate_tile(source_data, target_tile): ''' source_data = [{nodeid:value},{nodeid:value},{nodeid:value} . . .] All nodes in source_data belong to the same resource. A dictionary of nodeids would not allow for multiple values for the same nodeid. Grouping is enforced by having all grouped attributes in the same row. ''' need_new_tile = False # Set target tileid to None because this will be a new tile, a new tileid will be created on save. target_tile.tileid = uuid.uuid4() target_tile.resourceinstance_id = resourceinstanceid # Check the cardinality of the tile and check if it has been populated. # If cardinality is one and the tile is populated the tile should not be populated again. 
if str(target_tile.nodegroup_id ) in single_cardinality_nodegroups: target_tile_cardinality = '1' else: target_tile_cardinality = 'n' if str( target_tile.nodegroup_id ) not in populated_nodegroups[resourceinstanceid]: # Check if we are populating a parent tile by inspecting the target_tile.data array. if target_tile.data != {}: # Iterate through the target_tile nodes and begin populating by iterating througth source_data array. # The idea is to populate as much of the target_tile as possible, before moving on to the next target_tile. for target_key in target_tile.data.keys(): for source_tile in source_data: for source_key in source_tile.keys( ): # Check for source and target key match. if source_key == target_key: if target_tile.data[ source_key] == None: # If match populate target_tile node with transformed value. value = transform_value( node_datatypes[ source_key], source_tile[ source_key], row_number, source_key) target_tile.data[ source_key] = value[ 'value'] # target_tile.request = value['request'] # Delete key from source_tile so we do not populate another tile based on the same data. del source_tile[ source_key] # Cleanup source_data array to remove source_tiles that are now '{}' from the code above. source_data[:] = [ item for item in source_data if item != {} ] # Check if we are populating a child tile(s) by inspecting the target_tiles.tiles array. elif target_tile.tiles != None: populated_child_nodegroups = [] for nodegroupid, childtile in target_tile.tiles.iteritems( ): prototype_tile = childtile.pop() if str( prototype_tile.nodegroup_id ) in single_cardinality_nodegroups: child_tile_cardinality = '1' else: child_tile_cardinality = 'n' def populate_child_tiles(source_data): prototype_tile_copy = cPickle.loads( cPickle.dumps( prototype_tile, -1)) prototype_tile_copy.tileid = uuid.uuid4( ) prototype_tile_copy.parenttile = target_tile prototype_tile_copy.resourceinstance_id = resourceinstanceid if str( prototype_tile_copy. 
nodegroup_id ) not in populated_child_nodegroups: for target_key in prototype_tile_copy.data.keys( ): for source_column in source_data: for source_key in source_column.keys( ): if source_key == target_key: if prototype_tile_copy.data[ source_key] == None: value = transform_value( node_datatypes[ source_key], source_column[ source_key], row_number, source_key ) prototype_tile_copy.data[ source_key] = value[ 'value'] # target_tile.request = value['request'] del source_column[ source_key] else: populate_child_tiles( source_data ) if prototype_tile_copy.data != {}: if len([ item for item in prototype_tile_copy. data.values() if item != None ]) > 0: if str( prototype_tile_copy .nodegroup_id ) not in populated_child_nodegroups: childtile.append( prototype_tile_copy ) if prototype_tile_copy != None: if child_tile_cardinality == '1': populated_child_nodegroups.append( str(prototype_tile_copy .nodegroup_id)) source_data[:] = [ item for item in source_data if item != {} ] populate_child_tiles(source_data) if not target_tile.is_blank(): populated_tiles.append(target_tile) if len(source_data) > 0: need_new_tile = True if target_tile_cardinality == '1': populated_nodegroups[ resourceinstanceid].append( str(target_tile.nodegroup_id)) if need_new_tile: new_tile = get_blank_tile(source_data) if new_tile != None: populate_tile(source_data, new_tile) # mock_request_object = HttpRequest() if target_tile != None and len(source_data) > 0: populate_tile(source_data, target_tile) # Check that required nodes are populated. If not remove tile from populated_tiles array. 
check_required_nodes(target_tile, required_nodes, all_nodes) previous_row_resourceid = row['ResourceID'] legacyid = row['ResourceID'] if 'legacyid' in locals(): self.save_resource(populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count) if bulk: Resource.bulk_save(resources=resources) print _('%s total resource saved' % (save_count + 1)) except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() formatted = traceback.format_exception(exc_type, exc_value, exc_traceback) if len(formatted): for message in formatted: print message finally: pass
def import_business_data(self, business_data=None, mapping=None, overwrite='append', bulk=False):
    """
    Import rows of CSV business data as Arches resource instances and tiles.

    Keyword arguments:
    business_data -- list of row dicts (csv.DictReader style); each row must
        carry a 'ResourceID' column, and rows belonging to the same resource
        must be contiguous in the file (validated below).
    mapping -- mapping-file dict; mapping['nodes'] relates 'file_field_name'
        column names to 'arches_nodeid' target node ids.
    overwrite -- 'append' (default) leaves existing resources in place;
        'overwrite' deletes a pre-existing resource with the same id/legacyid
        before importing.
    bulk -- when True, resources are accumulated and saved in batches via
        Resource.bulk_save (see self.save_resource).

    The whole import runs inside one transaction.atomic() block, so an
    exception raised while importing rolls back everything saved so far.
    Validation problems are accumulated on self.errors by the nested helpers.
    """
    # errors = businessDataValidator(self.business_data)

    def process_resourceid(resourceid, overwrite):
        """Resolve the csv 'ResourceID' value to a resource instance UUID.

        If resourceid parses as a UUID it is used directly; otherwise it is
        treated as a legacyid and looked up (or a fresh UUID is minted).
        With overwrite == 'overwrite', a matching pre-existing resource is
        deleted first.
        """
        # Test if resourceid is a UUID.
        try:
            resourceinstanceid = uuid.UUID(resourceid)
            # If resourceid is a UUID check if it is already an arches resource.
            try:
                ret = Resource.objects.filter(resourceinstanceid=resourceid)
                # If resourceid is an arches resource and overwrite is true, delete the existing arches resource.
                if overwrite == 'overwrite':
                    # NOTE(review): this constructs a new Resource(pk) and deletes it
                    # rather than deleting the fetched instance; a later revision of
                    # this importer uses Resource.objects.get(pk=...).delete() -- confirm.
                    Resource(str(ret[0].resourceinstanceid)).delete()
                resourceinstanceid = resourceinstanceid
            # If resourceid is not a UUID create one.
            # NOTE(review): bare except silently swallows any lookup/delete error here.
            except:
                resourceinstanceid = resourceinstanceid
        except:
            # resourceid is not a UUID: treat it as a legacyid.
            # Get resources with the given legacyid
            ret = Resource.objects.filter(legacyid=resourceid)
            # If more than one resource is returned than make resource = None. This should never actually happen.
            if len(ret) > 1:
                resourceinstanceid = None
            # If no resource is returned with the given legacyid then create an archesid for the resource.
            elif len(ret) == 0:
                resourceinstanceid = uuid.uuid4()
            # If a resource is returned with the give legacyid then return its archesid
            else:
                if overwrite == 'overwrite':
                    Resource(str(ret[0].resourceinstanceid)).delete()
                resourceinstanceid = ret[0].resourceinstanceid
        return resourceinstanceid

    try:
        # Everything below commits or rolls back as a single unit.
        with transaction.atomic():
            save_count = 0
            try:
                # Seed the resource id from the first row; also serves as the
                # check that the required 'ResourceID' column exists at all.
                resourceinstanceid = process_resourceid(business_data[0]['ResourceID'], overwrite)
            except KeyError:
                print '*'*80
                print 'ERROR: No column \'ResourceID\' found in business data file. Please add a \'ResourceID\' column with a unique resource identifier.'
                print '*'*80
                sys.exit()

            # Per-import working state shared (as closures) by the helpers below.
            blanktilecache = {}                      # nodeid (str) -> prototype blank Tile
            populated_nodegroups = {}                # resourceinstanceid -> nodegroup ids already filled
            populated_nodegroups[resourceinstanceid] = []
            previous_row_resourceid = None
            populated_tiles = []                     # tiles accumulated for the current resource
            # Nodegroups with cardinality '1' may only be populated once per resource.
            single_cardinality_nodegroups = [str(nodegroupid) for nodegroupid in NodeGroup.objects.values_list('nodegroupid', flat=True).filter(cardinality = '1')]
            # nodeid (str) -> datatype name, for every non-semantic node on resource graphs.
            node_datatypes = {str(nodeid): datatype for nodeid, datatype in Node.objects.values_list('nodeid', 'datatype').filter(~Q(datatype='semantic'), graph__isresource=True)}
            all_nodes = Node.objects.all()
            datatype_factory = DataTypeFactory()

            # This code can probably be moved into it's own module.
            # Pre-scan: all rows for one ResourceID must be contiguous, because the
            # import loop saves a resource as soon as the ResourceID changes.
            resourceids = []
            non_contiguous_resource_ids = []
            previous_row_for_validation = None
            for row_number, row in enumerate(business_data):
                # Check contiguousness of csv file.
                if row['ResourceID'] != previous_row_for_validation and row['ResourceID'] in resourceids:
                    non_contiguous_resource_ids.append(row['ResourceID'])
                else:
                    resourceids.append(row['ResourceID'])
                previous_row_for_validation = row['ResourceID']
            if len(non_contiguous_resource_ids) > 0:
                print '*'*80
                print 'ERROR: Resources in your csv file are non-contiguous. Please sort your csv file by ResourceID and try import again.'
                print '*'*80
                sys.exit()

            def cache(blank_tile):
                """Index a blank prototype tile in blanktilecache by every nodeid it
                (or, for parent tiles, its first child per nodegroup) contains."""
                if blank_tile.data != {}:
                    for key in blank_tile.data.keys():
                        if key not in blanktilecache:
                            blanktilecache[str(key)] = blank_tile
                else:
                    # Parent tile: index by the nodes of each child tile instead.
                    for nodegroup, tile in blank_tile.tiles.iteritems():
                        for key in tile[0].data.keys():
                            if key not in blanktilecache:
                                blanktilecache[str(key)] = blank_tile

            def column_names_to_targetids(row, mapping, row_number):
                """Translate a csv row into [{arches_nodeid: value}, ...] using the
                mapping file; warns (via self.errors) on ADDITIONAL/MISSING columns
                flagged upstream and skips empty values."""
                errors = []
                new_row = []
                if 'ADDITIONAL' in row or 'MISSING' in row:
                    errors.append({'type': 'WARNING', 'message': 'No resource created for ResourceID {0}. Line {1} has additional or missing columns.'.format(row['ResourceID'], str(int(row_number.split('on line ')[1])))})
                    if len(errors) > 0:
                        self.errors += errors
                for key, value in row.iteritems():
                    if value != '':
                        # NOTE(review): the loop variable shadows the outer 'row' dict;
                        # harmless as written but fragile -- confirm before refactoring.
                        for row in mapping['nodes']:
                            if key.upper() == row['file_field_name'].upper():
                                new_row.append({row['arches_nodeid']: value})
                return new_row

            def transform_value(datatype, value, source):
                '''
                Transforms values from probably string/wkt representation to specified datatype in arches.
                This code could probably move to somehwere where it can be accessed by other importers.
                Returns {'value': transformed value, 'request': ''}; validation errors
                are appended to self.errors.
                '''
                request = ''
                if datatype != '':
                    errors = []
                    datatype_instance = datatype_factory.get_instance(datatype)
                    try:
                        value = datatype_instance.transform_import_values(value)
                        errors = datatype_instance.validate(value, source)
                    except Exception as e:
                        errors.append({'type': 'ERROR', 'message': 'datatype: {0} value: {1} {2} - {3}'.format(datatype_instance.datatype_model.classname, value, source, e)})
                    if len(errors) > 0:
                        self.errors += errors
                else:
                    print _('No datatype detected for {0}'.format(value))
                return {'value': value, 'request': request}

            def get_blank_tile(source_data):
                """Return a deep copy of the blank prototype tile that can hold the
                first nodeid in source_data (cached in blanktilecache), or None when
                source_data is empty."""
                if len(source_data) > 0:
                    if source_data[0] != {}:
                        key = str(source_data[0].keys()[0])
                        if key not in blanktilecache:
                            blank_tile = Tile.get_blank_tile(key)
                            cache(blank_tile)
                        else:
                            blank_tile = blanktilecache[key]
                    else:
                        blank_tile = None
                else:
                    blank_tile = None
                # return deepcopy(blank_tile)
                # cPickle round-trip is used as a faster deep copy of the prototype.
                return cPickle.loads(cPickle.dumps(blank_tile, -1))

            resources = []

            # Main import loop: accumulate tiles per resource; when the ResourceID
            # changes, save the finished resource and reset the working state.
            for row_number, row in enumerate(business_data):
                row_number = 'on line ' + unicode(row_number + 2) #to represent the row in a csv accounting for the header and 0 index
                if row['ResourceID'] != previous_row_resourceid and previous_row_resourceid is not None:
                    save_count = save_count + 1
                    self.save_resource(populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count)

                    # reset values for next resource instance
                    populated_tiles = []
                    resourceinstanceid = process_resourceid(row['ResourceID'], overwrite)
                    populated_nodegroups[resourceinstanceid] = []

                source_data = column_names_to_targetids(row, mapping, row_number)

                if len(source_data) > 0:
                    if source_data[0].keys():
                        try:
                            # The resource model is inferred from the graph of the
                            # first mapped node in the row.
                            target_resource_model = all_nodes.get(nodeid=source_data[0].keys()[0]).graph_id
                        except:
                            print '*'*80
                            print 'ERROR: No resource model found. Please make sure the resource model this business data is mapped to has been imported into Arches.'
                            print '*'*80
                            sys.exit()

                    target_tile = get_blank_tile(source_data)

                    def populate_tile(source_data, target_tile):
                        '''
                        source_data = [{nodeid:value},{nodeid:value},{nodeid:value} . . .]
                        All nodes in source_data belong to the same resource.
                        A dictionary of nodeids would not allow for multiple values for the same nodeid.
                        Grouping is enforced by having all grouped attributes in the same row.

                        Consumes entries from source_data in place (and compacts the
                        list); appends finished tiles to populated_tiles. Recurses on a
                        fresh blank tile while unconsumed source data remains.
                        '''
                        need_new_tile = False
                        # Set target tileid to None because this will be a new tile, a new tileid will be created on save.
                        target_tile.tileid = uuid.uuid4()
                        target_tile.resourceinstance_id = resourceinstanceid
                        # Check the cardinality of the tile and check if it has been populated.
                        # If cardinality is one and the tile is populated the tile should not be populated again.
                        if str(target_tile.nodegroup_id) in single_cardinality_nodegroups:
                            target_tile_cardinality = '1'
                        else:
                            target_tile_cardinality = 'n'

                        if str(target_tile.nodegroup_id) not in populated_nodegroups[resourceinstanceid]:
                            # Check if we are populating a parent tile by inspecting the target_tile.data array.
                            if target_tile.data != {}:
                                # Iterate through the target_tile nodes and begin populating by iterating througth source_data array.
                                # The idea is to populate as much of the target_tile as possible, before moving on to the next target_tile.
                                for target_key in target_tile.data.keys():
                                    for source_tile in source_data:
                                        for source_key in source_tile.keys():
                                            # Check for source and target key match.
                                            if source_key == target_key:
                                                if target_tile.data[source_key] == None:
                                                    # If match populate target_tile node with transformed value.
                                                    value = transform_value(node_datatypes[source_key], source_tile[source_key], row_number)
                                                    target_tile.data[source_key] = value['value']
                                                    # target_tile.request = value['request']
                                                    # Delete key from source_tile so we do not populate another tile based on the same data.
                                                    del source_tile[source_key]
                                # Cleanup source_data array to remove source_tiles that are now '{}' from the code above.
                                source_data[:] = [item for item in source_data if item != {}]
                            # Check if we are populating a child tile(s) by inspecting the target_tiles.tiles array.
                            elif target_tile.tiles != None:
                                populated_child_nodegroups = []
                                for nodegroupid, childtile in target_tile.tiles.iteritems():
                                    # The blank child tile acts as the prototype for
                                    # every child tile created in this nodegroup.
                                    prototype_tile = childtile.pop()
                                    if str(prototype_tile.nodegroup_id) in single_cardinality_nodegroups:
                                        child_tile_cardinality = '1'
                                    else:
                                        child_tile_cardinality = 'n'

                                    def populate_child_tiles(source_data):
                                        """Copy the prototype child tile, fill it from source_data,
                                        and recurse while matching data remains (cardinality 'n')."""
                                        # cPickle round-trip = fast deep copy of the prototype.
                                        prototype_tile_copy = cPickle.loads(cPickle.dumps(prototype_tile, -1))
                                        prototype_tile_copy.tileid = uuid.uuid4()
                                        prototype_tile_copy.parenttile = target_tile
                                        prototype_tile_copy.resourceinstance_id = resourceinstanceid
                                        if str(prototype_tile_copy.nodegroup_id) not in populated_child_nodegroups:
                                            for target_key in prototype_tile_copy.data.keys():
                                                for source_column in source_data:
                                                    for source_key in source_column.keys():
                                                        if source_key == target_key:
                                                            if prototype_tile_copy.data[source_key] == None:
                                                                value = transform_value(node_datatypes[source_key], source_column[source_key], row_number)
                                                                prototype_tile_copy.data[source_key] = value['value']
                                                                # target_tile.request = value['request']
                                                                # Consume the value so it is not reused.
                                                                del source_column[source_key]
                                                            else:
                                                                # Node already filled on this copy: start another child tile.
                                                                populate_child_tiles(source_data)
                                        # Keep the copy only if it actually holds data.
                                        if prototype_tile_copy.data != {}:
                                            if len([item for item in prototype_tile_copy.data.values() if item != None]) > 0:
                                                if str(prototype_tile_copy.nodegroup_id) not in populated_child_nodegroups:
                                                    childtile.append(prototype_tile_copy)
                                        if prototype_tile_copy != None:
                                            if child_tile_cardinality == '1':
                                                populated_child_nodegroups.append(str(prototype_tile_copy.nodegroup_id))
                                        # Drop source entries that are now empty.
                                        source_data[:] = [item for item in source_data if item != {}]

                                    populate_child_tiles(source_data)

                            # NOTE(review): the tile is appended unconditionally here; the later
                            # revision of this importer guards with target_tile.is_blank() -- confirm.
                            populated_tiles.append(target_tile)

                            if len(source_data)>0:
                                # Leftover data means this row needs another tile.
                                need_new_tile = True

                            if target_tile_cardinality == '1':
                                populated_nodegroups[resourceinstanceid].append(str(target_tile.nodegroup_id))

                            if need_new_tile:
                                new_tile = get_blank_tile(source_data)
                                if new_tile != None:
                                    populate_tile(source_data, new_tile)

                    # mock_request_object = HttpRequest()

                    if target_tile != None and len(source_data) > 0:
                        populate_tile(source_data, target_tile)

                previous_row_resourceid = row['ResourceID']
                legacyid = row['ResourceID']

            # Save the final resource (legacyid only exists if at least one row
            # was processed).
            if 'legacyid' in locals():
                self.save_resource(populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count)

            if bulk:
                # Flush any resources still queued from bulk batching.
                Resource.bulk_save(resources=resources)

            print _('%s total resource saved' % (save_count + 1))

    except Exception as e:
        # Print the traceback; since the exception escaped the atomic() block
        # above, the transaction has already been rolled back.
        exc_type, exc_value, exc_traceback = sys.exc_info()
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        if len(formatted):
            for message in formatted:
                print message

    finally:
        pass
def import_business_data(self, business_data=None, mapping=None, overwrite='append', bulk=False, create_concepts=False, create_collections=False): # errors = businessDataValidator(self.business_data) def get_display_nodes(graphid): display_nodeids = [] functions = FunctionXGraph.objects.filter(function_id='60000000-0000-0000-0000-000000000001', graph_id=graphid) for function in functions: f = function.config del f['triggering_nodegroups'] for k,v in f.iteritems(): v['node_ids'] = [] v['string_template'] = v['string_template'].replace('<', '').replace('>', '').split(', ') if 'nodegroup_id' in v and v['nodegroup_id'] != '': nodes = Node.objects.filter(nodegroup_id=v['nodegroup_id']) for node in nodes: if node.name in v['string_template']: display_nodeids.append(str(node.nodeid)) for k,v in f.iteritems(): if 'string_template' in v and v['string_template'] != ['']: print 'The {0} {1} in the {2} display function.'.format(', '.join(v['string_template']), 'nodes participate' if len(v['string_template']) > 1 else 'node participates', k) else: print 'No nodes participate in the {0} display function.'.format(k) return display_nodeids def process_resourceid(resourceid, overwrite): # Test if resourceid is a UUID. try: resourceinstanceid = uuid.UUID(resourceid) # If resourceid is a UUID check if it is already an arches resource. try: ret = Resource.objects.filter(resourceinstanceid=resourceid) # If resourceid is an arches resource and overwrite is true, delete the existing arches resource. if overwrite == 'overwrite': Resource.objects.get(pk=str(ret[0].resourceinstanceid)).delete() resourceinstanceid = resourceinstanceid # If resourceid is not a UUID create one. except: resourceinstanceid = resourceinstanceid except: # Get resources with the given legacyid ret = Resource.objects.filter(legacyid=resourceid) # If more than one resource is returned than make resource = None. This should never actually happen. 
if len(ret) > 1: resourceinstanceid = None # If no resource is returned with the given legacyid then create an archesid for the resource. elif len(ret) == 0: resourceinstanceid = uuid.uuid4() # If a resource is returned with the give legacyid then return its archesid else: if overwrite == 'overwrite': Resource.objects.get(pk=str(ret[0].resourceinstanceid)).delete() resourceinstanceid = ret[0].resourceinstanceid return resourceinstanceid try: with transaction.atomic(): save_count = 0 try: resourceinstanceid = process_resourceid(business_data[0]['ResourceID'], overwrite) except KeyError: print '*'*80 print 'ERROR: No column \'ResourceID\' found in business data file. Please add a \'ResourceID\' column with a unique resource identifier.' print '*'*80 sys.exit() blanktilecache = {} populated_nodegroups = {} populated_nodegroups[resourceinstanceid] = [] previous_row_resourceid = None populated_tiles = [] target_resource_model = None single_cardinality_nodegroups = [str(nodegroupid) for nodegroupid in NodeGroup.objects.values_list('nodegroupid', flat=True).filter(cardinality = '1')] node_datatypes = {str(nodeid): datatype for nodeid, datatype in Node.objects.values_list('nodeid', 'datatype').filter(~Q(datatype='semantic'), graph__isresource=True)} display_nodes = get_display_nodes(mapping['resource_model_id']) all_nodes = Node.objects.all() datatype_factory = DataTypeFactory() concepts_to_create = {} new_concepts = {} required_nodes = {} for node in Node.objects.filter(~Q(datatype='semantic'), isrequired=True, graph_id=mapping['resource_model_id']).values_list('nodeid', 'name'): required_nodes[str(node[0])] = node[1] # This code can probably be moved into it's own module. resourceids = [] non_contiguous_resource_ids = [] previous_row_for_validation = None for row_number, row in enumerate(business_data): # Check contiguousness of csv file. 
if row['ResourceID'] != previous_row_for_validation and row['ResourceID'] in resourceids: non_contiguous_resource_ids.append(row['ResourceID']) else: resourceids.append(row['ResourceID']) previous_row_for_validation = row['ResourceID'] if create_concepts == True: for node in mapping['nodes']: if node['data_type'] in ['concept', 'concept-list', 'domain-value', 'domain-value-list'] and node['file_field_name'] in row.keys(): # print row[node['file_field_name']] concept = [] for val in csv.reader([row[node['file_field_name']]], delimiter=',', quotechar='"'): concept.append(val) concept = concept[0] # check if collection is in concepts_to_create, add collection to concepts_to_create if it's not and add first child concept if node['arches_nodeid'] not in concepts_to_create: concepts_to_create[node['arches_nodeid']] = {} for concept_value in concept: concepts_to_create[node['arches_nodeid']][str(uuid.uuid4())] = concept_value # if collection in concepts to create then add child concept to collection elif row[node['file_field_name']] not in concepts_to_create[node['arches_nodeid']].values(): for concept_value in concept: concepts_to_create[node['arches_nodeid']][str(uuid.uuid4())] = concept_value if len(non_contiguous_resource_ids) > 0: print '*'*80 for non_contiguous_resource_id in non_contiguous_resource_ids: print 'ResourceID: ' + non_contiguous_resource_id print 'ERROR: The preceding ResourceIDs are non-contiguous in your csv file. Please sort your csv file by ResourceID and try import again.' 
print '*'*80 sys.exit() def create_reference_data(new_concepts, create_collections): errors = [] candidates = Concept().get(id='00000000-0000-0000-0000-000000000006') for arches_nodeid, concepts in new_concepts.iteritems(): collectionid = str(uuid.uuid4()) topconceptid = str(uuid.uuid4()) node = Node.objects.get(nodeid=arches_nodeid) # if node.datatype is concept or concept-list create concepts and collections if node.datatype in ['concept', 'concept-list']: # create collection if create_collections = create, otherwise append to collection already assigned to node if create_collections == True: collection_legacyoid = node.name + '_' + str(node.graph_id) + '_import' # check to see that there is not already a collection for this node if node.config['rdmCollection'] != None: errors.append({'type': 'WARNING', 'message': 'A collection already exists for the {0} node. Use the add option to add concepts to this collection.'.format(node.name)}) if len(errors) > 0: self.errors += errors collection = None else: # if there is no collection assigned to this node, create one and assign it to the node try: # check to see that a collection with this legacyid does not already exist collection = Concept().get(legacyoid=collection_legacyoid) errors.append({'type': 'WARNING', 'message': 'A collection with the legacyid {0} already exists.'.format(node.name + '_' + str(node.graph_id) + '_import')}) if len(errors) > 0: self.errors += errors except: collection = Concept({ 'id': collectionid, 'legacyoid': collection_legacyoid, 'nodetype': 'Collection' }) collection.addvalue({'id': str(uuid.uuid4()), 'value': node.name + '_import', 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel'}) node.config['rdmCollection'] = collectionid node.save() collection.save() else: # if create collection = add check that there is a collection associated with node, if no collection associated with node create a collection and associated with the node try: collection = 
Concept().get(id=node.config['rdmCollection']) except: collection = Concept({ 'id': collectionid, 'legacyoid': node.name + '_' + str(node.graph_id) + '_import', 'nodetype': 'Collection' }) collection.addvalue({'id': str(uuid.uuid4()), 'value': node.name + '_import', 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel'}) node.config['rdmCollection'] = collectionid node.save() collection.save() if collection != None: topconcept_legacyoid = node.name + '_' + str(node.graph_id) # Check if top concept already exists, if not create it and add to candidates scheme try: topconcept = Concept().get(legacyoid=topconcept_legacyoid) except: topconcept = Concept({ 'id': topconceptid, 'legacyoid': topconcept_legacyoid, 'nodetype': 'Concept' }) topconcept.addvalue({'id': str(uuid.uuid4()), 'value': node.name + '_import', 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel'}) topconcept.save() candidates.add_relation(topconcept, 'narrower') # create child concepts and relate to top concept and collection accordingly for conceptid, value in concepts.iteritems(): concept_legacyoid = value + '_' + node.name + '_' + str(node.graph_id) # check if concept already exists, if not create and add to topconcept and collection try: conceptid = [concept for concept in topconcept.get_child_concepts(topconcept.id) if concept[1] == value][0][0] concept = Concept().get(id=conceptid) except: concept = Concept({ 'id': conceptid, 'legacyoid': concept_legacyoid, 'nodetype': 'Concept' }) concept.addvalue({'id': str(uuid.uuid4()), 'value': value, 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel'}) concept.save() collection.add_relation(concept, 'member') topconcept.add_relation(concept, 'narrower') #if node.datatype is domain or domain-list create options array in node.config elif node.datatype in ['domain-value', 'domain-value-list']: for domainid, value in new_concepts[arches_nodeid].iteritems(): # check if value already exists in domain if value not in [t['text'] for t in 
node.config['options']]: domainvalue = { "text": value, "selected": False, "id": domainid } node.config['options'].append(domainvalue) node.save() if create_concepts == True: create_reference_data(concepts_to_create, create_collections) # if concepts are created on import concept_lookup must be instatiated afterward concept_lookup = ConceptLookup() def cache(blank_tile): if blank_tile.data != {}: for key in blank_tile.data.keys(): if key not in blanktilecache: blanktilecache[str(key)] = blank_tile else: for tile in blank_tile.tiles: for key in tile.data.keys(): if key not in blanktilecache: blanktilecache[str(key)] = blank_tile def column_names_to_targetids(row, mapping, row_number): errors = [] new_row = [] if 'ADDITIONAL' in row or 'MISSING' in row: errors.append({'type': 'WARNING', 'message': 'No resource created for ResourceID {0}. Line {1} has additional or missing columns.'.format(row['ResourceID'], str(int(row_number.split('on line ')[1])))}) if len(errors) > 0: self.errors += errors for key, value in row.iteritems(): if value != '': for row in mapping['nodes']: if key.upper() == row['file_field_name'].upper(): new_row.append({row['arches_nodeid']: value}) return new_row def transform_value(datatype, value, source, nodeid): ''' Transforms values from probably string/wkt representation to specified datatype in arches. This code could probably move to somehwere where it can be accessed by other importers. 
''' request = '' if datatype != '': errors = [] datatype_instance = datatype_factory.get_instance(datatype) if datatype in ['concept', 'domain-value', 'concept-list', 'domain-value-list']: try: uuid.UUID(value) except: if datatype in ['domain-value', 'domain-value-list']: collection_id = nodeid else: collection_id = Node.objects.get(nodeid=nodeid).config['rdmCollection'] if collection_id != None: value = concept_lookup.lookup_labelid_from_label(value, collection_id) try: value = datatype_instance.transform_import_values(value, nodeid) errors = datatype_instance.validate(value, row_number, source) except Exception as e: errors.append({'type': 'ERROR', 'message': 'datatype: {0} value: {1} {2} - {3}'.format(datatype_instance.datatype_model.classname, value, source, str(e) + ' or is not a prefLabel in the given collection.')}) if len(errors) > 0: error_types = [error['type'] for error in errors] if 'ERROR' in error_types: value = None self.errors += errors else: print _('No datatype detected for {0}'.format(value)) return {'value': value, 'request': request} def get_blank_tile(source_data): if len(source_data) > 0: if source_data[0] != {}: key = str(source_data[0].keys()[0]) if key not in blanktilecache: blank_tile = Tile.get_blank_tile(key) cache(blank_tile) else: blank_tile = blanktilecache[key] else: blank_tile = None else: blank_tile = None # return deepcopy(blank_tile) return cPickle.loads(cPickle.dumps(blank_tile, -1)) def check_required_nodes(tile, parent_tile, required_nodes, all_nodes): # Check that each required node in a tile is populated. errors = [] if len(required_nodes) > 0: if bool(tile.data): for target_k, target_v in tile.data.iteritems(): if target_k in required_nodes.keys() and target_v is None: if parent_tile in populated_tiles: populated_tiles.pop(populated_tiles.index(parent_tile)) errors.append({'type': 'WARNING', 'message': 'The {0} node is required and must be populated in order to populate the {1} nodes. 
This data was not imported.'.format(required_nodes[target_k], ', '.join(all_nodes.filter(nodegroup_id=str(target_tile.nodegroup_id)).values_list('name', flat=True)))}) elif bool(tile.tiles): for tile in tile.tiles: check_required_nodes(tile, parent_tile, required_nodes, all_nodes) if len(errors) > 0: self.errors += errors resources = [] missing_display_values = {} for row_number, row in enumerate(business_data): row_number = 'on line ' + unicode(row_number + 2) #to represent the row in a csv accounting for the header and 0 index if row['ResourceID'] != previous_row_resourceid and previous_row_resourceid is not None: save_count = save_count + 1 self.save_resource(populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count, row_number) # reset values for next resource instance populated_tiles = [] resourceinstanceid = process_resourceid(row['ResourceID'], overwrite) populated_nodegroups[resourceinstanceid] = [] source_data = column_names_to_targetids(row, mapping, row_number) row_keys = [list(b) for b in zip(*[a.keys() for a in source_data])] if len(row_keys) > 0: missing_display_nodes = [n for n in display_nodes if n not in row_keys] if len(missing_display_nodes) > 0: errors = [] for mdn in missing_display_nodes: mdn_name = all_nodes.filter(nodeid=mdn).values_list('name', flat=True)[0] try: missing_display_values[mdn_name].append(row_number.split('on line ')[-1]) except: missing_display_values[mdn_name] = [row_number.split('on line ')[-1]] if len(source_data) > 0: if source_data[0].keys(): try: target_resource_model = all_nodes.get(nodeid=source_data[0].keys()[0]).graph_id except: print '*'*80 print 'ERROR: No resource model found. Please make sure the resource model this business data is mapped to has been imported into Arches.' 
                            print '*'*80
                            sys.exit()

                    # Build a blank tile graph for the nodegroups present in this row's data.
                    target_tile = get_blank_tile(source_data)
                    if 'TileID' in row and row['TileID'] is not None:
                        target_tile.tileid = row['TileID']
                    if 'NodeGroupID' in row and row['NodeGroupID'] is not None:
                        target_tile.nodegroupid = row['NodeGroupID']

                    def populate_tile(source_data, target_tile):
                        '''
                        Populate target_tile (and its child tiles) in place from source_data,
                        consuming the source entries it uses.

                        source_data = [{nodeid:value},{nodeid:value},{nodeid:value} . . .]
                        All nodes in source_data belong to the same resource.
                        A dictionary of nodeids would not allow for multiple values for the same nodeid.
                        Grouping is enforced by having all grouped attributes in the same row.

                        Closes over the enclosing loop's row, resourceinstanceid,
                        populated_tiles, populated_nodegroups and helper functions.
                        '''
                        need_new_tile = False
                        # Assign a fresh id because this will be a new tile (unless the row
                        # explicitly carries a TileID to reuse).
                        target_tile.tileid = uuid.uuid4()
                        if 'TileID' in row and row['TileID'] is not None:
                            target_tile.tileid = row['TileID']
                        target_tile.resourceinstance_id = resourceinstanceid
                        # Check the cardinality of the tile and check if it has been populated.
                        # If cardinality is one and the tile is populated the tile should not be populated again.
                        # NOTE(review): 'TileiD' below looks like a typo for 'TileID'; as written the
                        # key can never be present, so the second clause is always True -- confirm
                        # intent before changing, since fixing it alters cardinality handling.
                        if str(target_tile.nodegroup_id) in single_cardinality_nodegroups and 'TileiD' not in row:
                            target_tile_cardinality = '1'
                        else:
                            target_tile_cardinality = 'n'

                        if str(target_tile.nodegroup_id) not in populated_nodegroups[resourceinstanceid]:
                            target_tile.nodegroup_id = str(target_tile.nodegroup_id)

                            # Check if we are populating a parent tile by inspecting the target_tile.data array.
                            if target_tile.data != {}:
                                # Iterate through the target_tile nodes and populate by iterating through the
                                # source_data array: fill as much of the target_tile as possible before
                                # moving on to the next target_tile.
                                for target_key in target_tile.data.keys():
                                    for source_tile in source_data:
                                        for source_key in source_tile.keys():
                                            # Check for source and target key match.
                                            if source_key == target_key:
                                                if target_tile.data[source_key] == None:
                                                    # If match populate target_tile node with transformed value.
                                                    value = transform_value(node_datatypes[source_key], source_tile[source_key], row_number, source_key)
                                                    target_tile.data[source_key] = value['value']
                                                    # target_tile.request = value['request']
                                                    # Delete key from source_tile so we do not populate another tile based on the same data.
                                                    del source_tile[source_key]
                                # Cleanup source_data array to remove source_tiles that are now '{}' from the code above.
                                source_data[:] = [item for item in source_data if item != {}]

                            # Check if we are populating a child tile(s) by inspecting the target_tile.tiles array.
                            elif target_tile.tiles != None:
                                populated_child_tiles = []
                                populated_child_nodegroups = []
                                for childtile in target_tile.tiles:
                                    if str(childtile.nodegroup_id) in single_cardinality_nodegroups:
                                        child_tile_cardinality = '1'
                                    else:
                                        child_tile_cardinality = 'n'

                                    def populate_child_tiles(source_data):
                                        # Deep-copy the prototype child tile so each pass starts from a
                                        # blank copy (cPickle round-trip, highest protocol).
                                        prototype_tile_copy = cPickle.loads(cPickle.dumps(childtile, -1))
                                        # NOTE(review): unlike the TileID handling above, this does not also
                                        # test row['TileID'] is not None -- confirm the inconsistency is intended.
                                        tileid = row['TileID'] if 'TileID' in row else uuid.uuid4()
                                        prototype_tile_copy.tileid = tileid
                                        prototype_tile_copy.parenttile = target_tile
                                        parenttileid = row['ParentTileID'] if 'ParentTileID' in row and row['ParentTileID'] is not None else None
                                        if parenttileid is not None:
                                            prototype_tile_copy.parenttile.tileid = parenttileid
                                        prototype_tile_copy.resourceinstance_id = resourceinstanceid
                                        if str(prototype_tile_copy.nodegroup_id) not in populated_child_nodegroups:
                                            prototype_tile_copy.nodegroup_id = str(prototype_tile_copy.nodegroup_id)
                                            for target_key in prototype_tile_copy.data.keys():
                                                for source_column in source_data:
                                                    for source_key in source_column.keys():
                                                        if source_key == target_key:
                                                            if prototype_tile_copy.data[source_key] == None:
                                                                value = transform_value(node_datatypes[source_key], source_column[source_key], row_number, source_key)
                                                                prototype_tile_copy.data[source_key] = value['value']
                                                                # print prototype_tile_copy.data[source_key]
                                                                # print '&'*80
                                                                # target_tile.request = value['request']
                                                                # Consume the source value so it is not used for another tile.
                                                                del source_column[source_key]
                                                            else:
                                                                # Node already filled on this copy: recurse to start
                                                                # another child tile for the remaining source_data.
                                                                populate_child_tiles(source_data)
                                        # Keep the copy only if it actually holds at least one non-None value
                                        # and its nodegroup has not already been satisfied.
                                        if prototype_tile_copy.data != {}:
                                            if len([item for item in prototype_tile_copy.data.values() if item != None]) > 0:
                                                if str(prototype_tile_copy.nodegroup_id) not in populated_child_nodegroups:
                                                    populated_child_tiles.append(prototype_tile_copy)
                                        if prototype_tile_copy != None:
                                            if child_tile_cardinality == '1' and 'NodeGroupID' not in row:
                                                populated_child_nodegroups.append(str(prototype_tile_copy.nodegroup_id))
                                        # Drop source columns emptied ({}) by the population above.
                                        source_data[:] = [item for item in source_data if item != {}]

                                    populate_child_tiles(source_data)

                                target_tile.tiles = populated_child_tiles

                            if not target_tile.is_blank():
                                populated_tiles.append(target_tile)

                            # Any leftover source_data means this row needs a further tile.
                            if len(source_data)>0:
                                need_new_tile = True

                            if target_tile_cardinality == '1' and 'NodeGroupID' not in row:
                                populated_nodegroups[resourceinstanceid].append(str(target_tile.nodegroup_id))

                            if need_new_tile:
                                new_tile = get_blank_tile(source_data)
                                if new_tile != None:
                                    populate_tile(source_data, new_tile)

                    # mock_request_object = HttpRequest()

                    if target_tile != None and len(source_data) > 0:
                        populate_tile(source_data, target_tile)
                        # Check that required nodes are populated. If not remove tile from populated_tiles array.
                        check_required_nodes(target_tile, target_tile, required_nodes, all_nodes)

                # Remember this row's resource so the next iteration can detect a change
                # of ResourceID and trigger a save.
                previous_row_resourceid = row['ResourceID']
                legacyid = row['ResourceID']

            # check for missing display value nodes.
            # Report, as warnings, every display-value node together with the rows
            # on which it was null or unmapped.
            errors = []
            for k,v in missing_display_values.iteritems():
                if len(v) > 0:
                    errors.append({'type': 'WARNING', 'message': '{0} is null or not mapped on rows {1} and participates in a display value function.'.format(k, ','.join(v))})
            if len(errors) > 0:
                self.errors += errors

            # 'legacyid' is only bound if the loop above processed at least one row;
            # this guard skips the final save for an empty import.
            # NOTE(review): eight positional arguments are passed here but the
            # save_resource defined earlier takes seven after self -- confirm the
            # signature includes row_number.
            if 'legacyid' in locals():
                self.save_resource(populated_tiles, resourceinstanceid, legacyid, resources, target_resource_model, bulk, save_count, row_number)

            # Flush any resource instances still queued from bulk batching.
            if bulk:
                Resource.bulk_save(resources=resources)

            # save_count was last incremented before the final save, hence the + 1.
            print _('%s total resource saved' % (save_count + 1))
        except Exception as e:
            # Print the full traceback, then swallow the exception so a failed import
            # does not propagate to the caller (errors were accumulated in self.errors).
            exc_type, exc_value, exc_traceback = sys.exc_info()
            formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
            if len(formatted):
                for message in formatted:
                    print message
        finally:
            pass