def handle(self, *args, **options):
    if not settings.NMTK_SERVER:
        raise CommandError('The NMTK Server is not currently enabled')
    for m in models.DataFile.objects.filter(srid__gte=0):
        data_qs = data_output.getQuerySet(m)
        extent = data_qs.extent()
        geometry = Polygon.from_bbox(extent)
        m.extent = str(geometry)
        m.save()
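# The handle() above reads like the body of a Django management command; a
# minimal sketch of the assumed surrounding boilerplate follows.  The Django
# imports are standard, but where models and data_output come from is an
# assumption based on the references in the snippet, not something it shows.
#
# from django.conf import settings
# from django.contrib.gis.geos import Polygon
# from django.core.management.base import BaseCommand, CommandError
# from NMTK_server import models
# from NMTK_server import data_output  # assumed import path
#
# class Command(BaseCommand):
#     help = 'Backfill DataFile.extent for already-imported spatial data files'
#
#     def handle(self, *args, **options):
#         ...  # body as above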
def importDataFile(datafile, job_id=None):
    from NMTK_server import models
    datafile.status_message = None
    try:
        loader = NMTKDataLoader(datafile.file.path, srid=datafile.srid)
        destination = None
        for import_file in loader.extract_files():
            # Figure out where these files need to go.
            if not destination:
                destination = os.path.dirname(datafile.file.path)
            # the first file we get (when destination is null,it's our first
            # loop) is the one that needs to be in the model, handle that
            # here...
            if datafile.file.path != import_file:
                f = open(import_file)
                datafile.file.save(os.path.basename(import_file), File(f))
            else:
                shutil.copyfile(import_file,
                                os.path.join(destination,
                                             os.path.basename(import_file)))
            logger.debug('Created a new file for %s', import_file)
        if loader.is_spatial:
            datafile.srid = loader.info.srid
            datafile.srs = loader.info.srs
            datafile.geom_type = loader.info.type
            logger.debug('Loader extent is %s', loader.info.extent)
            extent = geos.Polygon.from_bbox(loader.info.extent)
            logger.debug("Extent is 'srid=%s;%s'::geometry",
                         loader.info.srid, extent,)
            if datafile.srid:
                extent.srid = int(loader.info.srid)
                extent.transform(4326)
            logger.debug("Extent is 'srid=%s;%s'::geometry", 4326, extent,)
            datafile.extent = extent
        datafile.feature_count = loader.info.feature_count
        if not datafile.description:
            datafile.description = loader.info.format
        if loader.is_spatial and not datafile.srid:
            datafile.status = datafile.IMPORT_FAILED
            datafile.status_message = ('Please specify SRID for this file '
                                       '(unable to auto-identify SRID)')
        elif not job_id:
            datafile.status = datafile.IMPORTED
        else:
            datafile.status = datafile.IMPORT_RESULTS_COMPLETE
        datafile.fields = loader.info.fields
        # Create an empty file using ContentFile, then we can overwrite it
        # with the desired GeoJSON data.
        if loader.is_spatial:
            suffix = 'geojson'
        else:
            suffix = 'json'
        if datafile.status in (datafile.IMPORTED,
                               datafile.IMPORT_RESULTS_COMPLETE):
            if datafile.geom_type == 99:
                # This is a raster...
                field_attributes = {}
                for pos, band in enumerate(loader.dl_instance.bands()):
                    field_attributes[pos + 1] = {'type': band.type,
                                                 'field_name': 'pixel',
                                                 'min': band.min,
                                                 'max': band.max}
                datafile.field_attributes = field_attributes
            elif datafile.feature_count:
                logger.error('Working on saving the model!')
                datafile.processed_file.save('{0}.{1}'.format(datafile.pk, suffix),
                                             ContentFile(''))
                loader.export_json(datafile.processed_file.path)
                generate_datamodel(datafile, loader)
                # Here we load the spatialite data using the model that was
                # created by generate_datamodel.  We need to use this to get
                # the range and type information for each field...
                try:
                    field_attributes = {}
                    qs = getQuerySet(datafile)
                    field_mappings = [
                        (django_model_fields.IntegerField, 'integer', int),
                        # Required because nmtk_id is an autofield..
                        (django_model_fields.AutoField, 'integer', int,),
                        (django_model_fields.BooleanField, 'boolean', bool),
                        # Special case holding FIPS
                        (django_model_fields.DecimalField, 'float', float),
                        (django_model_fields.TextField, 'text', None),
                        (django_model_fields.FloatField, 'float', float),
                        (django_model_fields.DateField, 'date', None,),
                        (django_model_fields.TimeField, 'time', None,),
                        (django_model_fields.DateTimeField, 'datetime', None)]
                    if qs.count() > 0:
                        # Get a single row so that we can try to work with the
                        # fields.
                        sample_row = qs[0]
                        for field in sample_row._meta.fields:
                            field_name = field.name
                            db_column = field.db_column or field.name
                            # convert the django field type to a text string.
                            for ftype, field_type, caster in field_mappings:
                                if isinstance(field, (ftype,)):
                                    break
                            else:
                                logger.info('Unable to map field of type %s '
                                            '(this is expected for GIS fields)',
                                            type(field,))
                                continue
                            values_aggregates = qs.aggregate(
                                Count(field_name, distinct=True))
                            field_attributes[db_column] = {
                                'type': field_type,
                                'field_name': field_name,
                                'distinct': values_aggregates[
                                    '{0}__count'.format(field_name)]}
                            if field_attributes[db_column]['distinct'] < 10:
                                distinct_values = list(
                                    qs.order_by().values_list(field_name,
                                                              flat=True).distinct())
                                if not caster:
                                    field_attributes[db_column]['values'] = (
                                        distinct_values)
                                else:
                                    field_attributes[db_column]['values'] = map(
                                        caster, distinct_values)
                            else:
                                logger.debug('There are more than 10 values for %s '
                                             '(%s), enumerating..', db_column,
                                             field_attributes[db_column]['distinct'])
                                # formerly the aggregates happened above - with
                                # the count.  However, Django doesn't allow those
                                # aggregates with boolean fields - so here we
                                # split it up to only do the aggregates in the
                                # cases where we have to (i.e., the distinct
                                # values is above the threshold.)
                                values_aggregates = qs.aggregate(Max(field_name),
                                                                 Min(field_name),)
                                field_attributes[db_column]['min'] = values_aggregates[
                                    '{0}__min'.format(field_name)]
                                field_attributes[db_column]['max'] = values_aggregates[
                                    '{0}__max'.format(field_name)]
                                if caster:
                                    field_attributes[db_column]['min'] = caster(
                                        field_attributes[db_column]['min'])
                                    field_attributes[db_column]['max'] = caster(
                                        field_attributes[db_column]['max'])
                    datafile.field_attributes = field_attributes
                except Exception as e:
                    logger.exception('Failed to get range for model %s',
                                     datafile.pk)
        if job_id:
            try:
                job = models.Job.objects.get(pk=job_id)
                # There might be multiple results files from this job, so we
                # will only mark the job as complete if all the results files
                # are processed.
                if job.status != job.COMPLETE:
                    results_left = job.job_files.filter(
                        status=models.DataFile.PROCESSING_RESULTS).count()
                    if results_left == 0:
                        job.status = job.COMPLETE
                        models.JobStatus(
                            message='Job Completed',
                            timestamp=timezone.now(),
                            job=job,
                            category=models.JobStatus.CATEGORY_SYSTEM).save()
                    elif results_left == 1:
                        # Handle the potential race condition here - do we really
                        # need this?  sort of.  Since it's possible that two files
                        # finish post-processing at the same time.  In such cases,
                        # a second should be more than enough time to get both
                        # committed as complete.
                        time.sleep(1)
                        job = models.Job.objects.get(pk=job_id)
                        if job.status != job.COMPLETE:
                            results_left = job.job_files.filter(
                                status=models.DataFile.PROCESSING_RESULTS).count()
                            if results_left == 0:
                                job.status = job.COMPLETE
                                models.JobStatus(
                                    message='Job Completed',
                                    timestamp=timezone.now(),
                                    job=job,
                                    category=models.JobStatus.CATEGORY_SYSTEM).save()
            except:
                logger.exception('Failed to update job status to complete?!!')
    except Exception as e:
        logger.exception('Failed import process!')
        datafile.processed_file = None
        if not job_id:
            datafile.status = datafile.IMPORT_FAILED
        else:
            datafile.status = datafile.IMPORT_RESULTS_FAILED
        datafile.status_message = "%s" % (e,)
        if job_id:
            try:
                job = models.Job.objects.get(pk=job_id)
                job.status = job.POST_PROCESSING_FAILED
            except:
                logger.exception('Failed to update job status to failed?!!')
    if job_id:
        job.save()
    # Now we need to create the spatialite version of this thing.
    datafile.save()
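# Hedged usage sketch for the function above (identifiers other than
# importDataFile and the models module are hypothetical placeholders): the
# function is invoked once for a freshly uploaded file, and once per results
# file when post-processing tool output for a job.
#
# datafile = models.DataFile.objects.get(pk=some_datafile_id)  # hypothetical id
# importDataFile(datafile)                 # plain upload -> status IMPORTED
# importDataFile(results_file, job_id=42)  # job results -> IMPORT_RESULTS_COMPLETE,
#                                          # and the Job is marked COMPLETE once all
#                                          # of its PROCESSING_RESULTS files finish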
def importDataFile(datafile, job_id=None):
    from NMTK_server import models
    logger = importDataFile.get_logger()
    datafile.status_message = None
    job = None
    try:
        loader = NMTKDataLoader(datafile.file.path, srid=datafile.srid,
                                logger=logger)
        destination = None
        for import_file in loader.extract_files():
            # Figure out where these files need to go.
            if not destination:
                destination = os.path.dirname(datafile.file.path)
            # the first file we get (when destination is null,it's our first
            # loop) is the one that needs to be in the model, handle that
            # here...
            if datafile.file.path != import_file:
                f = open(import_file)
                datafile.file.save(os.path.basename(import_file), File(f))
            else:
                shutil.copyfile(import_file,
                                os.path.join(destination,
                                             os.path.basename(import_file)))
            logger.debug('Created a new file for %s', import_file)
        logger.info('The file is spatial? %s', loader.is_spatial)
        if loader.is_spatial:
            datafile.srid = loader.info.srid
            datafile.srs = loader.info.srs
            datafile.geom_type = loader.info.type
            logger.debug('Loader extent is %s', loader.info.extent)
            extent = geos.Polygon.from_bbox(loader.info.extent)
            logger.debug("Extent is 'srid=%s;%s'::geometry",
                         loader.info.srid, extent,)
            if datafile.srid:
                extent.srid = int(loader.info.srid)
                extent.transform(4326)
            logger.debug("Extent is 'srid=%s;%s'::geometry", 4326, extent,)
            datafile.extent = extent
        datafile.feature_count = loader.info.feature_count
        if not datafile.description:
            datafile.description = loader.info.format
        future_status = datafile.status
        if loader.is_spatial and not datafile.srid:
            future_status = datafile.IMPORT_FAILED
            datafile.status_message = ('Please specify SRID for this file '
                                       '(unable to auto-identify SRID)')
        elif not job_id:
            future_status = datafile.IMPORTED
        else:
            future_status = datafile.IMPORT_RESULTS_COMPLETE
        # We need to merge these things..
        desired_field_order = datafile.fields or []
        # Now that we have a desired field order from the model, we can
        # go the next step of getting job data.
        if job_id:
            try:
                job = models.Job.objects.select_related('tool').get(pk=job_id)
            except Exception as e:
                logger.error('Failed to get job with id of %s', job_id,
                             exc_info=True)
        # From the job data we can get the tool config:
        config_field_list = config_namespace = None
        # Get the list of field names, with the unique ones first...
        tool_config_field_units = {}
        job_config_field_units = datafile.units or {}
        if job:
            tool_config = job.tool.toolconfig.json_config
            # there might be multiple input files, but we'll use the first
            # one as the basis for format for the output, since we don't
            # really have a better option.  The tool developer ought to
            # specify a list of fields in the output if they don't like
            # this behaviour, since this is just a "default" for the order.
            for t in job.tool.toolconfig.json_config['input']:
                if t.get('type', '').lower() == 'file':
                    config_namespace = t.get('name', None)
                    if config_namespace:
                        config_field_list = [
                            f['name'] for f in t.get('elements', [])
                            if isinstance(f.get('name', None), (str, unicode))]
                        # If there are units, then we store the units
                        # here, so we can use that with the field data.
                        for f in t.get('elements', []):
                            if 'units' in f:
                                tool_config_field_units[f['name']] = f.get(
                                    'units', None)
                            elif 'description' in f:
                                tool_config_field_units[f['name']] = f.get(
                                    'description', None)
                    break
            # Now that we have a list of fields from the tool configuration,
            # get the input fields from the file for each of the tool fields,
            # since we want that to be the default order of output.
            if config_field_list:
                job_config = job.config[config_namespace]
                for f in config_field_list:
                    if f in job_config:
                        if job_config[f].get('type', None) == 'property':
                            if isinstance(job_config[f].get('value', None),
                                          (str, unicode)):
                                desired_field_order.append(job_config[f]['value'])
                                # Map the tool config field (f) to the selected
                                # data file field (job_config[f]['value']) so we
                                # can grab the units from the tool config.
                                if (datafile.units and f in datafile.units and
                                        'value' in job_config[f]):
                                    job_config_field_units[job_config[f]['value']] = (
                                        datafile.units.get(f, ''))
                                # If the tool didn't give us the units to use
                                # for fields we can fall back to the tool config
                                # to see what they ought to be.
                                elif (f in tool_config_field_units and
                                      'value' in job_config[f]):
                                    job_config_field_units[job_config[f]['value']] = (
                                        tool_config_field_units.get(f, ''))
        # Get the list of actual fields in the input datafile...
        available_fields = loader.info.fields
        # eliminate fields that are not in the list of output fields.
        logger.debug('Desired field order is: %s', desired_field_order)
        logger.debug('Loader provided field order is: %s', available_fields)
        ordered_fields = [field for field in desired_field_order
                          if field in available_fields]
        # Add in any fields using the order first, then following with
        # any fields not in the ordered list, but in the output list
        # of fields.
        datafile.fields = list(unique_everseen(ordered_fields +
                                               available_fields))
        logger.debug('Final field order is %s', datafile.fields)
        # Create an empty file using ContentFile, then we can overwrite it
        # with the desired GeoJSON data.
        if loader.is_spatial:
            suffix = 'geojson'
        else:
            suffix = 'json'
        if future_status in (datafile.IMPORTED,
                             datafile.IMPORT_RESULTS_COMPLETE):
            if datafile.geom_type == 99:
                # This is a raster...
                field_attributes = {}
                for pos, band in enumerate(loader.dl_instance.bands()):
                    field_attributes[pos + 1] = {'type': band.type,
                                                 'field_name': 'pixel',
                                                 'min': band.min,
                                                 'max': band.max}
                datafile.field_attributes = field_attributes
            elif datafile.feature_count:
                logger.error('Working on saving the model!')
                datafile.processed_file.save('{0}.{1}'.format(datafile.pk, suffix),
                                             ContentFile(''))
                loader.export_json(datafile.processed_file.path)
                try:
                    generate_datamodel(datafile, loader, logger)
                except Exception as e:
                    logger.error('Error generating data model: %s', e,
                                 exc_info=logger.isEnabledFor(logging.DEBUG))
                    raise e
                # Here we load the spatialite data using the model that was
                # created by generate_datamodel.  We need to use this to get
                # the range and type information for each field...
                try:
                    field_attributes = {}
                    qs = getQuerySet(datafile)
                    field_mappings = [
                        (django_model_fields.IntegerField, 'integer', int),
                        # Required because nmtk_id is an autofield..
                        (django_model_fields.AutoField, 'integer', int,),
                        (django_model_fields.BooleanField, 'boolean', bool),
                        # Special case holding FIPS
                        (django_model_fields.DecimalField, 'float', float),
                        (django_model_fields.TextField, 'text', None),
                        (django_model_fields.FloatField, 'float', float),
                        (django_model_fields.DateField, 'date',
                         datetime.date.isoformat,),
                        (django_model_fields.TimeField, 'time',
                         datetime.time.isoformat,),
                        (django_model_fields.DateTimeField, 'datetime',
                         datetime.datetime.isoformat)]
                    if qs.count() > 0:
                        # Get a single row so that we can try to work with the
                        # fields.
                        sample_row = qs[0]
                        for field in sample_row._meta.fields:
                            field_name = field.name
                            db_column = field.db_column or field.name
                            # convert the django field type to a text string.
                            for ftype, field_type, caster in field_mappings:
                                if isinstance(field, (ftype,)):
                                    break
                            else:
                                logger.info('Unable to map field of type %s '
                                            '(this is expected for GIS fields)',
                                            type(field,))
                                continue
                            values_aggregates = qs.aggregate(
                                Count(field_name, distinct=True))
                            field_attributes[db_column] = {
                                'type': field_type,
                                'field_name': field_name,
                                'distinct': values_aggregates[
                                    '{0}__count'.format(field_name)]}
                            # Add the units from the config to the data.
                            if db_column in job_config_field_units:
                                field_attributes[db_column]['units'] = (
                                    job_config_field_units[db_column])
                            if field_attributes[db_column]['distinct'] < 10:
                                distinct_values = [
                                    v for v in qs.order_by().values_list(
                                        field_name, flat=True).distinct()
                                    if v is not None]
                                if not caster:
                                    field_attributes[db_column]['values'] = (
                                        distinct_values)
                                else:
                                    logger.info('Attempting to cast values: %s',
                                                distinct_values)
                                    field_attributes[db_column]['values'] = map(
                                        caster, distinct_values)
                            else:
                                logger.debug('There are more than 10 values for %s '
                                             '(%s), enumerating..', db_column,
                                             field_attributes[db_column]['distinct'])
                                # formerly the aggregates happened above - with
                                # the count.  However, Django doesn't allow those
                                # aggregates with boolean fields - so here we
                                # split it up to only do the aggregates in the
                                # cases where we have to (i.e., the distinct
                                # values is above the threshold.)
                                values_aggregates = qs.aggregate(Max(field_name),
                                                                 Min(field_name),)
                                field_attributes[db_column]['min'] = values_aggregates[
                                    '{0}__min'.format(field_name)]
                                field_attributes[db_column]['max'] = values_aggregates[
                                    '{0}__max'.format(field_name)]
                                if caster:
                                    field_attributes[db_column]['min'] = caster(
                                        field_attributes[db_column]['min'])
                                    field_attributes[db_column]['max'] = caster(
                                        field_attributes[db_column]['max'])
                    datafile.field_attributes = field_attributes
                    datafile.units = job_config_field_units
                except Exception as e:
                    logger.exception('Failed to get range for model %s',
                                     datafile.pk)
        if job:
            try:
                # There might be multiple results files from this job, so we
                # will only mark the job as complete if all the results files
                # are processed.
                if job.status != job.COMPLETE:
                    results_left = job.job_files.filter(
                        status=models.DataFile.PROCESSING_RESULTS).count()
                    if results_left == 0:
                        job.status = job.COMPLETE
                        models.JobStatus(
                            message='Job Completed',
                            timestamp=timezone.now(),
                            job=job,
                            category=models.JobStatus.CATEGORY_SYSTEM).save()
                    elif results_left == 1:
                        # Handle the potential race condition here - do we really
                        # need this?  sort of.  Since it's possible that two files
                        # finish post-processing at the same time.  In such cases,
                        # a second should be more than enough time to get both
                        # committed as complete.
                        time.sleep(1)
                        job = models.Job.objects.get(pk=job_id)
                        if job.status != job.COMPLETE:
                            results_left = job.job_files.filter(
                                status=models.DataFile.PROCESSING_RESULTS).count()
                            if results_left == 0:
                                job.status = job.COMPLETE
                                models.JobStatus(
                                    message='Job Completed',
                                    timestamp=timezone.now(),
                                    job=job,
                                    category=models.JobStatus.CATEGORY_SYSTEM).save()
            except:
                logger.exception('Failed to update job status to complete?!!')
        datafile.status = future_status
    except Exception as e:
        logger.error('Failed import process!', exc_info=True)
        datafile.processed_file = None
        if not job_id:
            datafile.status = datafile.IMPORT_FAILED
        else:
            datafile.status = datafile.IMPORT_RESULTS_FAILED
        datafile.status_message = "%s" % (e,)
        if job_id:
            try:
                if not job:
                    job = models.Job.objects.get(pk=job_id)
                job.status = job.POST_PROCESSING_FAILED
                logger.info('Set post processing to failed for job %s', job.pk)
            except:
                logger.error('Failed to update job status to failed?!!',
                             exc_info=True)
    if job:
        job.save()
    datafile.save()
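# unique_everseen() is called above to merge the ordered and available field
# lists without duplicates while preserving order.  It is not defined in this
# snippet; the sketch below assumes it is the standard itertools-recipes helper
# of that name (Python 2 spelling, since the surrounding code uses unicode/map).
from itertools import ifilterfalse  # use itertools.filterfalse on Python 3


def unique_everseen(iterable, key=None):
    """Yield unique elements, preserving order; remembers all elements seen."""
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in ifilterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element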
def importDataFile(datafile, job_id=None):
    from NMTK_server import models
    datafile.status_message = None
    try:
        loader = NMTKDataLoader(datafile.file.path, srid=datafile.srid)
        if loader.is_spatial:
            datafile.srid = loader.info.srid
            datafile.srs = loader.info.srs
            datafile.geom_type = loader.info.type
            logger.debug('Loader extent is %s', loader.info.extent)
            extent = geos.Polygon.from_bbox(loader.info.extent)
            logger.debug("Extent is 'srid=%s;%s'::geometry",
                         loader.info.srid, extent,)
            if datafile.srid:
                extent.srid = int(loader.info.srid)
                extent.transform(4326)
            logger.debug("Extent is 'srid=%s;%s'::geometry", 4326, extent,)
            datafile.extent = extent
        datafile.feature_count = loader.info.feature_count
        if loader.is_spatial and not datafile.srid:
            datafile.status = datafile.IMPORT_FAILED
            datafile.status_message = ('Please specify SRID for this file '
                                       '(unable to auto-identify SRID)')
        elif not job_id:
            datafile.status = datafile.IMPORTED
        else:
            datafile.status = datafile.IMPORT_RESULTS_COMPLETE
        datafile.fields = loader.info.fields
        # Create an empty file using ContentFile, then we can overwrite it
        # with the desired GeoJSON data.
        if loader.is_spatial:
            suffix = 'geojson'
        else:
            suffix = 'json'
        if datafile.status in (datafile.IMPORTED,
                               datafile.IMPORT_RESULTS_COMPLETE):
            datafile.processed_file.save('{0}.{1}'.format(datafile.pk, suffix),
                                         ContentFile(''))
            loader.export_json(datafile.processed_file.path)
            generate_datamodel(datafile, loader)
            # Here we load the spatialite data using the model that was created
            # by generate_datamodel.  We need to use this to get the range and
            # type information for each field...
            try:
                field_attributes = {}
                qs = getQuerySet(datafile)
                field_mappings = [
                    (django_model_fields.IntegerField, 'integer',),
                    # Required because nmtk_id is an autofield..
                    (django_model_fields.AutoField, 'integer',),
                    (django_model_fields.BooleanField, 'boolean',),
                    # Special case holding FIPS
                    (django_model_fields.DecimalField, 'float',),
                    (django_model_fields.TextField, 'text',),
                    (django_model_fields.FloatField, 'float'),
                    (django_model_fields.DateField, 'date',),
                    (django_model_fields.TimeField, 'time'),
                    (django_model_fields.DateTimeField, 'datetime')]
                if qs.count() > 0:
                    # Get a single row so that we can try to work with the fields.
                    sample_row = qs[0]
                    for field in sample_row._meta.fields:
                        field_name = field.name
                        db_column = field.db_column or field.name
                        # convert the django field type to a text string.
                        for ftype, field_type in field_mappings:
                            if isinstance(field, (ftype,)):
                                break
                        else:
                            logger.info('Unable to map field of type %s '
                                        '(this is expected for GIS fields)',
                                        type(field,))
                            continue
                        values_aggregates = qs.aggregate(
                            Count(field_name, distinct=True))
                        field_attributes[db_column] = {
                            'type': field_type,
                            'field_name': field_name,
                            'distinct': values_aggregates[
                                '{0}__count'.format(field_name)]}
                        if field_attributes[db_column]['distinct'] < 10:
                            distinct_values = list(
                                qs.order_by().values_list(field_name,
                                                          flat=True).distinct())
                            field_attributes[db_column]['values'] = distinct_values
                        else:
                            logger.debug('There are more than 10 values for %s '
                                         '(%s), enumerating..', db_column,
                                         field_attributes[db_column]['distinct'])
                            # formerly the aggregates happened above - with the
                            # count.  However, Django doesn't allow those
                            # aggregates with boolean fields - so here we split
                            # it up to only do the aggregates in the cases where
                            # we have to (i.e., the distinct values is above the
                            # threshold.)
                            values_aggregates = qs.aggregate(Max(field_name),
                                                             Min(field_name),)
                            field_attributes[db_column]['min'] = values_aggregates[
                                '{0}__min'.format(field_name)]
                            field_attributes[db_column]['max'] = values_aggregates[
                                '{0}__max'.format(field_name)]
                datafile.field_attributes = field_attributes
            except Exception as e:
                logger.exception('Failed to get range for model %s', datafile.pk)
        if job_id:
            try:
                job = models.Job.objects.get(pk=job_id)
                # There might be multiple results files from this job, so we
                # will only mark the job as complete if all the results files
                # are processed.
                if job.status != job.COMPLETE:
                    results_left = job.job_files.filter(
                        status=models.DataFile.PROCESSING_RESULTS).count()
                    if results_left == 0:
                        job.status = job.COMPLETE
                        models.JobStatus(message='Job Completed',
                                         timestamp=timezone.now(),
                                         job=job).save()
                    elif results_left == 1:
                        # Handle the potential race condition here - do we really
                        # need this?  sort of.  Since it's possible that two files
                        # finish post-processing at the same time.  In such cases,
                        # a second should be more than enough time to get both
                        # committed as complete.
                        time.sleep(1)
                        job = models.Job.objects.get(pk=job_id)
                        if job.status != job.COMPLETE:
                            results_left = job.job_files.filter(
                                status=models.DataFile.PROCESSING_RESULTS).count()
                            if results_left == 0:
                                job.status = job.COMPLETE
                                models.JobStatus(message='Job Completed',
                                                 timestamp=timezone.now(),
                                                 job=job).save()
            except:
                logger.exception('Failed to update job status to complete?!!')
    except Exception as e:
        logger.exception('Failed import process!')
        datafile.processed_file = None
        if not job_id:
            datafile.status = datafile.IMPORT_FAILED
        else:
            datafile.status = datafile.IMPORT_RESULTS_FAILED
        datafile.status_message = "%s" % (e,)
    datafile.save()
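# The field profiling in the functions above leans on three Django ORM
# aggregates; a standalone sketch of the same pattern follows.  The model and
# field names here are hypothetical placeholders, only the aggregate calls
# mirror the code above.
#
# from django.db.models import Count, Max, Min
#
# stats = SomeResult.objects.aggregate(Count('speed', distinct=True))
# if stats['speed__count'] < 10:
#     # Few distinct values: enumerate them (the code above also casts them).
#     values = list(SomeResult.objects.order_by()
#                   .values_list('speed', flat=True).distinct())
# else:
#     # Many distinct values: record a min/max range instead.  Min/Max stay out
#     # of the first aggregate() call because boolean fields reject them.
#     bounds = SomeResult.objects.aggregate(Min('speed'), Max('speed'))
#     low, high = bounds['speed__min'], bounds['speed__max']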