def run(self):
    while True:
        try:
            next_task = self.task_queue.get(True, 10)
        except Queue.Empty:
            logger.info("No tasks left to complete, closing %s" % self.name)
            break
        else:
            answer = (None, None)
            try:
                answer = (1, next_task(self.name, self.active))
            except Exception as detail:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logger.error("Disabling Error: " +
                             repr(traceback.format_exception(exc_type, exc_value, exc_traceback)))
                if isinstance(next_task, DataController):
                    answer = (-2, "DataController")
                    # Tell the particles that the DataController is releasing file
                    self.get_data.value = False
                    # The data controller has died, so don't process any more tasks
                    self.active.value = False
                elif isinstance(next_task, ForceParticle):
                    answer = (-1, next_task.part)
                else:
                    logger.warn("Strange task raised an exception: %s" % str(next_task.__class__))
                    answer = (None, None)
            finally:
                self.result_queue.put(answer)

                self.nproc_lock.acquire()
                self.n_run.value = self.n_run.value - 1
                self.nproc_lock.release()

                self.task_queue.task_done()
def open(ncfile, xname='lon', yname='lat', zname='z', tname='time', **kwargs):
    """
    Initialize paegan dataset object, which uses specific readers for
    different kinds of datasets, and returns dataset objects that expose
    a common api.

        from cdm.dataset import CommonDataset
        >> dataset = CommonDataset.open(ncfile)
        >> dataset = CommonDataset.open(url, "lon_rho", "lat_rho", "s_rho", "ocean_time")
        >> dataset = CommonDataset.open(url, dataset_type="cgrid")
    """
    nc = None
    filename = None

    if isinstance(ncfile, str):
        ncfile = unicode(ncfile.strip())

    if isinstance(ncfile, unicode):
        try:
            nc = netCDF4.Dataset(ncfile)
            filename = ncfile
        except StandardError:
            logger.error(ncfile)
            raise
    elif isinstance(ncfile, Dataset):
        # Passed in paegan Dataset object
        nc = ncfile.nc
    elif isinstance(ncfile, netCDF4.Dataset):
        # Passed in a netCDF4 Dataset object
        nc = ncfile

    datasettype = kwargs.get('dataset_type', None)

    # Find the coordinate variables for testing, unknown if not found
    keys = set(nc.variables)
    posx = set(_possiblex)
    posy = set(_possibley)
    xmatches = list(posx.intersection(keys))
    ymatches = list(posy.intersection(keys))

    if xname in keys and yname in keys:
        testvary = nc.variables[yname]
        testvarx = nc.variables[xname]
    elif len(xmatches) > 0:
        testvary = nc.variables[ymatches[0]]
        testvarx = nc.variables[xmatches[0]]

    # Test the shapes of the coordinate variables to determine the grid type
    if datasettype is None:
        if testvary.ndim > 1:
            datasettype = "cgrid"
        else:
            if testvary.shape[0] != testvarx.shape[0]:
                datasettype = "rgrid"
            else:
                if "cdm_data_type" in nc.ncattrs():
                    if nc.cdm_data_type.lower() == "grid":
                        datasettype = "rgrid"
                    else:
                        datasettype = "ncell"
                else:
                    datasettype = "ncell"

    nc.close()

    # Return appropriate dataset subclass based on datasettype
    if datasettype == 'ncell':
        dataobj = NCellDataset(filename, datasettype,
                               zname=zname, tname=tname, xname=xname, yname=yname)
    elif datasettype == 'rgrid':
        dataobj = RGridDataset(filename, datasettype,
                               zname=zname, tname=tname, xname=xname, yname=yname)
    elif datasettype == 'cgrid':
        dataobj = CGridDataset(filename, datasettype,
                               zname=zname, tname=tname, xname=xname, yname=yname)
    else:
        dataobj = None

    return dataobj
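# Usage sketch for the factory above. Hedged: the file path is a hypothetical
# placeholder; the ROMS-style coordinate names come from the docstring example,
# and the gettimevar/closenc calls mirror how the rest of this module uses the
# returned object.
#
#   dataset = CommonDataset.open("/path/to/roms_output.nc",
#                                xname="lon_rho", yname="lat_rho",
#                                zname="s_rho", tname="ocean_time")
#   timevar = dataset.gettimevar("u")   # same call pattern used by the controllers below
#   dataset.closenc()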
def __call__(self, active): c = 0 self.dataset = CommonDataset.open(self.hydrodataset) self.remote = self.dataset.nc # Calculate the datetimes of the model timesteps like # the particle objects do, so we can figure out unique # time indices modelTimestep, newtimes = AsaTransport.get_time_objects_from_model_timesteps( self.times, start=self.start_time) timevar = self.dataset.gettimevar(self.uname) # Don't need to grab the last datetime, as it is not needed for forcing, only # for setting the time of the final particle forcing time_indexs = timevar.nearest_index(newtimes[0:-1], select='before') # Have to make sure that we get the plus 1 for the # linear interpolation of u,v,w,temp,salt self.inds = np.unique(time_indexs) self.inds = np.append(self.inds, self.inds.max() + 1) # While there is at least 1 particle still running, # stay alive, if not break while self.n_run.value > 1: if self.caching is False: logger.debug( "Caching is False, not doing much. Just hanging out until all of the particles finish." ) timer.sleep(10) continue # If particle asks for data, do the following if self.get_data.value is True: logger.debug("Particle asked for data!") # Wait for particles to get out while True: self.read_lock.acquire() logger.debug("Read count: %d" % self.read_count.value) if self.read_count.value > 0: logger.debug( "Waiting for write lock on cache file (particles must stop reading)..." ) self.read_lock.release() timer.sleep(2) else: break # Get write lock on the file. Already have read lock. self.write_lock.acquire() self.has_write_lock.value = os.getpid() if c == 0: logger.debug("Creating cache file") try: # Open local cache for writing, overwrites # existing file with same name self.local = netCDF4.Dataset(self.cache_path, 'w') indices = self.dataset.get_indices( self.uname, timeinds=[np.asarray([0])], point=self.start) self.point_get.value = [ self.inds[0], indices[-2], indices[-1] ] # Create dimensions for u and v variables self.local.createDimension('time', None) self.local.createDimension('level', None) self.local.createDimension('x', None) self.local.createDimension('y', None) # Create 3d or 4d u and v variables if self.remote.variables[self.uname].ndim == 4: self.ndim = 4 dimensions = ('time', 'level', 'y', 'x') coordinates = "time z lon lat" elif self.remote.variables[self.uname].ndim == 3: self.ndim = 3 dimensions = ('time', 'y', 'x') coordinates = "time lon lat" shape = self.remote.variables[self.uname].shape # If there is no FillValue defined in the dataset, use np.nan. # Sometimes it will work out correctly and other times we will # have a huge cache file. 
try: fill = self.remote.variables[ self.uname].missing_value except Exception: fill = np.nan # Create domain variable that specifies # where there is data geographically/by time # and where there is not data, # Used for testing if particle needs to # ask cache to update domain = self.local.createVariable('domain', 'i', dimensions, zlib=False, fill_value=0) domain.coordinates = coordinates # Create local u and v variables u = self.local.createVariable('u', 'f', dimensions, zlib=False, fill_value=fill) v = self.local.createVariable('v', 'f', dimensions, zlib=False, fill_value=fill) v.coordinates = coordinates u.coordinates = coordinates localvars = [ u, v, ] remotevars = [ self.remote.variables[self.uname], self.remote.variables[self.vname] ] # Create local w variable if self.wname is not None: w = self.local.createVariable('w', 'f', dimensions, zlib=False, fill_value=fill) w.coordinates = coordinates localvars.append(w) remotevars.append( self.remote.variables[self.wname]) if self.temp_name is not None and self.salt_name is not None: # Create local temp and salt vars temp = self.local.createVariable('temp', 'f', dimensions, zlib=False, fill_value=fill) salt = self.local.createVariable('salt', 'f', dimensions, zlib=False, fill_value=fill) temp.coordinates = coordinates salt.coordinates = coordinates localvars.append(temp) localvars.append(salt) remotevars.append( self.remote.variables[self.temp_name]) remotevars.append( self.remote.variables[self.salt_name]) # Create local lat/lon coordinate variables if self.remote.variables[self.xname].ndim == 2: lon = self.local.createVariable('lon', 'f', ("y", "x"), zlib=False) lon[:] = self.remote.variables[self.xname][:, :] lat = self.local.createVariable('lat', 'f', ("y", "x"), zlib=False) lat[:] = self.remote.variables[self.yname][:, :] if self.remote.variables[self.xname].ndim == 1: lon = self.local.createVariable('lon', 'f', ("x"), zlib=False) lon[:] = self.remote.variables[self.xname][:] lat = self.local.createVariable('lat', 'f', ("y"), zlib=False) lat[:] = self.remote.variables[self.yname][:] # Create local z variable if self.zname is not None: if self.remote.variables[self.zname].ndim == 4: z = self.local.createVariable( 'z', 'f', ("time", "level", "y", "x"), zlib=False) remotez = self.remote.variables[self.zname] localvars.append(z) remotevars.append(remotez) elif self.remote.variables[self.zname].ndim == 3: z = self.local.createVariable( 'z', 'f', ("level", "y", "x"), zlib=False) z[:] = self.remote.variables[ self.zname][:, :, :] elif self.remote.variables[self.zname].ndim == 1: z = self.local.createVariable('z', 'f', ("level", ), zlib=False) z[:] = self.remote.variables[self.zname][:] # Create local time variable time = self.local.createVariable('time', 'f8', ("time", ), zlib=False) if self.tname is not None: time[:] = self.remote.variables[self.tname][ self.inds] if self.point_get.value[0] + self.time_size > np.max( self.inds): current_inds = np.arange(self.point_get.value[0], np.max(self.inds) + 1) else: current_inds = np.arange( self.point_get.value[0], self.point_get.value[0] + self.time_size) # Get data from remote dataset and add # to local cache. # Try 20 times on the first attempt current_attempt = 1 max_attempts = 20 while True: try: assert current_attempt <= max_attempts self.get_remote_data(localvars, remotevars, current_inds, shape) except AssertionError: raise except: logger.warn( "CachingDataController failed to get remote data. Trying again in 20 seconds. %s attempts left." 
% str(max_attempts - current_attempt)) logger.exception("Data Access Error") timer.sleep(20) current_attempt += 1 else: break c += 1 except (Exception, AssertionError): logger.error( "CachingDataController failed to get data (first request)" ) raise finally: self.local.sync() self.local.close() self.has_write_lock.value = -1 self.write_lock.release() self.get_data.value = False self.read_lock.release() logger.debug( "Done updating cache file, closing file, and releasing locks" ) else: logger.debug("Updating cache file") try: # Open local cache dataset for appending self.local = netCDF4.Dataset(self.cache_path, 'a') # Create local and remote variable objects # for the variables of interest u = self.local.variables['u'] v = self.local.variables['v'] time = self.local.variables['time'] remoteu = self.remote.variables[self.uname] remotev = self.remote.variables[self.vname] # Create lists of variable objects for # the data updater localvars = [ u, v, ] remotevars = [ remoteu, remotev, ] if self.salt_name is not None and self.temp_name is not None: salt = self.local.variables['salt'] temp = self.local.variables['temp'] remotesalt = self.remote.variables[self.salt_name] remotetemp = self.remote.variables[self.temp_name] localvars.append(salt) localvars.append(temp) remotevars.append(remotesalt) remotevars.append(remotetemp) if self.wname is not None: w = self.local.variables['w'] remotew = self.remote.variables[self.wname] localvars.append(w) remotevars.append(remotew) if self.zname is not None: remotez = self.remote.variables[self.zname] if remotez.ndim == 4: z = self.local.variables['z'] localvars.append(z) remotevars.append(remotez) if self.tname is not None: # remotetime = self.remote.variables[self.tname] time[self.inds] = self.remote.variables[self.inds] if self.point_get.value[0] + self.time_size > np.max( self.inds): current_inds = np.arange(self.point_get.value[0], np.max(self.inds) + 1) else: current_inds = np.arange( self.point_get.value[0], self.point_get.value[0] + self.time_size) # Get data from remote dataset and add # to local cache while True: try: self.get_remote_data(localvars, remotevars, current_inds, shape) except: logger.warn( "CachingDataController failed to get remote data. Trying again in 30 seconds" ) timer.sleep(30) else: break c += 1 except Exception: logger.error( "CachingDataController failed to get data (not first request)" ) raise finally: self.local.sync() self.local.close() self.has_write_lock.value = -1 self.write_lock.release() self.get_data.value = False self.read_lock.release() logger.debug( "Done updating cache file, closing file, and releasing locks" ) else: logger.debug( "Particles are still running, waiting for them to request data..." ) timer.sleep(2) self.dataset.closenc() return "CachingDataController"
def run(self, hydrodataset, **kwargs): # Add ModelController description to logfile logger.info(self) # Add the model descriptions to logfile for m in self._models: logger.info(m) # Calculate the model timesteps # We need times = len(self._nstep) + 1 since data is stored one timestep # after a particle is forced with the final timestep's data. times = range(0, (self._step * self._nstep) + 1, self._step) # Calculate a datetime object for each model timestep # This method is duplicated in DataController and ForceParticle # using the 'times' variables above. Will be useful in those other # locations for particles released at different times # i.e. released over a few days modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps( times, start=self.start) time_chunk = self._time_chunk horiz_chunk = self._horiz_chunk low_memory = kwargs.get("low_memory", False) # Should we remove the cache file at the end of the run? remove_cache = kwargs.get("remove_cache", True) self.bathy_path = kwargs.get("bathy", None) self.cache_path = kwargs.get("cache", None) if self.cache_path is None: # Generate temp filename for dataset cache default_cache_dir = os.path.join(os.path.dirname(__file__), "_cache") temp_name = AsaRandom.filename(prefix=str( datetime.now().microsecond), suffix=".nc") self.cache_path = os.path.join(default_cache_dir, temp_name) logger.progress((1, "Setting up particle start locations")) point_locations = [] if isinstance(self.geometry, Point): point_locations = [self.reference_location] * self._npart elif isinstance(self.geometry, Polygon) or isinstance( self.geometry, MultiPolygon): point_locations = [ Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points( goal=self._npart, polygon=self.geometry) ] # Initialize the particles logger.progress((2, "Initializing particles")) for x in xrange(0, self._npart): p = LarvaParticle(id=x) p.location = point_locations[x] # We don't need to fill the location gaps here for environment variables # because the first data collected actually relates to this original # position. # We do need to fill in fields such as settled, halted, etc. p.fill_status_gap() # Set the inital note p.note = p.outputstring() p.notes.append(p.note) self.particles.append(p) # This is where it makes sense to implement the multiprocessing # looping for particles and models. Can handle each particle in # parallel probably. 
# # Get the number of cores (may take some tuning) and create that # many workers then pass particles into the queue for the workers mgr = multiprocessing.Manager() nproc = multiprocessing.cpu_count() - 1 if nproc <= 0: raise ValueError( "Model does not run using less than two CPU cores") # Each particle is a task, plus the DataController number_of_tasks = len(self.particles) + 1 # We need a process for each particle and one for the data controller nproc = min(number_of_tasks, nproc) # When a particle requests data data_request_lock = mgr.Lock() # PID of process with lock has_data_request_lock = mgr.Value('int', -1) nproc_lock = mgr.Lock() # Create the task queue for all of the particles and the DataController tasks = multiprocessing.JoinableQueue(number_of_tasks) # Create the result queue for all of the particles and the DataController results = mgr.Queue(number_of_tasks) # Create the shared state objects get_data = mgr.Value('bool', True) # Number of tasks n_run = mgr.Value('int', number_of_tasks) updating = mgr.Value('bool', False) # When something is reading from cache file read_lock = mgr.Lock() # list of PIDs that are reading has_read_lock = mgr.list() read_count = mgr.Value('int', 0) # When something is writing to the cache file write_lock = mgr.Lock() # PID of process with lock has_write_lock = mgr.Value('int', -1) point_get = mgr.Value('list', [0, 0, 0]) active = mgr.Value('bool', True) logger.progress((3, "Initializing and caching hydro model's grid")) try: ds = CommonDataset.open(hydrodataset) # Query the dataset for common variable names # and the time variable. logger.debug("Retrieving variable information from dataset") common_variables = self.get_common_variables_from_dataset(ds) logger.debug("Pickling time variable to disk for particles") timevar = ds.gettimevar(common_variables.get("u")) f, timevar_pickle_path = tempfile.mkstemp() os.close(f) f = open(timevar_pickle_path, "wb") pickle.dump(timevar, f) f.close() ds.closenc() except: logger.warn("Failed to access remote dataset %s" % hydrodataset) raise DataControllerError("Inaccessible DAP endpoint: %s" % hydrodataset) # Add data controller to the queue first so that it # can get the initial data and is not blocked logger.debug('Starting DataController') logger.progress((4, "Starting processes")) data_controller = parallel.DataController(hydrodataset, common_variables, n_run, get_data, write_lock, has_write_lock, read_lock, read_count, time_chunk, horiz_chunk, times, self.start, point_get, self.reference_location, low_memory=low_memory, cache=self.cache_path) tasks.put(data_controller) # Create DataController worker data_controller_process = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name="DataController") data_controller_process.start() logger.debug('Adding %i particles as tasks' % len(self.particles)) for part in self.particles: forcing = parallel.ForceParticle( part, hydrodataset, common_variables, timevar_pickle_path, times, self.start, self._models, self.reference_location.point, self._use_bathymetry, self._use_shoreline, self._use_seasurface, get_data, n_run, read_lock, has_read_lock, read_count, point_get, data_request_lock, has_data_request_lock, reverse_distance=self.reverse_distance, bathy=self.bathy_path, shoreline_path=self.shoreline_path, cache=self.cache_path, time_method=self.time_method) tasks.put(forcing) # Create workers for the particles. 
    procs = [parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data,
                               name="ForceParticle-%d" % i)
             for i in xrange(nproc - 1)]
    for w in procs:
        w.start()
        logger.debug('Started %s' % w.name)

    # Get results back from queue, test for failed particles
    return_particles = []
    retrieved = 0.
    error_code = 0

    logger.info("Waiting for %i particle results" % len(self.particles))
    logger.progress((5, "Running model"))
    while retrieved < number_of_tasks:
        try:
            # Returns a tuple of code, result
            code, tempres = results.get(timeout=240)
        except Queue.Empty:
            # Poll the active processes to make sure they are all alive and then continue with loop
            if not data_controller_process.is_alive() and data_controller_process.exitcode != 0:
                # Data controller is zombied, kill off other processes.
                get_data.value = False
                results.put((-2, "DataController"))

            new_procs = []
            old_procs = []
            for p in procs:
                if not p.is_alive() and p.exitcode != 0:
                    # Do what the Consumer would do if something finished.
                    # Add something to results queue
                    results.put((-3, "ZombieParticle"))

                    # Decrement nproc (DataController exits when this is 0)
                    with nproc_lock:
                        n_run.value = n_run.value - 1

                    # Remove task from queue (so they can be joined later on)
                    tasks.task_done()

                    # Start a new Consumer.  It will exit if there are no tasks available.
                    np = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data,
                                           name=p.name)
                    new_procs.append(np)
                    old_procs.append(p)

                    # Release any locks the PID had
                    if p.pid in has_read_lock:
                        with read_lock:
                            read_count.value -= 1
                            has_read_lock.remove(p.pid)

                    if has_data_request_lock.value == p.pid:
                        has_data_request_lock.value = -1
                        try:
                            data_request_lock.release()
                        except:
                            pass

                    if has_write_lock.value == p.pid:
                        has_write_lock.value = -1
                        try:
                            write_lock.release()
                        except:
                            pass

            for p in old_procs:
                try:
                    procs.remove(p)
                except ValueError:
                    logger.warn("Did not find %s in the list of processes.  Continuing on." % p.name)

            for p in new_procs:
                procs.append(p)
                logger.warn("Started a new consumer (%s) to replace a zombie consumer" % p.name)
                p.start()

        else:
            # We got one.
            retrieved += 1
            if code is None:
                logger.warn("Got an unrecognized response from a task.")
            elif code == -1:
                logger.warn("Particle %s has FAILED!!" % tempres.uid)
            elif code == -2:
                error_code = code
                logger.warn("DataController has FAILED!!  Removing cache file so the particles fail.")
                try:
                    os.remove(self.cache_path)
                except OSError:
                    logger.debug("Could not remove cache file, it probably never existed")
                    pass
            elif code == -3:
                error_code = code
                logger.info("A zombie process was caught and task was removed from queue")
            elif isinstance(tempres, Particle):
                logger.info("Particle %d finished" % tempres.uid)
                return_particles.append(tempres)
                # We multiply by 90 here to save some progress for the exporting
                logger.progress((round((retrieved / number_of_tasks) * 90., 1),
                                 "Particle %d finished" % tempres.uid))
            elif tempres == "DataController":
                logger.info("DataController finished")
                logger.progress((round((retrieved / number_of_tasks) * 90., 1),
                                 "DataController finished"))
            else:
                logger.info("Got a strange result on results queue")
                logger.info(str(tempres))

            logger.info("Retrieved %i/%i results" % (int(retrieved), number_of_tasks))

    if len(return_particles) != len(self.particles):
        logger.warn("Some particles failed and are not included in the output")

    # The results queue should be empty at this point
    assert results.empty() is True

    # Should be good to join on the tasks now that the queue is empty
    logger.info("Joining the task queue")
    tasks.join()

    # Join all processes
    logger.info("Joining the processes")
    for w in procs + [data_controller_process]:
        # Wait 10 seconds
        w.join(10.)
        if w.is_alive():
            # Process is hanging, kill it.
            logger.info("Terminating %s forcefully.  This should have exited itself." % w.name)
            w.terminate()

    logger.info('Workers complete')

    self.particles = return_particles

    # Remove Manager so it shuts down
    del mgr

    # Remove pickled timevar
    os.remove(timevar_pickle_path)

    # Remove the cache file
    if remove_cache is True:
        try:
            os.remove(self.cache_path)
        except OSError:
            logger.debug("Could not remove cache file, it probably never existed")

    logger.progress((96, "Exporting results"))

    if len(self.particles) > 0:
        # If output_formats and path specified,
        # output particle run data to disk when completed
        if "output_formats" in kwargs:
            # Make sure output_path is also included
            if kwargs.get("output_path", None) is not None:
                formats = kwargs.get("output_formats")
                output_path = kwargs.get("output_path")
                if isinstance(formats, list):
                    for format in formats:
                        logger.info("Exporting to: %s" % format)
                        try:
                            self.export(output_path, format=format)
                        except:
                            logger.error("Failed to export to: %s" % format)
                else:
                    logger.warn('The output_formats parameter should be a list, not saving any output!')
            else:
                logger.warn('No output path defined, not saving any output!')
        else:
            logger.warn('No output format defined, not saving any output!')
    else:
        logger.warn("Model didn't actually do anything, check the log.")
        if error_code == -2:
            raise DataControllerError("Error in the DataController")
        else:
            raise ModelError("Error in the model")

    logger.progress((99, "Model Run Complete"))
    return
def __call__(self, proc, active):
    self.active = active

    if self.usebathy == True:
        self._bathymetry = Bathymetry(file=self.bathy)

    self._shoreline = None
    if self.useshore == True:
        self._shoreline = Shoreline(file=self.shoreline_path,
                                    point=self.release_location_centroid,
                                    spatialbuffer=0.25)
        # Make sure we are not starting on land.  Raises exception if we are.
        self._shoreline.intersect(start_point=self.release_location_centroid,
                                  end_point=self.release_location_centroid)

    self.proc = proc
    part = self.part

    if self.active.value == True:
        while self.get_data.value == True:
            logger.debug("Waiting for DataController to start...")
            timer.sleep(10)
            pass

    # Initialize commondataset of local cache, then
    # close the related netcdf file
    try:
        with self.read_lock:
            self.read_count.value += 1
            self.has_read_lock.append(os.getpid())
        self.dataset = CommonDataset.open(self.localpath)
        self.dataset.closenc()
    except StandardError:
        logger.warn("No cache file: %s.  Particle exiting" % self.localpath)
        raise
    finally:
        with self.read_lock:
            self.read_count.value -= 1
            self.has_read_lock.remove(os.getpid())

    # Calculate datetime at every timestep
    modelTimestep, newtimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start_time)

    # Load Timevar from pickle serialization
    f = open(self.timevar_pickle_path, "rb")
    timevar = pickle.load(f)
    f.close()

    if self.time_method == 'interp':
        time_indexs = timevar.nearest_index(newtimes, select='before')
    elif self.time_method == 'nearest':
        time_indexs = timevar.nearest_index(newtimes)
    else:
        logger.warn("Method for computing u,v,w,temp,salt not supported!")

    try:
        assert len(newtimes) == len(time_indexs)
    except AssertionError:
        logger.error("Time indexes are messed up. Need to have equal datetime and time indexes")
        raise

    # loop over timesteps
    # We don't loop over the last time_index because
    # we need to query in the time_index and set the particle's
    # location as the 'newtime' object.
    for loop_i, i in enumerate(time_indexs[0:-1]):

        if self.active.value == False:
            raise ValueError("Particle exiting due to Failure.")

        newloc = None

        # if need a time that is outside of what we have
        # if self.active.value == True:
        #     while self.get_data.value == True:
        #         logger.info("Waiting for DataController to get out...")
        #         timer.sleep(4)
        #         pass

        # Get the variable data required by the models
        if self.time_method == 'nearest':
            u, v, w, temp, salt = self.data_nearest(i, newtimes[loop_i])
        elif self.time_method == 'interp':
            u, v, w, temp, salt = self.data_interp(i, timevar, newtimes[loop_i])
        else:
            logger.warn("Method for computing u,v,w,temp,salt not supported!")

        # logger.info("U: %.4f, V: %.4f, W: %.4f" % (u, v, w))
        # logger.info("Temp: %.4f, Salt: %.4f" % (temp, salt))

        # Get the bathy value at the particles location
        if self.usebathy == True:
            bathymetry_value = self._bathymetry.get_depth(part.location)
        else:
            bathymetry_value = -999999999999999

        # Age the particle by the modelTimestep (seconds)
        # 'Age' meaning the amount of time it has been forced.
        part.age(seconds=modelTimestep[loop_i])

        # loop over models - sort these in the order you want them to run
        for model in self.models:
            movement = model.move(part, u, v, w, modelTimestep[loop_i],
                                  temperature=temp, salinity=salt,
                                  bathymetry_value=bathymetry_value)
            newloc = Location4D(latitude=movement['latitude'],
                                longitude=movement['longitude'],
                                depth=movement['depth'],
                                time=newtimes[loop_i + 1])
            logger.debug("%s - moved %.3f meters (horizontally) and %.3f meters (vertically) by %s with data from %s"
                         % (part.logstring(), movement['distance'], movement['vertical_distance'],
                            model.__class__.__name__, newtimes[loop_i].isoformat()))
            if newloc:
                self.boundary_interaction(particle=part, starting=part.location, ending=newloc,
                                          distance=movement['distance'], angle=movement['angle'],
                                          azimuth=movement['azimuth'], reverse_azimuth=movement['reverse_azimuth'],
                                          vertical_distance=movement['vertical_distance'],
                                          vertical_angle=movement['vertical_angle'])
            logger.debug("%s - was forced by %s and is now at %s"
                         % (part.logstring(), model.__class__.__name__, part.location.logstring()))

        part.note = part.outputstring()
        # Each timestep, save the particle's status and environmental variables.
        # This keeps fields such as temp, salt, halted, settled, and dead matched
        # up with the number of timesteps.
        part.save()

    # We won't pull data for the last entry in locations, but we need to populate it with fill data.
    part.fill_environment_gap()

    if self.usebathy == True:
        self._bathymetry.close()

    if self.useshore == True:
        self._shoreline.close()

    return part
def data_nearest(self, i, currenttime):
    """
    Method to streamline request for data from cache,
    uses nearest time to get u,v,w,temp,salt
    """
    if self.active.value == True:
        while self.get_data.value == True:
            logger.debug("Waiting for DataController to release cache file so I can read from it...")
            timer.sleep(4)
            pass

    if self.need_data(i):
        # Acquire lock for asking for data
        self.data_request_lock.acquire()
        self.has_data_request_lock.value = os.getpid()
        try:
            if self.need_data(i):
                with self.read_lock:
                    self.read_count.value += 1
                    self.has_read_lock.append(os.getpid())

                # Open netcdf file on disk from commondataset
                self.dataset.opennc()
                # Get the indices for the current particle location
                indices = self.dataset.get_indices('u', timeinds=[np.asarray([i - 1])], point=self.part.location)
                self.dataset.closenc()

                with self.read_lock:
                    self.read_count.value -= 1
                    self.has_read_lock.remove(os.getpid())

                # Override the time
                self.point_get.value = [indices[0] + 1, indices[-2], indices[-1]]

                # Request that the data controller update the cache
                # DATA CONTROLLER STARTS
                self.get_data.value = True

                # Wait until the data controller is done
                if self.active.value == True:
                    while self.get_data.value == True:
                        logger.debug("Waiting for DataController to update cache...")
                        timer.sleep(4)
                        pass
        except StandardError:
            raise
        finally:
            self.has_data_request_lock.value = -1
            self.data_request_lock.release()

    # Tell the DataController that we are going to be reading from the file
    with self.read_lock:
        self.read_count.value += 1
        self.has_read_lock.append(os.getpid())

    try:
        # Open netcdf file on disk from commondataset
        self.dataset.opennc()

        # Grab data at time index closest to particle location
        u = np.mean(np.mean(self.dataset.get_values('u', timeinds=[np.asarray([i])], point=self.part.location)))
        v = np.mean(np.mean(self.dataset.get_values('v', timeinds=[np.asarray([i])], point=self.part.location)))
        # If there is vertical velocity in the dataset, get it
        if 'w' in self.dataset.nc.variables:
            w = np.mean(np.mean(self.dataset.get_values('w', timeinds=[np.asarray([i])], point=self.part.location)))
        else:
            w = 0.0
        # If there is salt and temp in the dataset, get it
        if self.temp_name != None and self.salt_name != None:
            temp = np.mean(np.mean(self.dataset.get_values('temp', timeinds=[np.asarray([i])], point=self.part.location)))
            salt = np.mean(np.mean(self.dataset.get_values('salt', timeinds=[np.asarray([i])], point=self.part.location)))

        # Check for nans that occur in the ocean (happens because
        # of model and coastline resolution mismatches)
        if np.isnan(u).any() or np.isnan(v).any() or np.isnan(w).any():
            # Take the mean of the closest 4 points
            # If this includes nan which it will, result is nan
            uarray1 = self.dataset.get_values('u', timeinds=[np.asarray([i])], point=self.part.location, num=2)
            varray1 = self.dataset.get_values('v', timeinds=[np.asarray([i])], point=self.part.location, num=2)
            if 'w' in self.dataset.nc.variables:
                warray1 = self.dataset.get_values('w', timeinds=[np.asarray([i])], point=self.part.location, num=2)
                w = warray1.mean()
            else:
                w = 0.0
            if self.temp_name != None and self.salt_name != None:
                temparray1 = self.dataset.get_values('temp', timeinds=[np.asarray([i])], point=self.part.location, num=2)
                saltarray1 = self.dataset.get_values('salt', timeinds=[np.asarray([i])], point=self.part.location, num=2)
                temp = temparray1.mean()
                salt = saltarray1.mean()
            u = uarray1.mean()
            v = varray1.mean()

        if self.temp_name is None:
            temp = np.nan
        if self.salt_name is None:
            salt = np.nan

        # logger.info(self.dataset.get_xyind_from_point('u', self.part.location, num=1))
    except StandardError:
        logger.error("Error in data_nearest on ForceParticle")
        raise
    finally:
        self.dataset.closenc()
        with self.read_lock:
            self.read_count.value -= 1
            self.has_read_lock.remove(os.getpid())

    return u, v, w, temp, salt
def data_interp(self, i, timevar, currenttime): """ Method to streamline request for data from cache, Uses linear interpolation bewtween timesteps to get u,v,w,temp,salt """ if self.active.value == True: while self.get_data.value == True: logger.debug("Waiting for DataController to release cache file so I can read from it...") timer.sleep(4) pass if self.need_data(i+1): # Acquire lock for asking for data self.data_request_lock.acquire() self.has_data_request_lock.value = os.getpid() try: # Do I still need data? if self.need_data(i+1): # Tell the DataController that we are going to be reading from the file with self.read_lock: self.read_count.value += 1 self.has_read_lock.append(os.getpid()) # Open netcdf file on disk from commondataset self.dataset.opennc() # Get the indices for the current particle location indices = self.dataset.get_indices('u', timeinds=[np.asarray([i-1])], point=self.part.location ) self.dataset.closenc() with self.read_lock: self.read_count.value -= 1 self.has_read_lock.remove(os.getpid()) # Override the time # get the current time index data self.point_get.value = [indices[0] + 1, indices[-2], indices[-1]] # Request that the data controller update the cache self.get_data.value = True # Wait until the data controller is done if self.active.value == True: while self.get_data.value == True: logger.debug("Waiting for DataController to update cache with the CURRENT time index") timer.sleep(4) pass # get the next time index data self.point_get.value = [indices[0] + 2, indices[-2], indices[-1]] # Request that the data controller update the cache self.get_data.value = True # Wait until the data controller is done if self.active.value == True: while self.get_data.value == True: logger.debug("Waiting for DataController to update cache with the NEXT time index") timer.sleep(4) pass except StandardError: logger.warn("Particle failed to request data correctly") raise finally: # Release lock for asking for data self.has_data_request_lock.value = -1 self.data_request_lock.release() # Tell the DataController that we are going to be reading from the file with self.read_lock: self.read_count.value += 1 self.has_read_lock.append(os.getpid()) try: # Open netcdf file on disk from commondataset self.dataset.opennc() # Grab data at time index closest to particle location u = [np.mean(np.mean(self.dataset.get_values('u', timeinds=[np.asarray([i])], point=self.part.location ))), np.mean(np.mean(self.dataset.get_values('u', timeinds=[np.asarray([i+1])], point=self.part.location )))] v = [np.mean(np.mean(self.dataset.get_values('v', timeinds=[np.asarray([i])], point=self.part.location ))), np.mean(np.mean(self.dataset.get_values('v', timeinds=[np.asarray([i+1])], point=self.part.location )))] # if there is vertical velocity inthe dataset, get it if 'w' in self.dataset.nc.variables: w = [np.mean(np.mean(self.dataset.get_values('w', timeinds=[np.asarray([i])], point=self.part.location ))), np.mean(np.mean(self.dataset.get_values('w', timeinds=[np.asarray([i+1])], point=self.part.location )))] else: w = [0.0, 0.0] # If there is salt and temp in the dataset, get it if self.temp_name != None and self.salt_name != None: temp = [np.mean(np.mean(self.dataset.get_values('temp', timeinds=[np.asarray([i])], point=self.part.location ))), np.mean(np.mean(self.dataset.get_values('temp', timeinds=[np.asarray([i+1])], point=self.part.location )))] salt = [np.mean(np.mean(self.dataset.get_values('salt', timeinds=[np.asarray([i])], point=self.part.location ))), np.mean(np.mean(self.dataset.get_values('salt', 
timeinds=[np.asarray([i+1])], point=self.part.location )))] # Check for nans that occur in the ocean (happens because # of model and coastline resolution mismatches) if np.isnan(u).any() or np.isnan(v).any() or np.isnan(w).any(): # Take the mean of the closest 4 points # If this includes nan which it will, result is nan uarray1 = self.dataset.get_values('u', timeinds=[np.asarray([i])], point=self.part.location, num=2) varray1 = self.dataset.get_values('v', timeinds=[np.asarray([i])], point=self.part.location, num=2) uarray2 = self.dataset.get_values('u', timeinds=[np.asarray([i+1])], point=self.part.location, num=2) varray2 = self.dataset.get_values('v', timeinds=[np.asarray([i+1])], point=self.part.location, num=2) if 'w' in self.dataset.nc.variables: warray1 = self.dataset.get_values('w', timeinds=[np.asarray([i])], point=self.part.location, num=2) warray2 = self.dataset.get_values('w', timeinds=[np.asarray([i+1])], point=self.part.location, num=2) w = [warray1.mean(), warray2.mean()] else: w = [0.0, 0.0] if self.temp_name != None and self.salt_name != None: temparray1 = self.dataset.get_values('temp', timeinds=[np.asarray([i])], point=self.part.location, num=2) saltarray1 = self.dataset.get_values('salt', timeinds=[np.asarray([i])], point=self.part.location, num=2) temparray2 = self.dataset.get_values('temp', timeinds=[np.asarray([i+1])], point=self.part.location, num=2) saltarray2 = self.dataset.get_values('salt', timeinds=[np.asarray([i+1])], point=self.part.location, num=2) temp = [temparray1.mean(), temparray2.mean()] salt = [saltarray1.mean(), saltarray2.mean()] u = [uarray1.mean(), uarray2.mean()] v = [varray1.mean(), varray2.mean()] # Linear interp of data between timesteps currenttime = date2num(currenttime) timevar = timevar.datenum u = self.linterp(timevar[i:i+2], u, currenttime) v = self.linterp(timevar[i:i+2], v, currenttime) w = self.linterp(timevar[i:i+2], w, currenttime) if self.temp_name != None and self.salt_name != None: temp = self.linterp(timevar[i:i+2], temp, currenttime) salt = self.linterp(timevar[i:i+2], salt, currenttime) if self.temp_name is None: temp = np.nan if self.salt_name is None: salt = np.nan #logger.info(self.dataset.get_xyind_from_point('u', self.part.location, num=1)) except StandardError: logger.error("Error in data_interp method on ForceParticle") raise finally: self.dataset.closenc() with self.read_lock: self.read_count.value -= 1 self.has_read_lock.remove(os.getpid()) return u, v, w, temp, salt
def __call__(self, proc, active): c = 0 self.dataset = CommonDataset.open(self.url) self.proc = proc self.remote = self.dataset.nc cachepath = self.cache_path # Calculate the datetimes of the model timesteps like # the particle objects do, so we can figure out unique # time indices modelTimestep, newtimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start_time) timevar = self.dataset.gettimevar(self.uname) # Don't need to grab the last datetime, as it is not needed for forcing, only # for setting the time of the final particle forcing time_indexs = timevar.nearest_index(newtimes[0:-1], select='before') # Have to make sure that we get the plus 1 for the # linear interpolation of u,v,w,temp,salt self.inds = np.unique(time_indexs) self.inds = np.append(self.inds, self.inds.max()+1) # While there is at least 1 particle still running, # stay alive, if not break while self.n_run.value > 1: logger.debug("Particles are still running, waiting for them to request data...") timer.sleep(2) # If particle asks for data, do the following if self.get_data.value == True: logger.debug("Particle asked for data!") # Wait for particles to get out while True: self.read_lock.acquire() logger.debug("Read count: %d" % self.read_count.value) if self.read_count.value > 0: logger.debug("Waiting for write lock on cache file (particles must stop reading)...") self.read_lock.release() timer.sleep(4) else: break # Get write lock on the file. Already have read lock. self.write_lock.acquire() self.has_write_lock.value = os.getpid() if c == 0: logger.debug("Creating cache file") try: # Open local cache for writing, overwrites # existing file with same name self.local = netCDF4.Dataset(cachepath, 'w') indices = self.dataset.get_indices(self.uname, timeinds=[np.asarray([0])], point=self.start) self.point_get.value = [self.inds[0], indices[-2], indices[-1]] # Create dimensions for u and v variables self.local.createDimension('time', None) self.local.createDimension('level', None) self.local.createDimension('x', None) self.local.createDimension('y', None) # Create 3d or 4d u and v variables if self.remote.variables[self.uname].ndim == 4: self.ndim = 4 dimensions = ('time', 'level', 'y', 'x') coordinates = "time z lon lat" elif self.remote.variables[self.uname].ndim == 3: self.ndim = 3 dimensions = ('time', 'y', 'x') coordinates = "time lon lat" shape = self.remote.variables[self.uname].shape # If there is no FillValue defined in the dataset, use np.nan. # Sometimes it will work out correctly and other times we will # have a huge cache file. 
try: fill = self.remote.variables[self.uname].missing_value except Exception: fill = np.nan # Create domain variable that specifies # where there is data geographically/by time # and where there is not data, # Used for testing if particle needs to # ask cache to update domain = self.local.createVariable('domain', 'i', dimensions, zlib=False, fill_value=0) domain.coordinates = coordinates # Create local u and v variables u = self.local.createVariable('u', 'f', dimensions, zlib=False, fill_value=fill) v = self.local.createVariable('v', 'f', dimensions, zlib=False, fill_value=fill) v.coordinates = coordinates u.coordinates = coordinates localvars = [u, v,] remotevars = [self.remote.variables[self.uname], self.remote.variables[self.vname]] # Create local w variable if self.wname != None: w = self.local.createVariable('w', 'f', dimensions, zlib=False, fill_value=fill) w.coordinates = coordinates localvars.append(w) remotevars.append(self.remote.variables[self.wname]) if self.temp_name != None and self.salt_name != None: # Create local temp and salt vars temp = self.local.createVariable('temp', 'f', dimensions, zlib=False, fill_value=fill) salt = self.local.createVariable('salt', 'f', dimensions, zlib=False, fill_value=fill) temp.coordinates = coordinates salt.coordinates = coordinates localvars.append(temp) localvars.append(salt) remotevars.append(self.remote.variables[self.temp_name]) remotevars.append(self.remote.variables[self.salt_name]) # Create local lat/lon coordinate variables if self.remote.variables[self.xname].ndim == 2: lon = self.local.createVariable('lon', 'f', ("y", "x"), zlib=False) lon[:] = self.remote.variables[self.xname][:, :] lat = self.local.createVariable('lat', 'f', ("y", "x"), zlib=False) lat[:] = self.remote.variables[self.yname][:, :] if self.remote.variables[self.xname].ndim == 1: lon = self.local.createVariable('lon', 'f', ("x"), zlib=False) lon[:] = self.remote.variables[self.xname][:] lat = self.local.createVariable('lat', 'f', ("y"), zlib=False) lat[:] = self.remote.variables[self.yname][:] # Create local z variable if self.zname != None: if self.remote.variables[self.zname].ndim == 4: z = self.local.createVariable('z', 'f', ("time","level","y","x"), zlib=False) remotez = self.remote.variables[self.zname] localvars.append(z) remotevars.append(remotez) elif self.remote.variables[self.zname].ndim == 3: z = self.local.createVariable('z', 'f', ("level","y","x"), zlib=False) z[:] = self.remote.variables[self.zname][:, :, :] elif self.remote.variables[self.zname].ndim ==1: z = self.local.createVariable('z', 'f', ("level",), zlib=False) z[:] = self.remote.variables[self.zname][:] # Create local time variable time = self.local.createVariable('time', 'f8', ("time",), zlib=False) if self.tname != None: time[:] = self.remote.variables[self.tname][self.inds] if self.point_get.value[0]+self.time_size > np.max(self.inds): current_inds = np.arange(self.point_get.value[0], np.max(self.inds)+1) else: current_inds = np.arange(self.point_get.value[0],self.point_get.value[0] + self.time_size) # Get data from remote dataset and add # to local cache while True: try: self.get_remote_data(localvars, remotevars, current_inds, shape) except: logger.warn("DataController failed to get remote data. 
Trying again in 30 seconds") timer.sleep(30) else: break c += 1 except StandardError: logger.error("DataController failed to get data (first request)") raise finally: self.local.sync() self.local.close() self.has_write_lock.value = -1 self.write_lock.release() self.get_data.value = False self.read_lock.release() logger.debug("Done updating cache file, closing file, and releasing locks") else: logger.debug("Updating cache file") try: # Open local cache dataset for appending self.local = netCDF4.Dataset(cachepath, 'a') # Create local and remote variable objects # for the variables of interest u = self.local.variables['u'] v = self.local.variables['v'] time = self.local.variables['time'] remoteu = self.remote.variables[self.uname] remotev = self.remote.variables[self.vname] # Create lists of variable objects for # the data updater localvars = [u, v, ] remotevars = [remoteu, remotev, ] if self.salt_name != None and self.temp_name != None: salt = self.local.variables['salt'] temp = self.local.variables['temp'] remotesalt = self.remote.variables[self.salt_name] remotetemp = self.remote.variables[self.temp_name] localvars.append(salt) localvars.append(temp) remotevars.append(remotesalt) remotevars.append(remotetemp) if self.wname != None: w = self.local.variables['w'] remotew = self.remote.variables[self.wname] localvars.append(w) remotevars.append(remotew) if self.zname != None: remotez = self.remote.variables[self.zname] if remotez.ndim == 4: z = self.local.variables['z'] localvars.append(z) remotevars.append(remotez) if self.tname != None: remotetime = self.remote.variables[self.tname] time[self.inds] = self.remote.variables[self.inds] if self.point_get.value[0]+self.time_size > np.max(self.inds): current_inds = np.arange(self.point_get.value[0], np.max(self.inds)+1) else: current_inds = np.arange(self.point_get.value[0],self.point_get.value[0] + self.time_size) # Get data from remote dataset and add # to local cache while True: try: self.get_remote_data(localvars, remotevars, current_inds, shape) except: logger.warn("DataController failed to get remote data. Trying again in 30 seconds") timer.sleep(30) else: break c += 1 except StandardError: logger.error("DataController failed to get data (not first request)") raise finally: self.local.sync() self.local.close() self.has_write_lock.value = -1 self.write_lock.release() self.get_data.value = False self.read_lock.release() logger.debug("Done updating cache file, closing file, and releasing locks") else: pass self.dataset.closenc() return "DataController"
def run(self, hydrodataset, **kwargs): # Add ModelController description to logfile logger.info(self) # Add the model descriptions to logfile for m in self._models: logger.info(m) # Calculate the model timesteps # We need times = len(self._nstep) + 1 since data is stored one timestep # after a particle is forced with the final timestep's data. times = range(0,(self._step*self._nstep)+1,self._step) # Calculate a datetime object for each model timestep # This method is duplicated in DataController and ForceParticle # using the 'times' variables above. Will be useful in those other # locations for particles released at different times # i.e. released over a few days modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(times, start=self.start) time_chunk = self._time_chunk horiz_chunk = self._horiz_chunk low_memory = kwargs.get("low_memory", False) # Should we remove the cache file at the end of the run? remove_cache = kwargs.get("remove_cache", True) self.bathy_path = kwargs.get("bathy", None) self.cache_path = kwargs.get("cache", None) if self.cache_path is None: # Generate temp filename for dataset cache default_cache_dir = os.path.join(os.path.dirname(__file__), "_cache") temp_name = AsaRandom.filename(prefix=str(datetime.now().microsecond), suffix=".nc") self.cache_path = os.path.join(default_cache_dir, temp_name) logger.progress((1, "Setting up particle start locations")) point_locations = [] if isinstance(self.geometry, Point): point_locations = [self.reference_location] * self._npart elif isinstance(self.geometry, Polygon) or isinstance(self.geometry, MultiPolygon): point_locations = [Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points(goal=self._npart, polygon=self.geometry)] # Initialize the particles logger.progress((2, "Initializing particles")) for x in xrange(0, self._npart): p = LarvaParticle(id=x) p.location = point_locations[x] # We don't need to fill the location gaps here for environment variables # because the first data collected actually relates to this original # position. # We do need to fill in fields such as settled, halted, etc. p.fill_status_gap() # Set the inital note p.note = p.outputstring() p.notes.append(p.note) self.particles.append(p) # This is where it makes sense to implement the multiprocessing # looping for particles and models. Can handle each particle in # parallel probably. 
# # Get the number of cores (may take some tuning) and create that # many workers then pass particles into the queue for the workers mgr = multiprocessing.Manager() nproc = multiprocessing.cpu_count() - 1 if nproc <= 0: raise ValueError("Model does not run using less than two CPU cores") # Each particle is a task, plus the DataController number_of_tasks = len(self.particles) + 1 # We need a process for each particle and one for the data controller nproc = min(number_of_tasks, nproc) # When a particle requests data data_request_lock = mgr.Lock() # PID of process with lock has_data_request_lock = mgr.Value('int',-1) nproc_lock = mgr.Lock() # Create the task queue for all of the particles and the DataController tasks = multiprocessing.JoinableQueue(number_of_tasks) # Create the result queue for all of the particles and the DataController results = mgr.Queue(number_of_tasks) # Create the shared state objects get_data = mgr.Value('bool', True) # Number of tasks n_run = mgr.Value('int', number_of_tasks) updating = mgr.Value('bool', False) # When something is reading from cache file read_lock = mgr.Lock() # list of PIDs that are reading has_read_lock = mgr.list() read_count = mgr.Value('int', 0) # When something is writing to the cache file write_lock = mgr.Lock() # PID of process with lock has_write_lock = mgr.Value('int',-1) point_get = mgr.Value('list', [0, 0, 0]) active = mgr.Value('bool', True) logger.progress((3, "Initializing and caching hydro model's grid")) try: ds = CommonDataset.open(hydrodataset) # Query the dataset for common variable names # and the time variable. logger.debug("Retrieving variable information from dataset") common_variables = self.get_common_variables_from_dataset(ds) logger.debug("Pickling time variable to disk for particles") timevar = ds.gettimevar(common_variables.get("u")) f, timevar_pickle_path = tempfile.mkstemp() os.close(f) f = open(timevar_pickle_path, "wb") pickle.dump(timevar, f) f.close() ds.closenc() except: logger.warn("Failed to access remote dataset %s" % hydrodataset) raise DataControllerError("Inaccessible DAP endpoint: %s" % hydrodataset) # Add data controller to the queue first so that it # can get the initial data and is not blocked logger.debug('Starting DataController') logger.progress((4, "Starting processes")) data_controller = parallel.DataController(hydrodataset, common_variables, n_run, get_data, write_lock, has_write_lock, read_lock, read_count, time_chunk, horiz_chunk, times, self.start, point_get, self.reference_location, low_memory=low_memory, cache=self.cache_path) tasks.put(data_controller) # Create DataController worker data_controller_process = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name="DataController") data_controller_process.start() logger.debug('Adding %i particles as tasks' % len(self.particles)) for part in self.particles: forcing = parallel.ForceParticle(part, hydrodataset, common_variables, timevar_pickle_path, times, self.start, self._models, self.reference_location.point, self._use_bathymetry, self._use_shoreline, self._use_seasurface, get_data, n_run, read_lock, has_read_lock, read_count, point_get, data_request_lock, has_data_request_lock, reverse_distance=self.reverse_distance, bathy=self.bathy_path, shoreline_path=self.shoreline_path, shoreline_feature=self.shoreline_feature, cache=self.cache_path, time_method=self.time_method) tasks.put(forcing) # Create workers for the particles. 
procs = [ parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name="ForceParticle-%d"%i) for i in xrange(nproc - 1) ] for w in procs: w.start() logger.debug('Started %s' % w.name) # Get results back from queue, test for failed particles return_particles = [] retrieved = 0. error_code = 0 logger.info("Waiting for %i particle results" % len(self.particles)) logger.progress((5, "Running model")) while retrieved < number_of_tasks: try: # Returns a tuple of code, result code, tempres = results.get(timeout=240) except Queue.Empty: # Poll the active processes to make sure they are all alive and then continue with loop if not data_controller_process.is_alive() and data_controller_process.exitcode != 0: # Data controller is zombied, kill off other processes. get_data.value == False results.put((-2, "DataController")) new_procs = [] old_procs = [] for p in procs: if not p.is_alive() and p.exitcode != 0: # Do what the Consumer would do if something finished. # Add something to results queue results.put((-3, "ZombieParticle")) # Decrement nproc (DataController exits when this is 0) with nproc_lock: n_run.value = n_run.value - 1 # Remove task from queue (so they can be joined later on) tasks.task_done() # Start a new Consumer. It will exit if there are no tasks available. np = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name=p.name) new_procs.append(np) old_procs.append(p) # Release any locks the PID had if p.pid in has_read_lock: with read_lock: read_count.value -= 1 has_read_lock.remove(p.pid) if has_data_request_lock.value == p.pid: has_data_request_lock.value = -1 try: data_request_lock.release() except: pass if has_write_lock.value == p.pid: has_write_lock.value = -1 try: write_lock.release() except: pass for p in old_procs: try: procs.remove(p) except ValueError: logger.warn("Did not find %s in the list of processes. Continuing on." % p.name) for p in new_procs: procs.append(p) logger.warn("Started a new consumer (%s) to replace a zombie consumer" % p.name) p.start() else: # We got one. retrieved += 1 if code == None: logger.warn("Got an unrecognized response from a task.") elif code == -1: logger.warn("Particle %s has FAILED!!" % tempres.uid) elif code == -2: error_code = code logger.warn("DataController has FAILED!! 
Removing cache file so the particles fail.") try: os.remove(self.cache_path) except OSError: logger.debug("Could not remove cache file, it probably never existed") pass elif code == -3: error_code = code logger.info("A zombie process was caught and task was removed from queue") elif isinstance(tempres, Particle): logger.info("Particle %d finished" % tempres.uid) return_particles.append(tempres) # We mulitply by 95 here to save 5% for the exporting logger.progress((round((retrieved / number_of_tasks) * 90.,1), "Particle %d finished" % tempres.uid)) elif tempres == "DataController": logger.info("DataController finished") logger.progress((round((retrieved / number_of_tasks) * 90.,1), "DataController finished")) else: logger.info("Got a strange result on results queue") logger.info(str(tempres)) logger.info("Retrieved %i/%i results" % (int(retrieved),number_of_tasks)) if len(return_particles) != len(self.particles): logger.warn("Some particles failed and are not included in the output") # The results queue should be empty at this point assert results.empty() is True # Should be good to join on the tasks now that the queue is empty logger.info("Joining the task queue") tasks.join() # Join all processes logger.info("Joining the processes") for w in procs + [data_controller_process]: # Wait 10 seconds w.join(10.) if w.is_alive(): # Process is hanging, kill it. logger.info("Terminating %s forcefully. This should have exited itself." % w.name) w.terminate() logger.info('Workers complete') self.particles = return_particles # Remove Manager so it shuts down del mgr # Remove pickled timevar os.remove(timevar_pickle_path) # Remove the cache file if remove_cache is True: try: os.remove(self.cache_path) except OSError: logger.debug("Could not remove cache file, it probably never existed") logger.progress((96, "Exporting results")) if len(self.particles) > 0: # If output_formats and path specified, # output particle run data to disk when completed if "output_formats" in kwargs: # Make sure output_path is also included if kwargs.get("output_path", None) != None: formats = kwargs.get("output_formats") output_path = kwargs.get("output_path") if isinstance(formats, list): for format in formats: logger.info("Exporting to: %s" % format) try: self.export(output_path, format=format) except: logger.error("Failed to export to: %s" % format) else: logger.warn('The output_formats parameter should be a list, not saving any output!') else: logger.warn('No output path defined, not saving any output!') else: logger.warn('No output format defined, not saving any output!') else: logger.warn("Model didn't actually do anything, check the log.") if error_code == -2: raise DataControllerError("Error in the DataController") else: raise ModelError("Error in the model") logger.progress((99, "Model Run Complete")) return