def run_step(step, context):
    """Run a given step.

    1. Starts a process to run the next step.
    2. Creates a queue to communicate with the process.
    3. Changes the state of the Step to Step.RUN.
    """
    log_step(logging.debug, step, 'Preparing objects to run.')
    step.prompt_queue = ProcessQueue()
    step.input_queue = ProcessQueue()
    step.output_queue = ProcessQueue()
    step.result_queue = ProcessQueue()
    # Reset some attributes in case the Step is being re-run.
    # Older values can be confusing to the user, so remove them.
    step.prompt_messages = []
    step.input_messages = []
    step.return_value = None
    trail_environment = TrailEnvironment(step.prompt_queue,
                                         step.input_queue,
                                         step.output_queue)
    step.process = Process(target=step_manager,
                           args=(step, trail_environment, context))
    log_step(logging.debug, step, 'Starting subprocess to run step.')
    step.process.start()
    step.state = step.RUN
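# A minimal, self-contained sketch of the pattern run_step relies on: one
# multiprocessing.Queue per communication channel, and a child process that
# reads from an input queue and reports back on a result queue. The names
# (worker, run_task) are hypothetical and not part of the original code;
# ProcessQueue is assumed to be an alias for multiprocessing.Queue.
from multiprocessing import Process, Queue as ProcessQueue

def worker(input_queue, result_queue):
    # Consume one task from the parent and push the result back.
    task = input_queue.get()
    result_queue.put(task * 2)

def run_task(value):
    input_queue = ProcessQueue()
    result_queue = ProcessQueue()
    process = Process(target=worker, args=(input_queue, result_queue))
    process.start()
    input_queue.put(value)
    result = result_queue.get()   # blocks until the child answers
    process.join()
    return result

if __name__ == '__main__':
    print(run_task(21))           # prints 42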
def __init__(self, lane_departure_callback):
    self.lane_departure_callback = lane_departure_callback
    self.running = False
    self.out_queue = ProcessQueue()
    self.in_queue = ProcessQueue()
    self.terminate_event = ProcessEvent()
    self.lane_process = Process(target=self.LaneProcess)
    self.update_thread = threading.Thread(target=self.update_thread)
    self.running = True
    self.lane_process.start()
    self.update_thread.start()
def __init__(self, total, index):
    """
    :param total: total number of workers
    :param index: index of the current worker
    """
    self.queue = ProcessQueue(self.QUEUE_LEN)
    self.process = Process(target=self.run_forever, args=())
    self.total, self.index = total, index
def parallel_mode(settings):
    num_workers = settings["num_workers"]
    print("Initializing %d workers" % (num_workers,))
    to_ps = ProcessQueue()
    from_ps = [ProcessQueue() for _ in range(num_workers)]
    ps = Process(target=parameter_server_process,
                 args=(savedir, settings, to_ps, from_ps))
    workers = []
    for i in range(num_workers):
        workers.append(
            Process(target=worker_process, args=(settings, to_ps, from_ps[i])))
    ps.start()
    for worker in workers:
        worker.start()
def _gen_processes(self, simulator_class, configuration, generation_count):
    queues = [ProcessQueue() for _ in range(len(self._simulation_states))]
    simulators = [
        Process(target=_run_simulation,
                args=(queue, simulator_class, configuration, state,
                      generation_count, i))
        for i, (queue, state) in enumerate(zip(queues, self._simulation_states))
    ]
    return queues, simulators
def create_stream_listener(stream):
    """Runs listeners to tail STDOUT and STDERR.

    When the shell command is run with Popen, we need a way to read STDOUT
    and STDERR asynchronously and without blocking. This is achieved by
    running the stream_reader function as a subprocess. Each such instance
    is called a listener. This function creates and runs one such listener.

    Arguments:
    stream -- A stream to read from, like subprocess.PIPE. Must support the
              readline() method.

    Returns:
    A tuple of the form (listener_process, queue), where:
    listener_process -- A multiprocessing.Process object referring to the
                        listener subprocess. This is needed to terminate the
                        listener, since the listener contains no termination
                        logic.
    queue -- A multiprocessing.Queue object into which the listener writes
             messages from the stream. This conversion from a stream-like
             object to a queue-like object allows reads to be non-blocking.
    """
    queue = ProcessQueue()
    listener_process = Process(target=stream_reader, args=(stream, queue))
    listener_process.start()
    return (listener_process, queue)
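# A hypothetical usage sketch for create_stream_listener: tail the stdout of
# a Popen command through the listener's queue. The command, timeout, and
# polling loop are illustrative only; Empty is what Queue.get raises when
# the timeout expires.
import subprocess
from queue import Empty

proc = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE)
listener, out_queue = create_stream_listener(proc.stdout)
lines = []
while True:
    try:
        # A short timeout keeps the caller responsive without busy-waiting.
        lines.append(out_queue.get(timeout=0.5))
    except Empty:
        if proc.poll() is not None:   # command finished and queue drained
            break
listener.terminate()  # the listener has no termination logic of its own
print(lines)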
def read_cache(cache, channel, start=None, end=None, resample=None,
               nproc=1, **kwargs):
    """Read a `TimeSeries` from a cache of data files using multiprocessing.

    The inner-workings are agnostic of data-type, but can only handle a
    single data type at a time.

    Parameters
    ----------
    cache : :class:`glue.lal.Cache`, `str`
        cache of GWF frame files, or path to a LAL-format cache file on disk
    channel : :class:`~gwpy.detector.channel.Channel`, `str`
        data channel to read from frames
    start : `Time`, :lalsuite:`LIGOTimeGPS`, optional
        start GPS time of desired data
    end : `Time`, :lalsuite:`LIGOTimeGPS`, optional
        end GPS time of desired data
    resample : `float`, optional
        rate (samples per second) to resample
    format : `str`, optional
        name of data file format, e.g. ``gwf`` or ``hdf``.
    nproc : `int`, default: ``1``
        maximum number of independent frame reading processes, default
        is set to single-process file reading.

    Notes
    -----
    The number of independent processes spawned by this function can be
    calculated as ``min(maxprocesses, len(cache)//minprocesssize)``.

    Returns
    -------
    data : :class:`~gwpy.timeseries.core.TimeSeries`
        a new `TimeSeries` containing the data read from disk
    """
    cls = kwargs.pop('target', TimeSeries)
    # open cache from file if given
    if isinstance(cache, (unicode, str, file)):
        cache = open_cache(cache)

    # fudge empty cache
    if len(cache) == 0:
        return cls([], channel=channel, epoch=start)

    # use cache to get start end times
    cache.sort(key=lambda ce: ce.segment[0])
    if start is None:
        start = cache[0].segment[0]
    if end is None:
        end = cache[-1].segment[1]

    # get span
    span = Segment(start, end)
    if cls not in (StateVector, StateVectorDict) and resample:
        cache = cache.sieve(segment=span.protract(8))
    else:
        cache = cache.sieve(segment=span)
    cspan = Segment(cache[0].segment[0], cache[-1].segment[1])

    # if reading one channel, try to use lalframe, it's faster
    if (isinstance(channel, str) or
            (isinstance(channel, (list, tuple)) and len(channel) == 1)):
        try:
            from lalframe import frread
        except ImportError:
            format_ = 'gwf'
        else:
            kwargs.pop('type', None)
            format_ = 'lalframe'
    # otherwise use the file extension as the format
    else:
        format_ = os.path.splitext(cache[0].path)[1][1:]

    # force one frame per process minimum
    nproc = min(nproc, len(cache))

    # single-process
    if nproc <= 1:
        return cls.read(cache, channel, format=format_, start=start,
                        end=end, resample=resample, **kwargs)

    # define how to read each frame
    def _read(q, pstart, pend):
        # don't go beyond the requested limits
        pstart = float(max(start, pstart))
        pend = float(min(end, pend))
        # if resampling TimeSeries, pad by 8 seconds inside cache limits
        if cls not in (StateVector, StateVectorDict) and resample:
            cstart = float(max(cspan[0], pstart - 8))
            subcache = cache.sieve(segment=Segment(cstart, pend))
            out = cls.read(subcache, channel, format=format_, start=cstart,
                           end=pend, resample=None, **kwargs)
            out = out.resample(resample)
            q.put(out.crop(pstart, pend))
        else:
            subcache = cache.sieve(segment=Segment(pstart, pend))
            q.put(cls.read(subcache, channel, format=format_, start=pstart,
                           end=pend, resample=resample, **kwargs))

    # separate cache into parts
    fperproc = int(ceil(len(cache) / nproc))
    subcaches = [Cache(cache[i:i + fperproc])
                 for i in range(0, len(cache), fperproc)]
    subsegments = SegmentList([Segment(c[0].segment[0], c[-1].segment[1])
                               for c in subcaches])

    # start all processes
    queue = ProcessQueue(nproc)
    proclist = []
    for subseg in subsegments:
        process = Process(target=_read, args=(queue, subseg[0], subseg[1]))
        process.daemon = True
        proclist.append(process)
        process.start()

    # get data and block
    data = [queue.get() for p in proclist]
    for process in proclist:
        process.join()

    # format and return
    if issubclass(cls, dict):
        try:
            data.sort(key=lambda tsd: tsd.values()[0].epoch.gps)
        except IndexError:
            pass
        out = cls()
        while len(data):
            tsd = data.pop(0)
            out.append(tsd)
            del tsd
        return out
    else:
        out = TimeSeriesList(*data)
        out.sort(key=lambda ts: ts.epoch.gps)
        ts = out.join()
        return ts
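# A minimal sketch (not gwpy code) of the scatter/gather pattern used above:
# split the work into chunks, run one daemon Process per chunk, push each
# partial result onto a shared Queue, then collect in the parent and join.
# All names here are illustrative assumptions.
from math import ceil
from multiprocessing import Process, Queue as ProcessQueue

def _process_chunk(queue, chunk):
    queue.put(sorted(chunk))          # stand-in for the real per-chunk read

def scatter_gather(items, nproc=4):
    nproc = min(nproc, len(items))
    per_proc = int(ceil(len(items) / nproc))
    chunks = [items[i:i + per_proc] for i in range(0, len(items), per_proc)]
    queue = ProcessQueue()
    procs = []
    for chunk in chunks:
        p = Process(target=_process_chunk, args=(queue, chunk))
        p.daemon = True
        p.start()
        procs.append(p)
    results = [queue.get() for _ in procs]   # blocks until every chunk arrives
    for p in procs:
        p.join()
    return results

if __name__ == '__main__':
    print(scatter_gather(list(range(10)), nproc=3))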
def __init__(self):
    super(DistributorProcess, self).__init__(ProcessQueue(), ProcessEvent())
def from_timeseries(ts1, ts2, stride, fftlength=None, fftstride=None,
                    window=None, nproc=1, **kwargs):
    """Calculate the coherence `Spectrogram` between two `TimeSeries`.

    Parameters
    ----------
    timeseries : :class:`~gwpy.timeseries.core.TimeSeries`
        input time-series to process.
    stride : `float`
        number of seconds in single PSD (column of spectrogram).
    fftlength : `float`
        number of seconds in single FFT.
    fftstride : `int`, optional, default: fftlength
        number of seconds between FFTs.
    window : `timeseries.window.Window`, optional, default: `None`
        window function to apply to timeseries prior to FFT.
    nproc : `int`, default: ``1``
        maximum number of independent frame reading processes, default
        is set to single-process file reading.

    Returns
    -------
    spectrogram : :class:`~gwpy.spectrogram.core.Spectrogram`
        time-frequency power spectrogram as generated from the
        input time-series.
    """
    # format FFT parameters
    if fftlength is None:
        fftlength = stride
    if fftstride is None:
        fftstride = fftlength
    sampling = min(ts1.sample_rate.value, ts2.sample_rate.value)

    # get size of spectrogram
    nFFT = int(fftlength * sampling)
    nsteps = int(ts1.size // (stride * ts1.sample_rate.value))
    nproc = min(nsteps, nproc)

    # single-process return
    if nsteps == 0 or nproc == 1:
        return _from_timeseries(ts1, ts2, stride, fftlength=fftlength,
                                fftstride=fftstride, window=window, **kwargs)

    # wrap spectrogram generator
    def _specgram(q, tsa, tsb):
        try:
            q.put(_from_timeseries(tsa, tsb, stride, fftlength=fftlength,
                                   fftstride=fftstride, window=window,
                                   **kwargs))
        except Exception as e:
            q.put(e)

    # otherwise build process list
    stepperproc = int(ceil(nsteps / nproc))
    nsamp = [stepperproc * ts.sample_rate.value * stride for ts in (ts1, ts2)]
    queue = ProcessQueue(nproc)
    processlist = []
    for i in range(nproc):
        process = Process(target=_specgram,
                          args=(queue,
                                ts1[i * nsamp[0]:(i + 1) * nsamp[0]],
                                ts2[i * nsamp[1]:(i + 1) * nsamp[1]]))
        process.daemon = True
        processlist.append(process)
        process.start()
        if ((i + 1) * nsamp[0]) >= ts1.size:
            break

    # get data
    data = []
    for process in processlist:
        result = queue.get()
        if isinstance(result, Exception):
            raise result
        else:
            data.append(result)
    # and block
    for process in processlist:
        process.join()

    # format and return
    out = SpectrogramList(*data)
    out.sort(key=lambda spec: spec.epoch.gps)
    return out.join()
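# A small sketch of the error-propagation idiom used in _specgram above:
# a worker puts any exception it hits onto the queue, and the parent
# re-raises it after collecting results. Names here are illustrative.
from multiprocessing import Process, Queue as ProcessQueue

def _safe_worker(queue, value):
    try:
        if value < 0:
            raise ValueError("negative input: %d" % value)
        queue.put(value ** 2)
    except Exception as exc:          # ship the failure back to the parent
        queue.put(exc)

def run_all(values):
    queue = ProcessQueue()
    procs = [Process(target=_safe_worker, args=(queue, v)) for v in values]
    for p in procs:
        p.start()
    results = []
    for _ in procs:
        result = queue.get()
        if isinstance(result, Exception):
            raise result              # surface the child's error here
        results.append(result)
    for p in procs:
        p.join()
    return results

if __name__ == '__main__':
    print(run_all([1, 2, 3]))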
def search_parallel(username, password, client_matter, q, num_workers=15):
    '''Download search results in parallel by launching many processes.

    q:           The search query to run.
    num_workers: How many parallel processes to start.
    '''
    login_token = call(call="login", method="POST", username=username,
                       password=password)['login_token']
    first_page = call(call="search", method="GET", q=q,
                      login_token=login_token, client_matter=client_matter)
    num_first_page = len(first_page['search_results'])
    num_results = first_page['count']
    # The main thread removes results from searchqueue and puts them
    # into a list.
    results = [None] * num_results
    results[:num_first_page] = first_page['search_results']
    logging.info("Downloading %s Results, already got first %d" %
                 (num_results, num_first_page))
    # Put all of the search ranges into the download queue
    dlqueue = ProcessQueue()
    NUM_AT_ONCE = 20
    for i in xrange(num_first_page, num_results, NUM_AT_ONCE):
        limit = min(num_results, i + NUM_AT_ONCE) - i
        logging.info("Added: %s --> %s" % (i, i + limit))
        dlqueue.put((i, limit))
    # The processes will put their results into the searchqueue
    searchqueue = ProcessQueue()
    # Start up the parallel processes
    pool = MultiProcessPool(processes=num_workers, initializer=_search_worker,
                            initargs=[username, password, client_matter, q,
                                      dlqueue, searchqueue])
    try:
        # Continue until the processing queue is empty.
        while True:
            # It takes about 15 seconds to download a docket, so wait
            # that long.
            time.sleep(2.0 / num_workers)
            got = 0
            try:
                item = searchqueue.get_nowait()
                start, end = item['offset'], item['offset'] + item['limit']
                results[start:end] = item['result']['search_results']
                logging.info("Downloaded: %s --> %s (of %d total)" %
                             (start, end, num_results))
                got += 1
            except Empty:
                left = len(results) - len(filter(None, results))
                if left <= 0:
                    break
                logging.info("Got %d, %d results. Waiting for %d more." %
                             (got, len(results), left))
                continue
            except Exception as e:
                logging.info("Main thread loop exception: %s" % e)
                break
    except KeyboardInterrupt as e:
        logging.info("Main thread exception: %s" % e)
        dlqueue.close()
        searchqueue.close()
        pool.close()
        pool.terminate()
        # Return what we have even if there was an exception.
        return results

    for i, r in enumerate(results):
        if not r:
            print("Missing Result %s" % (i + 1))
    return {
        'search_results': results,
        'count': num_results,
    }
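# A pared-down sketch (illustrative names only, not part of the original
# module) of the pool-with-initializer pattern used above: the work queue
# and result queue are handed to each worker once via the Pool initializer,
# the workers drain the work queue, and the parent polls the result queue
# with get_nowait(). ProcessQueue is assumed to be multiprocessing.Queue.
import time
from queue import Empty
from multiprocessing import Pool, Queue as ProcessQueue

def _worker_loop(work_queue, result_queue):
    # Runs once in each pool worker: drain the shared work queue.
    while True:
        try:
            item = work_queue.get(timeout=0.5)
        except Empty:
            return
        result_queue.put(item * item)

def square_all(values, num_workers=4):
    work_queue, result_queue = ProcessQueue(), ProcessQueue()
    for value in values:
        work_queue.put(value)
    pool = Pool(processes=num_workers, initializer=_worker_loop,
                initargs=(work_queue, result_queue))
    results = []
    while len(results) < len(values):
        try:
            results.append(result_queue.get_nowait())
        except Empty:
            time.sleep(0.1)   # workers are still busy
    pool.close()
    pool.terminate()
    return results

if __name__ == '__main__':
    print(square_all([1, 2, 3, 4, 5]))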
def read_cache(cache, channel, start=None, end=None, resample=None,
               gap=None, pad=None, nproc=1, format=None, **kwargs):
    """Read a `TimeSeries` from a cache of data files using multiprocessing.

    The inner-workings are agnostic of data-type, but can only handle a
    single data type at a time.

    Parameters
    ----------
    cache : :class:`glue.lal.Cache`, `str`
        cache of GWF frame files, or path to a LAL-format cache file on disk
    channel : :class:`~gwpy.detector.channel.Channel`, `str`
        data channel to read from frames
    start : `Time`, `~gwpy.time.LIGOTimeGPS`, optional
        start GPS time of desired data
    end : `Time`, `~gwpy.time.LIGOTimeGPS`, optional
        end GPS time of desired data
    resample : `float`, optional
        rate (samples per second) to resample
    format : `str`, optional
        name of data file format, e.g. ``gwf`` or ``hdf``.
    nproc : `int`, default: ``1``
        maximum number of independent frame reading processes, default
        is set to single-process file reading.
    gap : `str`, optional
        how to handle gaps in the cache, one of

        - 'ignore': do nothing, let the underlying reader method handle it
        - 'warn': do nothing except print a warning to the screen
        - 'raise': raise an exception upon finding a gap (default)
        - 'pad': insert a value to fill the gaps

    pad : `float`, optional
        value with which to fill gaps in the source data, only used if
        gap is not given, or ``gap='pad'`` is given

    Notes
    -----
    The number of independent processes spawned by this function can be
    calculated as ``min(maxprocesses, len(cache)//minprocesssize)``.

    Returns
    -------
    data : :class:`~gwpy.timeseries.TimeSeries`
        a new `TimeSeries` containing the data read from disk
    """
    from gwpy.segments import (Segment, SegmentList)
    cls = kwargs.pop('target', TimeSeries)
    # open cache from file if given
    if isinstance(cache, (unicode, str, file)):
        cache = open_cache(cache)

    # fudge empty cache
    if len(cache) == 0:
        return cls([], channel=channel, epoch=start)

    # use cache to get start end times
    cache.sort(key=lambda ce: ce.segment[0])
    if start is None:
        start = cache[0].segment[0]
    if end is None:
        end = cache[-1].segment[1]

    # get span
    span = Segment(start, end)
    if cls not in (StateVector, StateVectorDict) and resample:
        cache = cache.sieve(segment=span.protract(8))
    else:
        cache = cache.sieve(segment=span)
    cspan = Segment(cache[0].segment[0], cache[-1].segment[1])

    # check for gaps
    if gap is None and pad is not None:
        gap = 'pad'
    elif gap is None:
        gap = 'raise'
    segs = cache_segments(cache, on_missing='ignore') & SegmentList([span])
    if len(segs) != 1 and gap.lower() == 'ignore' or gap.lower() == 'pad':
        pass
    elif len(segs) != 1:
        gaps = SegmentList([cspan]) - segs
        msg = ("The cache given to %s.read has gaps in it in the "
               "following segments:\n    %s"
               % (cls.__name__, '\n    '.join(map(str, gaps))))
        if gap.lower() == 'warn':
            warnings.warn(msg)
        else:
            raise ValueError(msg)
        segs = type(segs)([span])

    # if reading a small number of channels, try to use lalframe, it's faster
    if format is None and (
            isinstance(channel, str) or
            (isinstance(channel, (list, tuple)) and
             len(channel) <= MAX_LALFRAME_CHANNELS)):
        try:
            from lalframe import frread
        except ImportError:
            format = 'gwf'
        else:
            kwargs.pop('type', None)
            format = 'lalframe'
    # otherwise use the file extension as the format
    elif format is None:
        format = os.path.splitext(cache[0].path)[1][1:]

    # -- process multiple cache segments --------
    # this entry point loops this method for each segment
    if len(segs) > 1:
        out = None
        for seg in segs:
            new = read_cache(cache, channel, start=seg[0], end=seg[1],
                             resample=resample, nproc=nproc, format=format,
                             target=cls, **kwargs)
            if out is None:
                out = new
            else:
                out.append(new, gap='pad', pad=pad)
        return out

    # -- process single cache segment

    # force one frame per process minimum
    nproc = min(nproc, len(cache))

    # single-process
    if nproc <= 1:
        return cls.read(cache, channel, format=format, start=start, end=end,
                        resample=resample, **kwargs)

    # define how to read each frame
    def _read(q, pstart, pend):
        try:
            # don't go beyond the requested limits
            pstart = float(max(start, pstart))
            pend = float(min(end, pend))
            # if resampling TimeSeries, pad by 8 seconds inside cache limits
            if cls not in (StateVector, StateVectorDict) and resample:
                cstart = float(max(cspan[0], pstart - 8))
                subcache = cache.sieve(segment=Segment(cstart, pend))
                out = cls.read(subcache, channel, format=format, start=cstart,
                               end=pend, resample=None, **kwargs)
                out = out.resample(resample)
                q.put(out.crop(pstart, pend))
            else:
                subcache = cache.sieve(segment=Segment(pstart, pend))
                q.put(cls.read(subcache, channel, format=format, start=pstart,
                               end=pend, resample=resample, **kwargs))
        except Exception as e:
            q.put(e)

    # separate cache into parts
    fperproc = int(ceil(len(cache) / nproc))
    subcaches = [Cache(cache[i:i + fperproc])
                 for i in range(0, len(cache), fperproc)]
    subsegments = SegmentList([Segment(c[0].segment[0], c[-1].segment[1])
                               for c in subcaches])

    # start all processes
    queue = ProcessQueue(nproc)
    proclist = []
    for subseg in subsegments:
        process = Process(target=_read, args=(queue, subseg[0], subseg[1]))
        process.daemon = True
        proclist.append(process)
        process.start()

    # get data and block
    data = [queue.get() for p in proclist]
    for process in proclist:
        process.join()
    for result in data:
        if isinstance(result, Exception):
            raise result

    # format and return
    if issubclass(cls, dict):
        try:
            data.sort(key=lambda tsd: tsd.values()[0].epoch.gps)
        except IndexError:
            pass
        out = cls()
        while len(data):
            tsd = data.pop(0)
            out.append(tsd)
            del tsd
        return out
    else:
        if cls in (TimeSeries, TimeSeriesDict):
            out = TimeSeriesList(*data)
        else:
            out = StateVectorList(*data)
        out.sort(key=lambda ts: ts.epoch.gps)
        ts = out.join(gap=gap)
        return ts
def _read(cls, source, *args, **kwargs):
    # parse input as a list of files
    if isinstance(source, list):
        files = source
    else:
        try:  # try and map to a list of file-like objects
            files = file_list(source)
        except ValueError:  # otherwise treat as single
            files = [source]

    # determine input format
    if kwargs.get('format', None) is None:
        kwargs['format'] = get_format('read', cls, files[0], source,
                                      args, kwargs)

    # calculate maximum number of processes
    nproc = kwargs.pop('nproc', 1)
    num = len(files)
    nproc = min(nproc, num)

    # read single file or single process
    if num == 1:
        return reader(cls, files[0], *args, **kwargs)
    if nproc == 1:
        return reader(cls, source, *args, **kwargs)

    # define multiprocessing method
    def _read_chunk(q, chunk, index):
        if len(chunk) == 1:
            chunk = chunk[0]
        try:
            if cls:
                q.put((index, reader(cls, chunk, *args, **kwargs)))
            else:
                q.put((index, reader(chunk, *args, **kwargs)))
        except Exception as e:
            q.put(e)

    # split source into parts
    numperproc = int(ceil(num / nproc))
    chunks = [type(files)(files[i:i + numperproc])
              for i in range(0, num, numperproc)]

    # process
    queue = ProcessQueue(nproc)
    processes = []
    for i, chunk in enumerate(chunks):
        if len(chunk) == 0:
            continue
        process = Process(target=_read_chunk, args=(queue, chunk, i))
        process.daemon = True
        process.start()
        processes.append(process)

    # get data and block
    output = []
    for i in range(len(processes)):
        result = queue.get()
        if isinstance(result, Exception):
            raise result
        output.append(result)
    for process in processes:
        process.join()

    # return chunks sorted into input order
    return flatten(zip(*sorted(output, key=lambda out: out[0]))[1])
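# A small sketch of the order-preserving trick used in _read_chunk above:
# each worker tags its result with the chunk index, and the parent sorts on
# that index before flattening, so output order matches input order even
# though results arrive in completion order. Names are illustrative only.
from multiprocessing import Process, Queue as ProcessQueue

def _tagged_worker(queue, index, chunk):
    queue.put((index, [x + 1 for x in chunk]))   # stand-in for the real read

def read_in_order(chunks):
    queue = ProcessQueue()
    procs = []
    for i, chunk in enumerate(chunks):
        p = Process(target=_tagged_worker, args=(queue, i, chunk))
        p.daemon = True
        p.start()
        procs.append(p)
    tagged = [queue.get() for _ in procs]
    for p in procs:
        p.join()
    ordered = [part for _, part in sorted(tagged, key=lambda out: out[0])]
    return [item for part in ordered for item in part]   # flatten

if __name__ == '__main__':
    print(read_in_order([[1, 2], [3, 4], [5]]))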
def getdocket_parallel(username, password, client_matter, docket_list,
                       cached=False, num_workers=15, save_progress=None):
    '''Download a list of dockets in parallel by launching many processes.

    docket_list:    A list of (court, docket) tuples.
    num_workers:    How many parallel processes to start.
    cached:         Get cached dockets instead of fresh ones from the court.
    save_progress:  Use a temporary file to save work in case we crash.
    '''
    if save_progress != None:
        save_progress = shelve.open(save_progress, 'c')

    def get_key(court, docket):
        return ("(%s),(%s)" % (court, docket)).encode('ascii', 'ignore')

    dockets = []
    # Put all of the tuples into a processing queue
    dlqueue = ProcessQueue()
    for court, docket in docket_list:
        k = get_key(court, docket)
        if save_progress != None and save_progress.get(k) and \
                save_progress[k]['result']['success']:
            # Add to the results
            dockets.append(save_progress[k])
        else:
            # Add it to the download queue
            dlqueue.put((court, docket))

    # The processes will put their results into the docketqueue
    docketqueue = ProcessQueue()
    # The main thread removes them from docketqueue and puts them into a list.

    # Start up the parallel processes
    pool = MultiProcessPool(processes=num_workers, initializer=_dl_worker,
                            initargs=[username, password, client_matter,
                                      cached, dlqueue, docketqueue])
    try:
        # Continue until the processing queue is empty
        got = 0
        while True:
            # It takes about 15 seconds to download a docket, so wait that long.
            time.sleep(1.0)
            try:
                # get_nowait will raise Empty and break the loop
                while True:
                    new_docket = docketqueue.get_nowait()
                    dockets.append(new_docket)
                    # Only save if successful
                    if save_progress != None and \
                            new_docket['result']['success']:
                        # Save our progress
                        k = get_key(new_docket['court'], new_docket['docket'])
                        save_progress[k] = new_docket
                    got += 1
            except Empty:
                if save_progress != None:
                    print("Syncing dbase (len=%d), dockets=%d " %
                          (len(save_progress), len(dockets)))
                    save_progress.sync()
                left = len(docket_list) - len(dockets)
                if left <= 0:
                    break
                logging.info("Got %d, %d total dockets. Waiting again." %
                             (got, len(dockets)))
                continue
            except Exception as e:
                logging.info("Main thread loop exception: %s" % e)
                break
    except KeyboardInterrupt as e:
        logging.info("Main thread exception: %s" % e)
        dlqueue.close()
        docketqueue.close()
        pool.close()
        pool.terminate()

    # Return what we have even if there was an exception.
    if save_progress != None:
        save_progress.sync()
        save_progress.close()
    return dockets
def fit(self, data):
    """Main thread: adds a task while the semaphore is free, else blocks.
    A second thread is used to free up finished tasks.

    Args:
        data (MicroArrayData): data.
    """
    if self.verbose:
        print '[Parallel] fitting {} tasks with {} process{}...'.format(
            len(self.tasks), self.processes,
            'es' if self.processes > 1 else '')
    assert issubclass(type(data), MicroArrayData)
    start_time = time.time()

    # need to use two different kinds of queues, one thread-safe and
    # one process-safe
    task_queue = ThreadQueue()      # Pipe tasks between threads
    result_queue = ProcessQueue()   # Pipe results back to self.tasks list

    def wrap_fit(task, data, index):
        """Wrapper of the fit method; keeps track of the index in the
        self.tasks list where the result will be put back.
        """
        result_queue.put((task.fit(data), index))

    # Thread - start processes and acquire semaphore
    def add_processes(task_queue):
        indices = range(len(self.tasks))
        if self.randomize:
            random.shuffle(indices)
        for index in indices:
            task = self.tasks[index]
            for _ in xrange(task.processes):
                self._semaphore.acquire()
            if self.verbose >= 3:
                time.sleep(0.1)
                print '[thread-start] acquired', task.processes, \
                    'process{} for'.format(
                        'ses' if task.processes > 1 else ''), task.name
            p = Process(target=wrap_fit, args=(task, data, index))
            # Need non-daemonic threads to use multiprocessed python processes.
            p.daemon = False
            p.start()
            # Put tuple of process and associated task in queue.
            task_queue.put((p, task))
        task_queue.put(None)  # send sentinel

    thread_add_processes = Thread(target=add_processes, args=(task_queue,))
    thread_add_processes.start()

    # Thread - maintain processes and release semaphore
    def handle_processes(task_queue):
        running_tasks = []
        finished = False
        print_count = 1
        while not finished or len(running_tasks) > 0:
            # check task_queue at intervals
            if not task_queue.empty():
                next_task = task_queue.get(timeout=0.1)
                # receive STOP sentinel, finish
                if next_task is None:
                    finished = True
                else:
                    running_tasks.append(next_task)
            # maintain process list
            for proc, task in running_tasks[:]:
                if not proc.is_alive():
                    if self.verbose >= 3:
                        print '[thread-maintain] releasing', task.processes, \
                            'process{} for'.format(
                                'ses' if task.processes > 1 else ''), task.name
                    for _ in xrange(task.processes):
                        self._semaphore.release()
                    proc.terminate()
                    running_tasks.remove((proc, task))
                    break  # needed when a process is found that is done
            time.sleep(.5)
            # print currently running processes every once in a while.
            if int((time.time() - start_time) / self.print_fitting_time
                   ) > print_count and self.verbose >= 1:
                print '[Parallel][{:02d}h{:02d}m] running:'.format(
                    *divmod(print_count * 10, 60)),
                for _, task in running_tasks:
                    if task == running_tasks[-1][1]:  # last task
                        print '{}'.format(task.name)
                    else:
                        print '{},'.format(task.name),
                # print '[Parallel] {} ({:d}:{:2d})'.format(task.name,
                #     *divmod(int(start_time_task[task.name] - time.time()/60), 60))
                print_count += 1

    thread_handle_processes = Thread(target=handle_processes,
                                     args=(task_queue,))
    thread_handle_processes.start()

    # Thread - catch results from result_queue and put back in self.tasks list
    def handle_results():
        processed_results = 0
        while processed_results < len(self.tasks):
            task, index = result_queue.get()
            if self.verbose >= 3:
                print '[thread-result] saving result for', task.name, \
                    'to task list'
            self.tasks[index] = task
            processed_results += 1
            time.sleep(.1)

    thread_handle_results = Thread(target=handle_results, args=())
    thread_handle_results.start()

    # block main thread
    thread_add_processes.join()
    thread_handle_processes.join()
    thread_handle_results.join()
    assert all((i.done for i in self.tasks))
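# A compact sketch of the semaphore-bounded scheduling idea used in fit():
# a counting semaphore caps how many worker processes run at once, and a
# watcher thread joins finished processes and gives their slots back.
# All names here are illustrative, not part of the original class.
import threading
import time
from multiprocessing import Process, Queue as ProcessQueue

def _work(result_queue, value):
    time.sleep(0.1)               # stand-in for a long-running fit
    result_queue.put(value * 10)

def bounded_run(values, max_parallel=2):
    semaphore = threading.Semaphore(max_parallel)
    result_queue = ProcessQueue()
    running = []
    done_submitting = threading.Event()

    def watcher():
        # Join finished processes and release their semaphore slots.
        while running or not done_submitting.is_set():
            for proc in running[:]:
                if not proc.is_alive():
                    proc.join()
                    running.remove(proc)
                    semaphore.release()
            time.sleep(0.05)

    watch_thread = threading.Thread(target=watcher)
    watch_thread.start()
    for value in values:
        semaphore.acquire()       # block until a slot frees up
        proc = Process(target=_work, args=(result_queue, value))
        proc.start()
        running.append(proc)
    done_submitting.set()
    watch_thread.join()
    return [result_queue.get() for _ in values]

if __name__ == '__main__':
    print(bounded_run([1, 2, 3, 4, 5]))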
def getdocket_parallel(username, password, client_matter, docket_list,
                       cached=False, num_workers=15, save_progress=None,
                       _async=False):
    '''Download a list of dockets in parallel by launching many processes.

    docket_list:    A list of (court, docket) tuples.
    num_workers:    How many parallel processes to start.
    cached:         Get cached dockets instead of fresh ones from the court.
    save_progress:  Use a temporary file to save work in case we crash.
    _async:         If True, we get data asynchronously.
    '''
    if save_progress != None:
        if _async == True:
            raise NotImplementedError("Cannot save progress and async.")
        save_progress = shelve.open(save_progress, 'c')

    def get_key(court, docket):
        return ("(%s),(%s)" % (court, docket)).encode('ascii', 'ignore')

    dockets = []

    def deb(msg, *args, **kwargs):
        msg = "getdocket_parallel %s-%s: %s" % (username, client_matter, msg)
        logging.info(msg, *args, **kwargs)

    # Put all of the tuples into a processing queue
    dlqueue = ProcessQueue()
    for c_vals in docket_list:
        c_vals = list(c_vals)
        if len(c_vals) < 2:
            raise Exception("Expecting a list of at least two with court, "
                            "docket, instead got: %s", c_vals)
        court, docket = c_vals[:2]
        k = get_key(court, docket)
        if save_progress != None and save_progress.get(k) and \
                save_progress[k]['result']['success']:
            # Add to the results
            dockets.append(save_progress[k])
        else:
            # Add it to the download queue
            dlqueue.put((court, docket))

    # The processes will put their results into the docketqueue
    docketqueue = ProcessQueue()
    # The main thread removes them from docketqueue and puts them into a list.

    # Start up the parallel processes
    pool = MultiProcessPool(processes=num_workers, initializer=_dl_worker,
                            initargs=[username, password, client_matter,
                                      cached, dlqueue, docketqueue])

    def iterator(sleep_time=1.0):
        '''An iterator that goes through all of the given dockets.'''
        # Continue until the processing queue is empty
        got, iters, total = 0, 0, len(docket_list)
        while True:
            # It takes about 15 seconds to download a docket, so wait that long.
            iters += 1
            try:
                time.sleep(sleep_time)
                # get_nowait will raise Empty and break the loop
                while True:
                    yield docketqueue.get_nowait()
                    got += 1
            except Empty:
                left = total - got
                if left <= 0:
                    deb("Finished iterating %s" % total)
                    break
                if iters % 5 == 0:
                    deb("Did %d/%d, %d left.", got, total, left)
                continue
            except KeyboardInterrupt as e:
                deb("Main thread interrupt: %s" % e)
                break
            except Exception as e:
                deb("Main thread loop exception: %s" % e)
                break
        dlqueue.close()
        docketqueue.close()
        pool.close()
        pool.terminate()

    if _async:
        return iterator

    for new_i, new_docket in enumerate(iterator()):
        dockets.append(new_docket)
        # Only save if successful
        if save_progress != None and new_docket['result']['success']:
            # Save our progress
            k = get_key(new_docket['court'], new_docket['docket'])
            save_progress[k] = new_docket
        elif save_progress != None and new_i % 20 == 0:
            deb("sync dbase len=%d, added=%d ", len(save_progress), new_i)
            save_progress.sync()

    # Return what we have even if there was an exception.
    if save_progress != None:
        save_progress.sync()
        save_progress.close()
    return dockets
def create_queue(self, queue_limit):
    return ProcessQueue(queue_limit)
if save_progress != None:
    if async:
        raise NotImplementedError("Cannot save progress and async.")
    save_progress = shelve.open(save_progress, 'c')

def get_key(court, docket):
    return ("(%s),(%s)" % (court, docket)).encode('ascii', 'ignore')

dockets = []

def deb(msg, *args, **kwargs):
    msg = "getdocket_parallel %s-%s: %s" % (username, client_matter, msg)
    logging.info(msg, *args, **kwargs)

# Put all of the tuples into a processing queue
dlqueue = ProcessQueue()
for c_vals in docket_list:
    c_vals = list(c_vals)
    if len(c_vals) < 2:
        raise Exception("Expecting a list of at least two with court, "
                        "docket, instead got: %s", c_vals)
    court, docket = c_vals[:2]
    k = get_key(court, docket)
    if save_progress != None and save_progress.get(k) and \
            save_progress[k]['result']['success']:
        # Add to the results
        dockets.append(save_progress[k])
    else:
        # Add it to the download queue
        dlqueue.put((court, docket))
def from_timeseries(timeseries, stride, fftlength=None, fftstride=None,
                    method='welch', window=None, plan=None, nproc=1):
    """Calculate the average power spectrogram of this `TimeSeries`
    using the specified average spectrum method.

    Parameters
    ----------
    timeseries : :class:`~gwpy.timeseries.core.TimeSeries`
        input time-series to process.
    stride : `float`
        number of seconds in single PSD (column of spectrogram).
    fftlength : `float`
        number of seconds in single FFT.
    method : `str`, optional, default: 'welch'
        average spectrum method.
    fftstride : `int`, optional, default: fftlength
        number of seconds between FFTs.
    window : `timeseries.window.Window`, optional, default: `None`
        window function to apply to timeseries prior to FFT.
    plan : :lalsuite:`REAL8FFTPlan`, optional
        LAL FFT plan to use when generating average spectrum,
        substitute type 'REAL8' as appropriate.
    nproc : `int`, default: ``1``
        maximum number of independent frame reading processes, default
        is set to single-process file reading.

    Returns
    -------
    spectrogram : :class:`~gwpy.spectrogram.core.Spectrogram`
        time-frequency power spectrogram as generated from the
        input time-series.
    """
    # format FFT parameters
    if fftlength is None:
        fftlength = stride
    if fftstride is None:
        fftstride = fftlength

    # get size of spectrogram
    nFFT = int(fftlength * timeseries.sample_rate.value)
    nsteps = int(timeseries.size // (stride * timeseries.sample_rate.value))
    nproc = min(nsteps, nproc)

    # generate window and plan if needed
    try:
        from lal import lal
    except ImportError:
        pass
    else:
        if window is None:
            window = psd.generate_lal_window(nFFT, dtype=timeseries.dtype)
        if plan is None:
            plan = psd.generate_lal_fft_plan(nFFT, dtype=timeseries.dtype)

    # single-process return
    if nsteps == 0 or nproc == 1:
        return _from_timeseries(timeseries, stride, fftlength=fftlength,
                                fftstride=fftstride, method=method,
                                window=window)

    # wrap spectrogram generator
    def _specgram(q, ts):
        try:
            q.put(_from_timeseries(ts, stride, fftlength=fftlength,
                                   fftstride=fftstride, method=method,
                                   window=window, plan=plan))
        except Exception as e:
            q.put(e)

    # otherwise build process list
    stepperproc = int(ceil(nsteps / nproc))
    nsamp = stepperproc * timeseries.sample_rate.value * stride
    queue = ProcessQueue(nproc)
    processlist = []
    for i in range(nproc):
        process = Process(target=_specgram,
                          args=(queue, timeseries[i * nsamp:(i + 1) * nsamp]))
        process.daemon = True
        processlist.append(process)
        process.start()
        if ((i + 1) * nsamp) >= timeseries.size:
            break

    # get data
    data = []
    for process in processlist:
        result = queue.get()
        if isinstance(result, Exception):
            raise result
        else:
            data.append(result)
    # and block
    for process in processlist:
        process.join()

    # format and return
    out = SpectrogramList(*data)
    out.sort(key=lambda spec: spec.epoch.gps)
    return out.join()
def read_cache(cache, target, nproc, post, *args, **kwargs):
    """Read arbitrary data from a cache file

    Parameters
    ----------
    cache : :class:`glue.lal.Cache`, `str`
        cache of files, or path to a LAL-format cache file on disk.
    target : `type`
        target class to read into.
    nproc : `int`
        number of individual processes to use.
    post : `function`
        function to post-process output object before returning.
        The output of this method will be returned, so in-place
        operations must return the object.
    *args
        other positional arguments to pass to the target.read()
        classmethod.
    **kwargs
        keyword arguments to pass to the target.read() classmethod.

    Returns
    -------
    data : target
        an instance of the target class, seeded with data read from
        the cache.

    Notes
    -----
    The returned object is constructed from the output of each
    sub-process via the '+=' in-place addition operator.

    If the input cache is indeed a :class:`~glue.lal.Cache` object,
    the sub-processes will be combined in time order, otherwise the
    ordering is given by the order of entries in the input cache (for
    example, if it is a simple `list` of files).

    .. warning::

       no protection is given against overloading the host, for
       example, no checks are done to ensure that ``nproc`` is less
       than the number of available cores.

       High values of ``nproc`` should be used at the user's
       discretion; the GWpy team accepts no liability for loss as a
       result of abuse of this feature.
    """
    # read the cache
    if isinstance(cache, (file, unicode, str)):
        cache = open_cache(cache)
    if isinstance(cache, Cache):
        cache.sort(key=lambda ce: ce.segment[0])

    # force one file per process minimum
    nproc = min(nproc, len(cache))
    if nproc > cpu_count():
        warnings.warn("Using %d processes on a %d-core machine is "
                      "unrecommended...but not forbidden."
                      % (nproc, cpu_count()))

    # work out underlying data type
    try:
        kwargs.setdefault(
            'format', _get_valid_format('read', target, None, None,
                                        (cache[0],), {}))
    # if empty, put anything, since it doesn't matter
    except IndexError:
        kwargs.setdefault('format', 'ascii')
    except Exception:
        if 'format' not in kwargs:
            raise

    if nproc <= 1:
        return target.read(cache, *args, **kwargs)

    # define how to read each sub-cache
    def _read(q, sc, i):
        try:
            q.put((i, target.read(sc, *args, **kwargs)))
        except Exception as e:
            q.put(e)

    # separate cache into parts
    fperproc = int(ceil(len(cache) / nproc))
    subcaches = [cache.__class__(cache[i:i+fperproc])
                 for i in range(0, len(cache), fperproc)]

    # start all processes
    queue = ProcessQueue(nproc)
    proclist = []
    for i, subcache in enumerate(subcaches):
        if len(subcache) == 0:
            continue
        process = Process(target=_read, args=(queue, subcache, i))
        process.daemon = True
        proclist.append(process)
        process.start()

    # get data and block
    pout = []
    for i in range(len(proclist)):
        result = queue.get()
        if isinstance(result, Exception):
            raise result
        pout.append(result)
    for process in proclist:
        process.join()

    # combine and return
    data = zip(*sorted(pout, key=lambda out: out[0]))[1]
    if issubclass(target, Table):  # astropy.table.Table
        out = vstack_tables(data, join_type='exact')
    elif issubclass(target, recarray):
        out = recfunctions.stack_arrays(data, asrecarray=True, usemask=False,
                                        autoconvert=True).view(target)
    else:
        try:
            if hasattr(target, 'tableName'):  # glue.ligolw.table.Table
                out = data[0]
            else:
                out = data[0].copy()
        except AttributeError:
            out = data[0]
        for datum in data[1:]:
            out += datum

    if post:
        return post(out)
    else:
        return out
class MultiProcessWorker(MultiWorker):
    max_worker_count = cpu_count()
    queue_type = ProcessQueue
    worker_type = Process