def run_example():
    """Fan work out over one IterableQueue shared by several processes.

    Starts NUM_PRODUCERS processes running ``producer_func`` and
    NUM_CONSUMERS processes running ``consumer_func``, hands each of them
    its own endpoint of the queue, and blocks until all of them have exited.
    """
    shared_queue = IterableQueue()
    workers = []

    # Producers first: each one gets a dedicated producer endpoint.
    for index in range(NUM_PRODUCERS):
        endpoint = shared_queue.get_producer()
        worker = Process(target=producer_func, args=(endpoint, index))
        worker.start()
        workers.append(worker)

    # Then the consumers, each with its own consumer endpoint.
    for index in range(NUM_CONSUMERS):
        endpoint = shared_queue.get_consumer()
        worker = Process(target=consumer_func, args=(endpoint, index))
        worker.start()
        workers.append(worker)

    # Important: closing the IterableQueue indicates that no new producer
    # endpoints will be made.
    shared_queue.close()

    # Wait for the producers, then the consumers, in start order.
    for worker in workers:
        worker.join()
def extract_all_features_from_archive(archive_path, log=None):
    """Load the contents of every ``*xml`` member of a tar archive onto a queue.

    Each matching member is put on an IterableQueue producer endpoint as a
    ``(member_name, file_contents)`` tuple.

    Args:
        archive_path: path handed to ``tarfile.open``.
        log: optional object with a ``write`` method; when falsy, warnings
            are printed instead.

    NOTE(review): only the first part of this definition is visible here.
    Closing the producer endpoint, consuming the queue and the return value
    are not shown in this block and are left as they are.
    """
    start = time.time()

    # First, make an iterable queue. The corenlp files extracted from the
    # archive are loaded onto it.
    fnames_q = IterableQueue()
    fnames_producer = fnames_q.get_producer()
    archive = tarfile.open(archive_path)

    # Remember the last member reached so the warning below never touches an
    # unbound loop variable when the very first read fails.
    last_name = None
    try:
        for member in archive:
            last_name = member.name
            if not member.name.endswith('xml'):
                continue
            # extractfile() yields None for members that are not regular
            # files or links (e.g. a directory); those carry no text.
            extracted = archive.extractfile(member)
            if extracted is None:
                continue
            fnames_producer.put((member.name, extracted.read()))

    # If we encounter corruption in the archive, log or print a warning
    # and proceed with the processing of what was extracted so far.
    except IOError as e:
        message = '%s\tlast file was: %s' % (str(e), last_name)
        if log:
            log.write(message)
        else:
            print(message)
def __init__(
        self,
        base_iterator,
        n_batches,
        batch_processor_cls,
        enqueue_fn=None,
        n_workers=3,
        qcap=5,
        *args,
        **kwargs):
    """Create both queues, start the worker processes and the feeder thread.

    Args:
        base_iterator: iterable stored as the source of raw batches (the
            original debug note names mldc.data.data_handler.DataIterator
            -- TODO confirm).
        n_batches: stored on the instance as ``_n_batches``.
        batch_processor_cls: forwarded to every worker process (the original
            debug note names mldc.data.data_handler.CustomBatchProcessor
            -- TODO confirm).
        enqueue_fn: optional callable stored as ``_enqueue_fn``.
        n_workers: number of worker processes to start.
        qcap: value passed to both IterableQueue constructors; logged as
            the max queue size.
        *args, **kwargs: forwarded to ``BatchQueue._worker_loop``.
    """
    LOG.info("BatchQueue: n_workers=%d max queue size=%d", n_workers, qcap)

    self._base_iterator = base_iterator
    self._n_batches = n_batches
    self._todoq = IterableQueue(qcap)
    self._doneq = IterableQueue(qcap)
    self._enqueue_fn = enqueue_fn
    self._workers = []
    self._end_sigq = Queue()

    # The feeder is a thread rather than a process to avoid pickling,
    # particularly since it is fairly lightweight.
    feeder_endpoint = self._todoq.get_producer()
    self._producer = Thread(
        target=self.enq_examples_for_workers,
        args=(feeder_endpoint, self._end_sigq),
    )

    # Worker ids run from 1 to n_workers inclusive.
    for wid in range(1, n_workers + 1):
        todo_endpoint = self._todoq.get_consumer()
        done_endpoint = self._doneq.get_producer()
        worker_args = (
            todo_endpoint, done_endpoint, wid, batch_processor_cls) + args
        process = mp.Process(
            target=BatchQueue._worker_loop,
            args=worker_args,
            kwargs=kwargs,
        )
        process.start()
        self._workers.append(process)

    # Take the consumer endpoint for this process before the queues are
    # closed to new endpoints.
    self._main_done = self._doneq.get_consumer()
    self._producer.start()
    for queue in (self._todoq, self._doneq):
        queue.close()
def __init__(self, base_iterator, n_batches, batch_processor_cls,
             enqueue_fn=None, n_workers=3, qcap=5, *args, **kwargs):
    """Wire up the todo/done queues, the worker pool and the feeder thread.

    ``n_workers`` processes running ``BatchQueue._worker_loop`` are started
    here, each with its own consumer endpoint on the todo queue and producer
    endpoint on the done queue. ``qcap`` is passed to both IterableQueue
    constructors. Extra ``*args`` / ``**kwargs`` are forwarded to the worker
    loop together with ``batch_processor_cls``.
    """
    LOG.info("BatchQueue: n_workers=%d max queue size=%d", n_workers, qcap)

    self._base_iterator = base_iterator
    self._n_batches = n_batches
    self._todoq = IterableQueue(qcap)
    self._doneq = IterableQueue(qcap)
    self._enqueue_fn = enqueue_fn
    self._workers = []
    self._end_sigq = Queue()

    # A thread is used for the producer side to avoid pickling,
    # particularly since that work is fairly lightweight.
    producer_args = (self._todoq.get_producer(), self._end_sigq)
    self._producer = Thread(
        target=self.enq_examples_for_workers, args=producer_args)

    worker_ids = range(1, n_workers + 1)
    for wid in worker_ids:
        raw_source = self._todoq.get_consumer()
        result_sink = self._doneq.get_producer()
        positional = (raw_source, result_sink, wid, batch_processor_cls)
        child = mp.Process(
            target=BatchQueue._worker_loop,
            args=positional + args,
            kwargs=kwargs,
        )
        child.start()
        self._workers.append(child)

    # The parent's own consumer endpoint must exist before the done queue
    # is closed to new endpoints.
    self._main_done = self._doneq.get_consumer()
    self._producer.start()
    self._todoq.close()
    self._doneq.close()
def extract_all_features(articles_dir, limit=None):
    """Extract features from the articles in a directory using worker processes.

    The file names returned by ``get_fnames(articles_dir)`` (at most
    ``limit`` of them when ``limit`` is given) are queued for
    NUM_ARTICLE_LOADING_PROCESSES worker processes running
    ``extract_features_from_articles``. Whatever the workers put on the
    results queue is merged into a single accumulator.

    Args:
        articles_dir: directory handed to ``get_fnames``.
        limit: optional cap on the number of file names processed.

    Returns:
        The object created by ``make_feature_accumulator()`` after every
        worker result has been merged into it.
    """
    start = time.time()

    # First, make an iterable queue and load all the article fnames onto it.
    fnames_q = IterableQueue()
    fnames_producer = fnames_q.get_producer()
    for fname in get_fnames(articles_dir)[:limit]:
        fnames_producer.put(fname)
    fnames_producer.close()

    # Make a queue to hold feature stats (results), and a consumer to
    # receive completed feature stats objects from workers.
    features_q = IterableQueue()
    features_consumer = features_q.get_consumer()

    # Create workers that consume filenames and produce feature counts.
    workers = []
    for _ in range(NUM_ARTICLE_LOADING_PROCESSES):
        fnames_consumer = fnames_q.get_consumer()
        features_producer = features_q.get_producer()
        worker = Process(
            target=extract_features_from_articles,
            args=(fnames_consumer, features_producer, False),
        )
        worker.start()
        workers.append(worker)

    # Close the iterable queues: no more endpoints will be requested.
    fnames_q.close()
    features_q.close()

    # Accumulate the results. This blocks until workers are finished.
    feature_accumulator = make_feature_accumulator()
    for accumulator in features_consumer:
        feature_accumulator.merge(accumulator)

    # Every result has been consumed, so the workers are exiting; reap them
    # instead of leaving unjoined child processes behind.
    for worker in workers:
        worker.join()

    elapsed = time.time() - start
    # Single-argument call form prints the same text on Python 2 and 3.
    print('elapsed %s' % elapsed)

    return feature_accumulator
# If we encounter corruption in the archive, log or print a warning # and proceed with the processing of what was extracted so far. except IOError, e: message = '%s\tlast file was: %s' % (str(e), member.name) if log: log.write(message) else: print message # We're done adding files to the queue fnames_producer.close() # Make a queue to hold feature stats (results), and a consumer to # receive completed feature stats objects from workers features_q = IterableQueue() features_consumer = features_q.get_consumer() # Create workers that consume filenames and produce feature counts. for p in range(NUM_ARTICLE_LOADING_PROCESSES): fnames_consumer = fnames_q.get_consumer() features_producer = features_q.get_producer() process = Process(target=extract_features_from_articles, args=(fnames_consumer, features_producer, 'content')) process.start() # We're done making endpoints for the queues fnames_q.close() features_q.close() # We're going to accumulate the results. Make some containers for that.
def generate_dataset_parallel(self, save_dir=None):
    '''
    Generator that spreads dataset generation over self.num_processes
    worker processes and yields every (signal_macrobatch, noise_macrobatch)
    pair the workers put on the macrobatch queue.

    When save_dir is given, save_dir and its 'examples' subdirectory are
    created if missing (their parents are not).
    NOTE(review): nothing in this method writes files there or passes the
    path to the workers -- confirm whether saving happens elsewhere.

    Raises DataSetReaderIllegalStateException when is_prepared() is false;
    since this is a generator, that happens on the first iteration.
    '''
    # prepare() must have run first, unless a prepared UnigramDictionary
    # was handed to the constructor.
    if not self.is_prepared():
        raise DataSetReaderIllegalStateException(
            "DatasetReader: generate_examples() cannot be called "
            "before prepare() is called unless a prepared "
            "UnigramDictionary has was passed into the "
            "DatasetReader's constructor.")

    # Create save_dir, then its 'examples' subdir, but never their parents.
    if save_dir is not None:
        for directory in (save_dir, os.path.join(save_dir, 'examples')):
            if not os.path.exists(directory):
                os.mkdir(directory)

    file_queue = IterableQueue()
    macrobatch_queue = IterableQueue(self.max_queue_size)

    # Load every filename onto the file queue up front.
    filename_sink = file_queue.get_producer()
    for path in self.generate_filenames():
        filename_sink.put(path)
    filename_sink.close()

    # Start the worker processes.
    for _ in range(self.num_processes):
        # Hop to a new location in the random-number-generator's state
        # chain before each child is started.
        reseed()
        worker_args = (
            file_queue.get_consumer(),
            macrobatch_queue.get_producer(),
        )
        child = Process(target=self.generate_dataset_worker, args=worker_args)
        child.start()

    # This endpoint receives the macrobatches from all workers; it has to
    # be taken before the queues are closed.
    results = macrobatch_queue.get_consumer()
    file_queue.close()
    macrobatch_queue.close()

    for signal_macrobatch, noise_macrobatch in results:
        if self.verbose:
            print('receiving macrobatch from child process')
        yield signal_macrobatch, noise_macrobatch

    # Explicitly close the consumer, which hopefully fixes the EOFError.
    results.close()
def generate_dataset_parallel(self, save_dir=None):
    '''
    Parallel version of generate_dataset_serial. This is a generator: it
    starts self.num_processes worker processes running
    self.generate_dataset_worker and yields each
    (signal_macrobatch, noise_macrobatch) pair they produce.

    When save_dir is given, save_dir and its 'examples' subdirectory are
    created if missing (their parents are not).
    NOTE(review): the code that wrote 'save_dir/examples/<batch-num>.npz'
    from this method was commented out and has been removed; nothing here
    saves macrobatches or passes the path to the workers. Confirm whether
    saving is still expected.

    Raises DataSetReaderIllegalStateException when self.prepared is falsy;
    since this is a generator, that happens on the first iteration.
    '''
    # This cannot be called before calling prepare(), unless a prepared
    # UnigramDictionary was passed to the self's constructor.
    if not self.prepared:
        raise DataSetReaderIllegalStateException(
            "DatasetReader: generate_examples() cannot be called "
            "before prepare() is called unless a prepared "
            "UnigramDictionary has was passed into the DatasetReader's "
            "constructor."
        )

    # We are willing to create both the save_dir, and the 'examples'
    # subdir, but not their parents.
    if save_dir is not None:
        examples_dir = os.path.join(save_dir, 'examples')
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        if not os.path.exists(examples_dir):
            os.mkdir(examples_dir)

    file_queue = IterableQueue()
    macrobatch_queue = IterableQueue(self.max_queue_size)

    # Put all the filenames on a producer queue.
    file_producer = file_queue.get_producer()
    for filename in self.generate_filenames():
        file_producer.put(filename)
    file_producer.close()

    # Start a bunch of worker processes.
    for _ in range(self.num_processes):
        # Hop to a new location in the random-number-generator's state
        # chain.
        reseed()

        # Start child process that generates a portion of the dataset.
        args = (
            file_queue.get_consumer(),
            macrobatch_queue.get_producer(),
        )
        Process(target=self.generate_dataset_worker, args=args).start()

    # This will receive the macrobatches from all workers.
    macrobatch_consumer = macrobatch_queue.get_consumer()

    # Close the iterable queues.
    file_queue.close()
    macrobatch_queue.close()

    # Hand the macrobatches to the caller as the workers deliver them.
    for signal_macrobatch, noise_macrobatch in macrobatch_consumer:
        if self.verbose:
            # Call form is valid on both Python 2 and 3 for one argument.
            print('receiving macrobatch from child process')
        yield signal_macrobatch, noise_macrobatch
class BatchQueue:
    """
    Wraps an iterator with a parallel asynchronous mechanism for queuing
    multiple batches at the same time.

    Implemented as a pool of reusable processes over two iterable
    threadsafe queues, avoiding process creation and setup (e.g. load
    fasttext) overhead.

    The producer thread takes a batch from the base iterator and puts it on
    `todoq` (producer).
    A worker process takes a batch off `todoq` (consumer).
    The worker process processes the batch and places the pickled result on
    `doneq` (producer).
    The main process takes a processed batch off `doneq` (consumer).

    To cleanly end iteration prematurely, call close() on the BatchQueue
    object.
    """

    @staticmethod
    def _worker_loop(todoq, doneq, wid, proc_class, *args, **kwargs):
        """Process raw batches from `todoq` and put pickled results on `doneq`.

        Args:
            todoq: consumer endpoint yielding raw batches.
            doneq: producer endpoint; closed once `todoq` is exhausted.
            wid: worker id; not used inside the loop.
            proc_class: instantiated once as ``proc_class(*args, **kwargs)``;
                its ``process_batch`` is called on every raw batch.
        """
        device_id = 0
        world_size = 1
        _set_cuda(True, device_id, world_size)
        # One-time setup per worker.
        processor = proc_class(*args, **kwargs)
        for raw_batch in todoq:
            processed_batch = processor.process_batch(raw_batch)
            doneq.put(pickle.dumps(processed_batch))
        doneq.close()

    def __init__(
            self,
            base_iterator,
            n_batches,
            batch_processor_cls,
            enqueue_fn=None,
            n_workers=3,
            qcap=5,
            *args,
            **kwargs):
        """Create the queues, start the worker processes and the producer thread.

        Args:
            base_iterator: iterable of raw batches.
            n_batches: value reported by ``len()`` on this object.
            batch_processor_cls: class instantiated inside each worker with
                ``*args`` / ``**kwargs``.
            enqueue_fn: optional callable applied to each batch before it is
                queued for the workers.
            n_workers: number of worker processes to start.
            qcap: value passed to both IterableQueue constructors; logged as
                the max queue size.
        """
        LOG.info("BatchQueue: n_workers=%d max queue size=%d", n_workers, qcap)
        self._base_iterator = base_iterator
        self._n_batches = n_batches
        self._todoq = IterableQueue(qcap)
        self._doneq = IterableQueue(qcap)
        self._enqueue_fn = enqueue_fn
        self._workers = []
        self._end_sigq = Queue()

        # use threading here to avoid pickling, particularly since this
        # process is fairly lightweight.
        self._producer = Thread(
            target=self.enq_examples_for_workers,
            args=(self._todoq.get_producer(), self._end_sigq),
        )

        for wid in range(1, n_workers + 1):
            worker_todo = self._todoq.get_consumer()
            worker_done = self._doneq.get_producer()
            w = mp.Process(
                target=BatchQueue._worker_loop,
                args=(worker_todo, worker_done, wid, batch_processor_cls, *args),
                kwargs=kwargs,
            )
            w.start()
            self._workers.append(w)

        # All endpoints must be requested before the queues are closed.
        self._main_done = self._doneq.get_consumer()
        self._producer.start()
        self._todoq.close()
        self._doneq.close()

    def enq_examples_for_workers(self, todo_queue, end_queue):
        """Feed batches from the base iterator to the workers.

        Runs on the producer thread. Each batch is optionally passed through
        ``enqueue_fn`` and then put on `todo_queue`, retrying once per
        second while the queue is full. Feeding stops early as soon as
        anything is present on `end_queue`.

        `todo_queue` is closed on every exit path, including an exception
        raised by the base iterator or ``enqueue_fn``, so that the workers
        are never left waiting on an endpoint that will not be closed.
        """
        LOG.debug("BatchQueue: producer thread started")
        try:
            for batch in self._base_iterator:
                if self._enqueue_fn:
                    batch = self._enqueue_fn(batch)
                while True:
                    try:
                        todo_queue.put(batch, block=True, timeout=1)
                        enqueued = True
                    except Full:
                        enqueued = False
                        # yield control to other threads before retrying
                        time.sleep(0)
                    # Stop putting stuff in the queue if end was signaled.
                    # This check used to sit in a `finally` with a `return`,
                    # which silently discarded any unexpected exception.
                    if not end_queue.empty():
                        return
                    if enqueued:
                        break
        finally:
            todo_queue.close()

    def close(self):
        """Stop the producer and drain the pending results.

        Note: must be called explicitly since putting this in `__del__`
        doesn't work.
        """
        # stop generating data
        self._end_sigq.put("stop")
        # Drain the queue
        for _ in self._main_done:
            pass
        # note this cannot be done before draining
        self._producer.join()

    def __iter__(self):
        """Yield processed batches, unpickled, as the workers deliver them."""
        for item in self._main_done:
            # Items are pickled by this class's own worker processes.
            yield pickle.loads(item)

    def __len__(self):
        """Return the ``n_batches`` value given at construction."""
        return self._n_batches
def get_async_batch_iterator(self):
    '''
    Set up a three-stage asynchronous minibatching pipeline and return
    the consumer endpoint of its last stage.

    Stages, each connected by its own IterableQueue:
      1. every filename from self.generate_filenames() is queued;
      2. self.num_example_generators processes run
         self.generate_examples_async, reading filenames and writing to
         the example queue;
      3. one process runs self.generate_minibatches_async, reading the
         example queue and writing to the minibatch queue.

    (no Inputs)

    OUTPUTS
    * [iterable (IterableQueue.ConsumerQueue)]: Iterable of minibatches;
      also stored as self.minibatch_consumer.
    '''
    # TODO: currently the only randomness in minibatching comes from
    # the signal context and noise contexts that are drawn for a
    # given entity query tuple. But the entity query tuples are read
    # deterministically in order through the corpus. Ideally examples
    # should be totally shuffled.
    file_queue = IterableQueue()
    example_queue = IterableQueue()
    minibatch_queue = IterableQueue()

    # Stage 1: fill the file queue.
    filename_sink = file_queue.get_producer()
    for path in self.generate_filenames():
        filename_sink.put(path)
    filename_sink.close()

    # Stage 2: example generators, one process each.
    for _ in range(self.num_example_generators):
        generator_args = (
            file_queue.get_consumer(),
            example_queue.get_producer(),
        )
        Process(
            target=self.generate_examples_async, args=generator_args
        ).start()

    # Stage 3: a single process that packages examples into minibatches.
    batcher_args = (
        example_queue.get_consumer(),
        minibatch_queue.get_producer(),
    )
    Process(target=self.generate_minibatches_async, args=batcher_args).start()

    # The endpoint handed to the caller has to be taken before the queues
    # are closed.
    self.minibatch_consumer = minibatch_queue.get_consumer()

    for queue in (file_queue, example_queue, minibatch_queue):
        queue.close()

    return self.minibatch_consumer
def __iter__(self):
    '''
    Start a fresh asynchronous minibatching pipeline and return the
    consumer endpoint of its last stage.

    Filenames from self.generate_filenames() feed
    self.num_example_generators processes running
    self.generate_examples_async; their output feeds one process running
    self.generate_minibatches_async, whose output queue is what the
    caller iterates.

    (no Inputs)

    OUTPUTS
    * [iterable (IterableQueue.ConsumerQueue)]: Iterable of minibatches;
      also stored as self.minibatch_consumer.
    '''
    # TODO: currently the only randomness in minibatching comes from
    # the signal context and noise contexts that are drawn for a
    # given entity query tuple. But the entity query tuples are read
    # deterministically in order through the corpus. Ideally examples
    # should be totally shuffled.
    file_queue = IterableQueue()
    example_queue = IterableQueue()
    minibatch_queue = IterableQueue()

    # Queue up every filename, then close that endpoint.
    filename_sink = file_queue.get_producer()
    for path in self.generate_filenames():
        filename_sink.put(path)
    filename_sink.close()

    # Processes that turn files into examples.
    for _ in range(self.num_example_generators):
        generator_args = (
            file_queue.get_consumer(),
            example_queue.get_producer(),
        )
        Process(
            target=self.generate_examples_async, args=generator_args
        ).start()

    # One process that turns examples into minibatches.
    batcher_args = (
        example_queue.get_consumer(),
        minibatch_queue.get_producer(),
    )
    Process(target=self.generate_minibatches_async, args=batcher_args).start()

    # Take the endpoint used for iteration before closing the queues.
    self.minibatch_consumer = minibatch_queue.get_consumer()

    for queue in (file_queue, example_queue, minibatch_queue):
        queue.close()

    # Randomness drawn in the child processes does not advance the random
    # state of this parent process, so without this draw every call to
    # `__iter__()` would produce the exact same minibatch sequence.
    # Sampling once here advances the parent's state.
    np.random.uniform()

    return self.minibatch_consumer
def fit(dictionary=None, files=None, dirs=None, match='', skip='$.^',
        batch_size=1000, num_topics=DEFAULT_NUM_TOPICS, time_range=None,
        alpha=None, beta=0.1, num_procs=NUM_PROCS, read=None, num_docs=None,
        min_frequency=5, num_epochs=100):
    """Run `num_epochs` rounds of parallel sampling and return the result.

    Each epoch starts `num_procs` spawned processes running `worker`, one
    per document fold, and rebuilds `m`, `n` and `psi` from the updates the
    workers send back over an IterableQueue.

    Args:
        dictionary, num_docs, time_range: corpus statistics. Any that are
            None are filled in from one pass of `get_corpus_stats`; values
            supplied by the caller are kept as given.
        files, dirs, match, skip, batch_size, read: forwarded to
            `DocumentIterator` (and to `get_corpus_stats`); `files` and
            `dirs` default to empty lists.
        num_topics: number of columns of `m` and `n`, rows of `psi`.
        alpha: defaults to 1.0 when None.
        beta: added to `n` when computing the shared denominator.
        num_procs: number of worker processes per epoch; `num_docs` is
            indexed by process number.
        min_frequency: forwarded to `get_corpus_stats`.
        num_epochs: number of epochs to run.

    Returns:
        Tuple ``(m, n, psi, dictionary)`` where `m` has shape
        (total_docs, num_topics), `n` has shape
        (len(dictionary), num_topics) and `psi` has shape (num_topics, 2).
    """
    # Fresh lists per call instead of shared mutable defaults.
    files = [] if files is None else files
    dirs = [] if dirs is None else dirs

    # If any corpus statistic is missing, run over the full dataset once to
    # accumulate it. Only the missing values are filled in, so a dictionary
    # or num_docs passed by the caller is no longer overwritten.
    if dictionary is None or num_docs is None or time_range is None:
        found_dictionary, found_num_docs, found_time_range = get_corpus_stats(
            files=files, dirs=dirs, match=match, skip=skip,
            batch_size=batch_size, num_procs=num_procs, read=read,
            stopwords=STOPWORDS, min_frequency=min_frequency)
        if dictionary is None:
            dictionary = found_dictionary
        if num_docs is None:
            num_docs = found_num_docs
        if time_range is None:
            time_range = found_time_range

    if alpha is None:
        alpha = 1.

    total_docs = sum(num_docs)

    # Running totals: process i owns rows
    # proc_doc_indices[i]:proc_doc_indices[i + 1] of m.
    proc_doc_indices = [0]
    for count in num_docs:
        proc_doc_indices.append(proc_doc_indices[-1] + count)

    m = np.ones((total_docs, num_topics))
    n = np.ones((len(dictionary), num_topics))
    psi = np.ones((num_topics, 2))

    ctx = mp.get_context("spawn")

    # TODO: move worker creation outside of the epoch -- keep same worker
    # pool between epochs. Workers can receive updates about m and n etc.
    # over the queue.
    for epoch in range(num_epochs):

        # Show progress as a percentage.
        print(float(epoch) / num_epochs * 100)

        # Pre-calculate the denominator in the sum of the probability dist.
        n_denom = (n + beta).sum(axis=0) - 1
        B = np.array([beta_func(*psi_vals) for psi_vals in psi])
        denom = n_denom * B

        # The workers should calculate probabilities and then sample,
        # producing updates to m and n.
        updates_queue = IterableQueue()
        for proc_num in range(num_procs):

            # Advance the randomness so children don't all get same seed.
            np.random.random()

            doc_iterator = DocumentIterator(
                read=read, files=files, dirs=dirs, match=match, skip=skip,
                batch_size=batch_size,
                fold='%s/%s' % (proc_num, num_procs),
            )
            m_slice = m[
                proc_doc_indices[proc_num]:proc_doc_indices[proc_num + 1]]
            p = ctx.Process(
                target=worker,
                args=(
                    proc_num, doc_iterator, dictionary, num_topics,
                    time_range, alpha, beta, psi, n, m_slice, denom,
                    updates_queue.get_producer(),
                ),
            )
            p.start()

        updates_consumer = updates_queue.get_consumer()
        updates_queue.close()

        # Update m, n, and psi: n is summed over the workers, m is
        # assembled from the per-process slices.
        n = np.zeros((len(dictionary), num_topics))
        m = np.zeros((total_docs, num_topics))
        psi_updates = [[] for _ in range(num_topics)]
        for proc_num, m_update, n_update, psi_update in updates_consumer:
            n += n_update
            start_idx = proc_doc_indices[proc_num]
            stop_idx = proc_doc_indices[proc_num + 1]
            m[start_idx:stop_idx] = m_update
            for i in range(num_topics):
                psi_updates[i].extend(psi_update[i])

        # Update psi from the values pooled across all workers.
        for i in range(num_topics):
            psi[i] = fit_psi(psi_updates[i])

    return m, n, psi, dictionary
def get_corpus_stats(files=None, dirs=None, match='', skip='$.^',
                     batch_size=1000, num_procs=NUM_PROCS, read=None,
                     stopwords=STOPWORDS, min_frequency=5):
    """
    Build a dictionary by running through the dataset fully. Prune back
    according to min_frequency. Ignore stopwords given. This dictionary
    facilitates the conversion between tokens and integers.

    `num_procs` spawned processes run `dictionary_worker`, one per document
    fold, and report back over three IterableQueues.

    Args:
        files, dirs, match, skip, batch_size, read: forwarded to
            `DocumentIterator`; `files` and `dirs` default to empty lists.
        num_procs: number of worker processes (and folds).
        stopwords: forwarded to every worker.
        min_frequency: passed to `dictionary.prune`.

    Returns:
        Tuple ``(dictionary, num_docs, time_range)``: the merged and pruned
        UnigramDictionary, the per-process document counts ordered by
        process number, and ``(min, max)`` over all reported times, widened
        by 1% of the span on each side.
    """
    # Fresh lists per call instead of shared mutable defaults.
    files = [] if files is None else files
    dirs = [] if dirs is None else dirs

    # Start many workers. Each will make a dictionary over a subset of the
    # documents. They return their dictionaries over a queue.
    worker_dictionary_queue = IterableQueue()
    worker_num_docs_queue = IterableQueue()
    worker_time_range_queue = IterableQueue()
    ctx = mp.get_context('spawn')
    for proc_num in range(num_procs):
        doc_iterator = DocumentIterator(
            read=read, files=files, dirs=dirs, match=match, skip=skip,
            batch_size=batch_size,
            fold='%s/%s' % (proc_num, num_procs),
        )
        args = (
            proc_num,
            doc_iterator,
            worker_dictionary_queue.get_producer(),
            worker_num_docs_queue.get_producer(),
            worker_time_range_queue.get_producer(),
            stopwords,
        )
        p = ctx.Process(target=dictionary_worker, args=args)
        p.start()

    # Collect the workers' dictionaries into one.
    worker_dictionary_consumer = worker_dictionary_queue.get_consumer()
    worker_dictionary_queue.close()
    dictionary = UnigramDictionary()
    for worker_dictionary in worker_dictionary_consumer:
        dictionary.add_dictionary(worker_dictionary)

    # Prune rare words from the dictionary.
    dictionary.prune(min_frequency)

    # Get the number of documents for each process, ordered by proc_num.
    worker_num_docs_consumer = worker_num_docs_queue.get_consumer()
    worker_num_docs_queue.close()
    num_docs = [count for proc_num, count in sorted(worker_num_docs_consumer)]

    # Get time range for all documents.
    worker_time_range_consumer = worker_time_range_queue.get_consumer()
    worker_time_range_queue.close()
    time_ranges = list(worker_time_range_consumer)
    if time_ranges:
        # Take the true extremes. The previous fixed starting values
        # (999999999999 and 0) gave a wrong range for timestamps above the
        # former or below the latter.
        minimum_t = min(min_time for min_time, _ in time_ranges)
        maximum_t = max(max_time for _, max_time in time_ranges)
    else:
        # Nothing reported: keep the values the original code fell back to.
        minimum_t = 999999999999
        maximum_t = 0

    # Buffering with 1% on both sides to ensure a nonzero chance for each
    # document.
    time_difference = maximum_t - minimum_t
    wiggle_room = time_difference / 100
    time_range = (minimum_t - wiggle_room, maximum_t + wiggle_room)

    # Return the completed, pruned dictionary, document counts and time
    # range.
    return dictionary, num_docs, time_range