from multiprocessing import JoinableQueue, Pipe

import cPickle


class CapturePlugin(BaseDronePlugin):

    def __init__(self, interfaces, channel, drone):
        BaseDronePlugin.__init__(self, interfaces, channel, drone,
                                 "CapturePlugin.{0}".format(channel))
        self.logutil.log("Initializing")
        # Select the first interface and tune it to the target channel
        try:
            self.kb = self.interfaces[0]
            self.kb.set_channel(self.channel)
            self.kb.active = True
        except Exception as e:
            self.logutil.log("Failed to use interface: {0}".format(e))
            self.status = False
        # Pipe from the tasker to the filter module, used to send pickled
        # tasking dictionaries (simple DictManager)
        recv_pconn, recv_cconn = Pipe()
        task_pconn, self.task_cconn = Pipe()
        self.task_queue = JoinableQueue()
        # Start the filter up
        self.p_filt = FilterProcess(recv_pconn, self.task_queue, self.done_event,
                                    self.task_update_event, self.drone, self.name)
        self.p_filt.start()
        self.logutil.log("Launched FilterProcess ({0})".format(self.p_filt.pid))
        self.childprocesses.append(self.p_filt)
        # Start the receiver up
        self.p_recv = SnifferProcess(recv_cconn, self.kb, self.done_event,
                                     self.drone, self.name)
        self.p_recv.start()
        self.logutil.log("Launched SnifferProcess: ({0})".format(self.p_recv.pid))
        self.childprocesses.append(self.p_recv)

    def task(self, uuid, data):
        self.logutil.log("Adding Task: {0}".format(uuid))
        if uuid in self.tasks:
            return False
        self.tasks[uuid] = data
        self.__update_filter_tasking()
        return True

    def detask(self, uuid):
        if uuid not in self.tasks:
            return False
        del self.tasks[uuid]
        if len(self.tasks) == 0:
            # No tasks remain, so shut the whole plugin down
            self.logutil.log("No remaining tasks, shutting down plugin")
            self.shutdown()
            # TODO return something to indicate a total shutdown also
        else:
            # We made a change to tasking, so push it to the filter
            self.__update_filter_tasking()
        return True

    def __update_filter_tasking(self):
        self.logutil.log("Sending Task Updates to FilterProcess")
        self.task_queue.put_nowait(cPickle.dumps(self.tasks))
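
# A minimal sketch of the consuming side of task_queue. The real
# FilterProcess is not shown in this snippet, so the class below is an
# assumption: it only illustrates how a JoinableQueue consumer would pair
# each get() with task_done() and unpickle the tasking dict.
from Queue import Empty
from multiprocessing import Process

import cPickle


class FilterProcessSketch(Process):
    def __init__(self, task_queue, done_event):
        Process.__init__(self)
        self.task_queue = task_queue
        self.done_event = done_event
        self.tasks = {}

    def run(self):
        while not self.done_event.is_set():
            try:
                # Pickled tasking dict sent by CapturePlugin.__update_filter_tasking()
                pickled = self.task_queue.get(timeout=1)
            except Empty:
                continue
            self.tasks = cPickle.loads(pickled)
            self.task_queue.task_done()  # balance the get() so join() can return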
import requests
from multiprocessing import JoinableQueue


def downloadFile(sourceUrl: str, inboundQueue: JoinableQueue):
    """Stream the file at sourceUrl and place each chunk into a queue for consumption.

    Args:
        sourceUrl (str): The URL of the source data.
        inboundQueue (JoinableQueue): A queue from the multiprocessing module.
    """
    streamingReadFromURL = requests.get(sourceUrl, stream=True)
    for chunk in streamingReadFromURL.iter_lines(chunk_size=65536):
        inboundQueue.put_nowait(chunk)
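
# A minimal, assumed consumer for downloadFile: the None-sentinel convention
# and the writeChunks worker below are hypothetical, not part of the original.
# They show how put_nowait() pairs with task_done()/join() on a JoinableQueue.
from multiprocessing import JoinableQueue, Process


def writeChunks(inboundQueue: JoinableQueue, outPath: str):
    # Drain chunks until a None sentinel appears.
    with open(outPath, "wb") as out:
        while True:
            chunk = inboundQueue.get()
            if chunk is None:
                inboundQueue.task_done()
                break
            out.write(chunk + b"\n")
            inboundQueue.task_done()


if __name__ == "__main__":
    queue = JoinableQueue()
    writer = Process(target=writeChunks, args=(queue, "data.txt"))
    writer.start()
    downloadFile("https://example.com/data.txt", queue)  # hypothetical URL
    queue.put_nowait(None)  # signal end of stream
    queue.join()            # block until every chunk has been task_done()'d
    writer.join()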
def __run_chm_test_procs(mems, model, regions, ntasks, nthreads):
    """Starts ntasks processes running __run_chm_test_proc then calls __run_chm_test_parallel."""
    from multiprocessing import JoinableQueue, Process
    from time import sleep
    print("Running CHM test with %d task%s and %d thread%s per task" %
          (ntasks, 's' if ntasks > 1 else '', nthreads, 's' if nthreads > 1 else ''))
    nthreads_full = ntasks * nthreads

    # Start the child processes
    q = JoinableQueue()
    args = (mems, model, nthreads, q)
    processes = [Process(target=__run_chm_test_proc, name="CHM-test-%d" % p, args=args)
                 for p in xrange(ntasks)]
    for p in processes:
        p.daemon = True
        p.start()
    sleep(0)

    # Run the CHM-test in parallel
    try:
        out = __run_chm_test_parallel(mems, model, regions, q, processes, nthreads_full)
    except:
        __clear_queue(q)
        __kill_processes(processes)
        raise

    # Tell all processes we are done and make sure they all actually terminate
    for _ in xrange(ntasks):
        q.put_nowait(None)
    q.close()
    q.join()
    q.join_thread()
    for p in processes:
        p.join()

    # Done! Return the output image
    return out
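
# __run_chm_test_proc itself is not shown. Given the None sentinels and the
# q.join() above, a plausible shape for the worker is the loop below; this is
# an assumption, not the project's actual implementation.
def __run_chm_test_proc(mems, model, nthreads, q):
    while True:
        tile = q.get()
        if tile is None:
            q.task_done()  # acknowledge the sentinel so q.join() can return
            break
        try:
            pass  # ... run the CHM test on `tile` using `model` and `nthreads` ...
        finally:
            q.task_done()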
import datetime
from multiprocessing import JoinableQueue, Process, cpu_count
from Queue import Empty


def main(factor=2):
    # E.g. if the machine has 2 cores, 2 * factor processes are spawned.
    files_to_download = JoinableQueue()
    result_queue = JoinableQueue()
    time_taken = JoinableQueue()
    time_taken_to_read_from_queue = JoinableQueue()

    with open('downloads.txt', 'r') as f:
        for to_download in f:
            files_to_download.put_nowait(to_download.rstrip('\n'))
    files_to_download_size = files_to_download.qsize()

    cores = cpu_count()
    no_of_processes = cores * factor
    # One None sentinel per worker so each one knows when to stop.
    for i in xrange(no_of_processes):
        files_to_download.put_nowait(None)

    jobs = []
    start = datetime.datetime.now()
    for name in xrange(no_of_processes):
        p = Process(target=download,
                    args=(files_to_download, result_queue,
                          time_taken, time_taken_to_read_from_queue, name))
        p.start()
        jobs.append(p)
    for job in jobs:
        job.join()

    print(result_queue.qsize())
    total_downloaded_urls = 0
    try:
        while 1:
            total_downloaded_urls += result_queue.get_nowait()
    except Empty:
        pass

    # A datetime difference cannot be added to an int (0 + timedelta fails),
    # so the first value read from the queue initializes total_time instead
    # of being added to it; locals() is used to detect that first read.
    try:
        while 1:
            if 'total_time' in locals():
                total_time += time_taken.get_nowait()
            else:
                total_time = time_taken.get_nowait()
    except Empty:
        print("{0} processes on {1} core machine took {2} time to download {3} urls"
              .format(no_of_processes, cores, total_time, total_downloaded_urls))

    try:
        while 1:
            if 'queue_reading_time' in locals():
                queue_reading_time += time_taken_to_read_from_queue.get_nowait()
            else:
                queue_reading_time = time_taken_to_read_from_queue.get_nowait()
    except Empty:
        print("{0} processes on {1} core machine took {2} time to read {3} urls from queue"
              .format(no_of_processes, cores, queue_reading_time,
                      files_to_download_size))
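
# The download worker is not included in the snippet. Given the None
# sentinels and the three result queues, it plausibly looks like this;
# every detail below (urllib2, the timing scheme) is assumed.
import datetime
import urllib2


def download(files_to_download, result_queue, time_taken,
             time_taken_to_read_from_queue, name):
    downloaded = 0
    total = datetime.timedelta()
    while True:
        read_start = datetime.datetime.now()
        url = files_to_download.get()
        # Record how long this process waited on the queue.
        time_taken_to_read_from_queue.put_nowait(
            datetime.datetime.now() - read_start)
        if url is None:
            files_to_download.task_done()
            break
        fetch_start = datetime.datetime.now()
        urllib2.urlopen(url).read()
        total += datetime.datetime.now() - fetch_start
        downloaded += 1
        files_to_download.task_done()
    result_queue.put_nowait(downloaded)
    time_taken.put_nowait(total)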
import json
import re
import time
from multiprocessing import JoinableQueue, Process

import requests
from lxml import etree


class QiubaiSpider(object):

    def __init__(self):
        self.url_pattern = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/68.0.3440.75 Safari/537.36'
        }
        # 1. Create the URL queue, the response queue and the data queue.
        self.url_queue = JoinableQueue()
        self.page_queue = JoinableQueue()
        self.data_queue = JoinableQueue()

    def add_url_to_queue(self):
        """Add the page URLs to the URL queue."""
        for i in range(1, 14):
            url = self.url_pattern.format(i)
            self.url_queue.put_nowait(url)

    def add_page_to_queue(self):
        """Take a URL from the URL queue, request it, and put the response body on the response queue."""
        while True:
            url = self.url_queue.get()
            response = requests.get(url, headers=self.headers)
            if response.status_code != 200:
                # The request failed, so put the URL back on the queue to retry.
                self.url_queue.put(url)
            else:
                # Add the response body to the response queue.
                self.page_queue.put(response.content.decode())
            # The URL has been handled, so mark the task as done.
            self.url_queue.task_done()

    def add_data_to_queue(self):
        """Take a response from the response queue, extract the data, and put it on the data queue."""
        while True:
            page = self.page_queue.get()
            element = etree.HTML(page)
            divs = element.xpath('//*[@id="content-left"]/div')
            # XPath extraction rule: group first, then extract the content.
            data_list = []
            for div in divs:
                # A dict holds the fields of one post.
                data = {}
                imgs = div.xpath('./div[1]/a[1]/img/@src')
                # The src attribute is protocol-relative ("//..."), so prepend "https:".
                data['header_img'] = 'https:' + imgs[0] if len(imgs) != 0 else None
                data['name'] = self.get_first_element(
                    div.xpath('./div[1]/a[2]/h2/text()'))
                gender_class = div.xpath('./div[1]/div/@class')
                if len(gender_class) != 0:
                    data['gender'] = re.findall('articleGender (.+?)Icon',
                                                gender_class[0])[0]
                data['content'] = ''.join(
                    [text.strip() for text in div.xpath('./a/div/span//text()')])
                data['vote'] = self.get_first_element(
                    div.xpath('./div[2]/span[1]/i/text()'))
                data['comments'] = self.get_first_element(
                    div.xpath('./div[2]/span[2]/a/i/text()'))
                data_list.append(data)
            # Add the extracted records to the data queue.
            self.data_queue.put(data_list)
            # The page has been handled.
            self.page_queue.task_done()

    def get_first_element(self, lis):
        return lis[0].strip() if len(lis) != 0 else None

    def save_data(self):
        """Persist the extracted data."""
        while True:
            data_list = self.data_queue.get()
            with open('糗百_多进程版.jsonlines', 'a', encoding='utf8') as f:
                for data in data_list:
                    json.dump(data, f, ensure_ascii=False)
                    f.write('\n')
            # The data batch has been handled.
            self.data_queue.task_done()

    def execute_task(self, task, count):
        """
        Run a task in worker processes.

        :param task: the task function
        :param count: the number of processes to start
        """
        for i in range(count):
            t = Process(target=task)
            t.daemon = True
            t.start()

    def run(self):
        self.execute_task(self.add_url_to_queue, 1)
        self.execute_task(self.add_page_to_queue, 2)
        self.execute_task(self.add_data_to_queue, 2)
        self.execute_task(self.save_data, 2)
        # If the program exits almost immediately, wait a moment here so the
        # workers have a chance to put their first items on the queues.
        time.sleep(1)
        # Block the main process until every queue has been fully processed.
        self.url_queue.join()
        self.page_queue.join()
        self.data_queue.join()
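
# Entry point (not shown in the original, but required to run the spider):
if __name__ == '__main__':
    spider = QiubaiSpider()
    spider.run()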
class ImagesBatcher(AbstractDataBatcher):

    def __init__(self,
                 queue_size,
                 batch_size,
                 data_sampler,
                 image_processor=None,
                 audio_processor=None,
                 single_epoch=False,
                 cache_data=False,  # TODO: implement me!
                 disk_reader_process_num=1):
        """
        Class for creating a sequence of data batches for training or validation.

        :param queue_size: queue size for batch readers
        :param batch_size: size of the generated batches
        :param data_sampler: knows how to sample batches from the dataset
        :param image_processor: image reading and preprocessing routine
        :param audio_processor: audio reading and preprocessing routine
        :param single_epoch: if enabled, the batcher finishes each epoch with a None batch
        :param cache_data: do we need to store all data in batcher memory?
        :param disk_reader_process_num: how many disk reader processes do we need?
        """
        # Note: super() is called with ImagesBatcher, not AbstractDataBatcher,
        # so that AbstractDataBatcher.__init__ is not skipped in the MRO.
        super(ImagesBatcher, self).__init__()
        # set parameters
        self.batch_size = batch_size
        self.epoch_is_finished = False
        self.batch_queue_balance = 0
        if single_epoch:
            self.sampler_external_info = type('sampler_external_info', (object,),
                                              dict(single_epoch=True))
        else:
            self.sampler_external_info = None
        # parse the given dataset and init the data sampler
        self.data_sampler = data_sampler
        # set queues; -1 means "large enough to hold one epoch of batches"
        if queue_size == -1:
            queue_size = self.data_sampler.dataset_size() // self.batch_size + 1
        self.task_queue = JoinableQueue(queue_size)
        self.batch_queue = JoinableQueue(queue_size)
        # init the batch disk readers
        self.data_readers = []
        print('disk_reader_process_num:', disk_reader_process_num)
        for i in range(disk_reader_process_num):
            self.data_readers.append(
                BatchDiskReader(self.task_queue, self.batch_queue,
                                image_processor, audio_processor))

    def start(self):
        self.epoch_is_finished = False
        # start the batch disk readers
        for reader in self.data_readers:
            reader.start()
        # fill the task queue with batches to start async reading from disk
        self.fill_task_queue()

    def fill_task_queue(self):
        try:
            while not self.task_queue.full():
                batch = self.data_sampler.sampling(self.batch_size,
                                                   self.sampler_external_info)
                if batch is None:
                    self.epoch_is_finished = True
                    break
                self.task_queue.put_nowait(batch)
                self.batch_queue_balance += 1
        except Exception as e:  # e.g. Queue.Full raised by put_nowait
            logger.error("ImagesBatcher: %s", e)

    def next_batch(self):
        """Returns the next batch of data, or None when a single epoch has finished."""
        if self.epoch_is_finished and self.batch_queue_balance == 0:
            self.epoch_is_finished = False
            self.fill_task_queue()
            return None
        batch = self.batch_queue.get(block=True)
        self.batch_queue.task_done()
        self.batch_queue_balance -= 1
        if not self.epoch_is_finished:
            # keep the task queue topped up
            self.fill_task_queue()
        return batch

    def update_sampler(self, target, logits, step, summary_writer):
        if hasattr(self.data_sampler, 'update'):
            labels = target.cpu().data.numpy()
            is_update_sampler = self.data_sampler.update(labels, logits, step,
                                                         summary_writer)
            #if is_update_sampler:
            #    self.clear_queue()

    def clear_queue(self):
        # Drain both queues, marking every item done so join() cannot deadlock.
        try:
            while True:
                self.task_queue.get_nowait()
                self.task_queue.task_done()
        except Exception:
            pass
        try:
            while True:
                self.batch_queue.get_nowait()
                self.batch_queue.task_done()
        except Exception:
            pass
        self.fill_task_queue()

    def finish(self):
        for data_reader in self.data_readers:
            data_reader.deactivate()
        while not self.task_queue.empty():
            self.task_queue.get()
            self.task_queue.task_done()
        is_anybody_alive = [data_reader.is_alive()
                            for data_reader in self.data_readers].count(True) > 0
        while not self.batch_queue.empty() or is_anybody_alive:
            try:
                self.batch_queue.get(timeout=1)
                self.batch_queue.task_done()
                is_anybody_alive = [data_reader.is_alive()
                                    for data_reader in self.data_readers].count(True) > 0
            except Exception:
                pass
        self.task_queue.join()
        self.batch_queue.join()
        for data_reader in self.data_readers:
            data_reader.join()
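
# A minimal usage sketch for the batcher. The sampler and the training step
# are external to this snippet, so the driver below is assumed rather than
# taken from the source; my_sampler is a placeholder.
batcher = ImagesBatcher(queue_size=-1, batch_size=32,
                        data_sampler=my_sampler, single_epoch=True)
batcher.start()
while True:
    batch = batcher.next_batch()
    if batch is None:  # with single_epoch=True, None marks the epoch end
        break
    # ... run a training/validation step on `batch` here ...
batcher.finish()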
def fit_for_all(drop_non_countries=False):
    """Main function to perform fit for all countries."""
    ####################################################################
    # Read files
    train_df = pd.read_csv(TRAIN_FILE, encoding='cp1252',
                           index_col='Country Name').dropna(axis=0)
    test_df = pd.read_csv(TEST_FILE, encoding='cp1252',
                          index_col='Country Name').dropna(axis=0)

    # The test_df has one extra country. Line up train and test.
    test_df = test_df.loc[train_df.index]

    if drop_non_countries:
        train_df = train_df.drop(NON_COUNTRIES)
        test_df = test_df.drop(NON_COUNTRIES)

    # Get matrices.
    train_mat = train_df.values.T.astype(int)
    test_mat = test_df.values.T.astype(int)

    # Grab list and number of countries for convenience.
    countries = train_df.index.values
    num_countries = countries.shape[0]

    # Initialize queues for parallel processing.
    queue_in = JoinableQueue()
    queue_out = Queue()

    # Start processes.
    processes = []
    for i in range(NUM_PROCESSES):
        p = Process(target=fit_for_country_worker,
                    args=(train_mat, test_mat, queue_in, queue_out))
        p.start()
        processes.append(p)

    # Loop over all the countries (columns of the train matrix) and enqueue
    # (index, count) pairs; each worker builds its own boolean mask from them.
    for i in range(num_countries):
        queue_in.put((i, num_countries))

    # Wait for processing to finish.
    queue_in.join()

    # Track coefficients.
    best_coeff = pd.DataFrame(0.0, columns=countries, index=countries)
    # Track training scores.
    best_scores = pd.Series(0.0, index=countries)
    # Track predictions.
    predictions = pd.DataFrame(0.0, columns=test_df.columns, index=countries)

    # Map data.
    for _ in range(num_countries):
        # Grab data from the queue.
        other_countries, s, c, p = queue_out.get()
        country = countries[~other_countries][0]
        # Map.
        best_scores.loc[~other_countries] = s
        best_coeff.loc[other_countries, country] = c
        # p needs to be transposed (17x1 vs 1x17).
        predictions.loc[~other_countries, :] = p.T

    # Shut down processes: send one sentinel per worker, then terminate.
    for p in processes:
        queue_in.put_nowait(None)
        p.terminate()

    predictions.transpose().to_csv(PRED_OUT, index_label='Id', encoding='cp1252')
    best_coeff.to_csv(COEFF_OUT, encoding='cp1252')

    # Print MSE
    print('Summary of MSE:')
    print(best_scores.describe())
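
# fit_for_country_worker is defined elsewhere. From the tuples exchanged
# above, its loop plausibly looks like this; the model fit is elided and the
# placeholder values are assumptions.
import numpy as np


def fit_for_country_worker(train_mat, test_mat, queue_in, queue_out):
    while True:
        item = queue_in.get()
        if item is None:  # sentinel sent during shutdown
            queue_in.task_done()
            break
        i, num_countries = item
        # Boolean mask selecting every country except the target one.
        other_countries = np.ones(num_countries, dtype=bool)
        other_countries[i] = False
        # ... fit a model predicting country i from the others, yielding a
        # score s, coefficients c and predictions p ...
        s, c, p = 0.0, None, None  # placeholders for the elided fit
        queue_out.put((other_countries, s, c, p))
        queue_in.task_done()  # lets queue_in.join() return in fit_for_all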
class EWProtocol(BaseProtocol):
    """
    Base class that contains shared functionality between the two proxies'
    comm protocols. Data sent over is buffered and lz4 compressed.
    """

    def __init__(self, factory, buff_class, handle_direction, other_factory,
                 buffer_wait):
        """
        Protocol args:
            factory: factory that made this protocol (subclass of EWFactory)
            other_factory: the other factory that communicates with this
                protocol (in this case an instance of MCProtocol)
            buffer_wait: amount of time to wait before sending buffered
                packets (in ms)
        """
        super().__init__(factory, buff_class, handle_direction, other_factory)
        self.buffer_wait = buffer_wait
        self.compressor_input_queue = JoinableQueue()
        self.compressor_output_queue = JoinableQueue()
        self.depressor_input_queue = JoinableQueue()
        self.depressor_output_queue = JoinableQueue()
        self.compression_handler = OutboxHandlerThread(
            self.compressor_output_queue, reactor.callFromThread, self.send_data)
        self.decompression_handler = OutboxHandlerThread(
            self.depressor_output_queue, reactor.callFromThread,
            super().dataReceived)
        self.compressors = []
        for x in range(COMP_THREADS):
            self.compressors.append(
                Compressor(self.compressor_input_queue,
                           self.compressor_output_queue))
        self.depressors = []
        for x in range(DEP_THREADS):
            self.depressors.append(
                Depressor(self.depressor_input_queue,
                          self.depressor_output_queue))

    def connectionMade(self):
        """
        Called when a connection is made
        """
        super().connectionMade()
        if self.factory.instance:  # Only one protocol can exist
            self.transport.loseConnection()
            return
        self.factory.instance = self
        # Start compressors and depressors
        for x in self.compressors:
            x.start()
        for x in self.depressors:
            x.start()
        # Start handlers
        self.compression_handler.start()
        self.decompression_handler.start()
        # Run self.send_buffered_packets every self.buffer_wait ms
        reactor.callLater(self.buffer_wait / 1000, self.send_buffered_packets)

    def connectionLost(self, reason):
        super().connectionLost(reason)
        # Remove factory instance
        self.factory.instance = None
        # Stop compressors and depressors
        for x in self.compressors:
            x.terminate()
        for x in self.depressors:
            x.terminate()
        try:
            self.compression_handler.kill()
            self.decompression_handler.kill()
        except Exception:
            pass
        # Stop handlers
        self.compression_handler.running = False
        self.decompression_handler.running = False
        self.compression_handler.join()
        self.decompression_handler.join()

    def dataReceived(self, data):
        """
        Called by twisted when data is received over tcp by the protocol
        """
        self.depressor_input_queue.put_nowait(data)

    def get_packet_name(self, id):
        """
        Get packet name from id. Meant to be overridden.

        Args:
            id: id of the packet

        Returns:
            name: name of the packet
        """
        try:
            info = packet_names[id]
        except KeyError:
            self.logger.error("No packet with id: {}".format(id))
            raise
        if self.handle_direction not in info[1]:
            self.logger.error("Wrong direction for packet id: {}".format(id))
            raise KeyError
        return info[0]

    def get_packet_id(self, name):
        """
        Get packet id from name. Meant to be overridden.

        Args:
            name: name of the packet

        Returns:
            id: id of the packet
        """
        try:
            info = packet_ids[name]
        except KeyError:
            self.logger.error("No packet with name: {}".format(name))
            raise
        if self.send_direction not in info[1]:
            self.logger.error("Wrong direction for packet name: {}".format(name))
            raise KeyError
        return info[0]

    def send_packet(self, name, *data):
        """
        Sends an ew packet to the other proxy
        """
        data = b"".join(data)  # Combine data
        data = self.buff_class.pack_varint(self.get_packet_id(name)) + data  # Prepend packet ID
        data = self.buff_class.pack_packet(data)  # Pack data as a packet
        self.compressor_input_queue.put_nowait(data)

    def send_data(self, data):
        """
        Callback for the compressor
        """
        self.transport.write(data)

    def send_buffered_packets(self):
        """
        Sends all packets in self.factory.input_buffer to the other proxy as a poem
        """
        # Schedule the next call
        reactor.callLater(self.buffer_wait / 1000, self.send_buffered_packets)
        if len(self.factory.input_buffer) < 1:  # Do not send empty packets
            return
        data = []
        for i in range(len(self.factory.input_buffer)):  # Per packet info
            uuid, packet_name, packet_data = self.factory.input_buffer.popleft()
            buff = packet_data.buff  # We don't use read because we need the entire buffer's data
            data.append(self.buff_class.pack_uuid(uuid))  # Pack uuid of client
            # TODO: Pass the id instead of the string name to save bandwidth?
            buff = self.buff_class.pack_string(packet_name) + buff  # Prepend packet name to buffer
            data.append(self.buff_class.pack_packet(buff))  # Append buffer as packet
            packet_data.discard()  # Buffer is no longer needed
        # Send poem
        self.send_packet("poem", *data)

    def packet_poem(self, buff):
        """
        Parses the poem and dispatches callouts with packet_mc_* callbacks.
        Also forwards the packets afterwards.
        """
        data = []
        try:
            while True:  # Unpack data until a BufferUnderrun
                uuid = buff.unpack_uuid()
                # The packet is unpacked here because the subclass will just forward it
                packet = buff.unpack_packet(self.buff_class)
                packet_name = packet.unpack_string()
                packet.save()
                data.append((uuid, packet_name, packet))
        except BufferUnderrun:
            pass
        buff.discard()  # Discard when done
        # Dispatch calls
        for packet in data:
            try:
                new_packet = self.dispatch(("mc", packet[1]), packet[0], packet[2])
            except BufferUnderrun:
                self.logger.info("Packet is too short: {}".format(packet[1]))
                continue
            # If nothing was returned, the packet should be sent as it was originally
            if not new_packet:
                new_packet = packet
            # Forward the packet; a None buffer explicitly means "do not send"
            if new_packet[2] is not None:
                try:
                    self.other_factory.get_client(new_packet[0]).send_packet(
                        new_packet[1], new_packet[2].buff)
                except KeyError:
                    # The client has disconnected already, ignore
                    pass
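
# The Compressor/Depressor worker processes are defined elsewhere. Given the
# lz4 framing mentioned in the class docstring, a worker of that shape might
# look like the following sketch, assuming the lz4.frame API; it is not the
# project's actual implementation.
import lz4.frame
from multiprocessing import Process


class CompressorSketch(Process):
    """Hypothetical worker: lz4-compress items from one queue onto another."""

    def __init__(self, input_queue, output_queue):
        super().__init__()
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        while True:
            data = self.input_queue.get()
            self.output_queue.put_nowait(lz4.frame.compress(data))
            self.input_queue.task_done()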
import datetime
from multiprocessing import JoinableQueue, Process

# Fragment: `parser` is an argparse.ArgumentParser created earlier, alongside
# the wos_only, sample_rate, must_cite and outfile arguments; wos_parser and
# pjk_writer are defined elsewhere.
parser.add_argument('-n', '--num-processes',
                    help="Number of subprocesses to start",
                    default=4, type=int)
parser.add_argument('-b', '--batch-size',
                    help="Number of entries to batch prior to transmission",
                    default=100, type=int)
parser.add_argument('-a', '--after',
                    help="Only include nodes published on or after this year")
parser.add_argument('-bf', '--benchmark_freq',
                    help="How often to emit benchmark info",
                    type=int, default=1000000)
parser.add_argument('infile', nargs='+')
arguments = parser.parse_args()

file_queue = JoinableQueue()
result_queue = JoinableQueue()

date_after = None
if arguments.after:
    date_after = datetime.datetime.strptime(arguments.after, "%Y")

# Enqueue every input file, followed by one 'STOP' sentinel per worker.
for file in arguments.infile:
    file_queue.put_nowait(file)
for i in range(arguments.num_processes):
    file_queue.put_nowait('STOP')

for i in range(arguments.num_processes):
    Process(target=wos_parser,
            args=(file_queue, result_queue, arguments.wos_only,
                  arguments.sample_rate, arguments.must_cite,
                  arguments.batch_size, date_after)).start()

Process(target=pjk_writer,
        args=(result_queue, arguments.outfile, arguments.benchmark_freq)).start()

file_queue.join()
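
# wos_parser itself is not shown. Given the 'STOP' sentinels above, its
# consuming loop presumably has this shape; everything beyond the queue
# protocol is assumed.
def wos_parser(file_queue, result_queue, wos_only, sample_rate, must_cite,
               batch_size, date_after):
    for filename in iter(file_queue.get, 'STOP'):
        # ... parse `filename` and batch entries onto result_queue ...
        file_queue.task_done()
    file_queue.task_done()  # acknowledge the sentinel so file_queue.join() returns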