def cli(): """Preprocess the samples for BDT optimization. """ # Load the configuration module. config = load_config() # Create the output directory. safe_makedirs('sample') # Preprocess the samples in parallel. To guard against deadlock, the number # of workers is chosen to be the smaller of the number of available cores # or the number of samples to preprocess. samples = config.SIGNAL + config.BACKGROUND max_workers = min(multiprocessing.cpu_count(), len(samples)) tasks = [] with futures.ProcessPoolExecutor(max_workers) as executor: for sample in samples: # The configuration module cannot be pickled, so pass the options directly. tasks.append( executor.submit(worker, sample, config.DIRECTORY, config.SELECTION, config.BRANCHES, config.TARGET_LUMI)) with click.progressbar(label='Preprocessing Samples', length=len(samples), show_pos=True, show_percent=False) as bar: for task in futures.as_completed(tasks): bar.update(1)
def proxy_checker(proxies):
    '''
    proxies is a list of {key: value}, where the key is the ip of the proxy
    (including port), e.g., 192.168.1.1:8080, and the value is the type of the
    proxy (http/https)
    '''
    logger.info('%d proxies to check' % (len(proxies)))
    import multiprocessing as mp
    results = []
    with futures.ProcessPoolExecutor(max_workers=mp.cpu_count() * 10) as executor:
        future_to_proxy = {
            executor.submit(check_proxy, proxy, 30): proxy
            for proxy in proxies if proxy.values()[0] == 'http'
        }
        for future in future_to_proxy:
            future.add_done_callback(lambda f: results.append(f.result()))
        logger.info('%d http proxies to check' % (len(future_to_proxy)))
        futures.wait(future_to_proxy)
        # for future in futures.as_completed(future_to_proxy):
        #     proxy = future_to_proxy[future]
        #     try:
        #         good, proxy_dict = future.result()
        #     except Exception as exc:
        #         logger.info('%r generated an exception: %s' % (proxy, exc))
        #     else:
        #         if good:
        #             good_proxies.append(proxy_dict)
    return [p for (good, p) in results if good]
def grade(model, generation, pop, x_train, y_train, x_valid, y_valid, target,
          do_parallel=False):
    results = np.zeros(len(pop))
    print 'grade:', chromosomes.keys()
    if do_parallel:
        # pickle is having error during multi process.
        with futures.ProcessPoolExecutor(max_workers=cpu_count()) as executor:
            for i, individual in enumerate(pop):
                f = executor.submit(fitness_, model, i, individual,
                                    x_train, y_train, x_valid, y_valid, target)
                index, score = f.result()
                results[index] = score
    else:
        for i, individual in enumerate(pop):
            results[i] = fitness(model, individual, x_train, y_train,
                                 x_valid, y_valid, target)
    mean_ = np.mean(results)
    print_grade(generation, results, pop)
    return mean_
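# In the parallel branch above, f.result() is called right after each submit(),
# so only one individual is evaluated at a time. If the pickling problem noted
# in the comment is resolved, the usual pattern is to submit all tasks first
# and collect results as they complete. A minimal sketch, assuming fitness_
# returns (index, score) as above:
with futures.ProcessPoolExecutor(max_workers=cpu_count()) as executor:
    # submit every individual before collecting any result so the workers
    # can actually run concurrently
    tasks = [executor.submit(fitness_, model, i, individual,
                             x_train, y_train, x_valid, y_valid, target)
             for i, individual in enumerate(pop)]
    for task in futures.as_completed(tasks):
        index, score = task.result()
        results[index] = score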
def flush(self, bucket):
    with futures.ProcessPoolExecutor(max_workers=1) as executor:
        # for each bucket it's a dict,
        # where the key needs to be the file name;
        # and the value is a list of json encoded values
        for bucket, items in self.buffer.iteritems():
            if len(items) > 0:
                f = executor.submit(flush_bucket, bucket, items)
                # send to a different process to operate, clear the buffer
                self.clear(bucket)
                # flush_bucket(bucket, items)
                # self.clear(bucket)
                self.futures.append(f)
    return True
def json2agreementmatrix(jsonflist, start=2, maxlen=0, task_type='all'):
    """
    Multi-process function to convert combinations of JSON annotation files
    into agreement values (alpha, kappa, average observed agreement).

    Args:
        jsonflist (list): list of json filenames.
        start (int): combination group size to begin with.
        maxlen (int): maximum count starting from :data:`start`

    Returns:
        A dict mapping each annotator combination to its agreement values;
        the result is also pickled, yamled and csved.

    Raises:
        Future.Exception
    """
    future_list = []
    detaildata = {}
    flen = len(jsonflist)
    assert start + maxlen - 2 <= flen
    with futures.ProcessPoolExecutor() as executor:
        for cnt in range(start, start + maxlen + 1):
            for tpl in list(itertools.combinations(jsonflist, cnt)):
                future_list.append(
                    executor.submit(getagreement, tpl,
                                    os.path.dirname(jsonflist[0]), task_type))
        for future in futures.as_completed(future_list):
            if future.exception() is not None:
                print('%r generated an exception: %s' % (future, future.exception()))
            else:
                detaildata.update(future.result())
    yaml.dump(detaildata,
              open(os.path.dirname(jsonflist[0]) + '\\' + str(start) + '-' +
                   str(start + maxlen) + 'out.yaml', 'w'))
    csvdump(detaildata,
            open(os.path.dirname(jsonflist[0]) + '\\' + str(start) + '-' +
                 str(start + maxlen) + 'out.csv', 'w'))
    print "Dumped output"
    return detaildata
def flush(self, bucket):
    logger.debug("i'm getting flushed...")
    with futures.ProcessPoolExecutor(max_workers=1) as executor:
        for k, v in self.buffer[bucket].iteritems():
            for s in v:
                o = json.loads(s)
                f = executor.submit(flush_cmd, o[self.data_type],
                                    self.data_type, self.template,
                                    self.redis_config)
                self.futures.append(f)
        # send to a different process to operate, clear the buffer
        self.clear(bucket)
    return True
def get_perspectives(url):
    '''Get different perspectives on the topic covered by the article.

    Args:
        url: A string.

    Returns:
        A JSON-encoded string representing other articles with different
        perspectives than the original article. Format: a list of
        Article.to_dict()s, each with an additional 'sentences' attribute.
        'sentences' contains a list of sentences with semantically different
        words that were extracted from the corresponding article's body.
    '''
    article = url_to_article(url)
    if article:
        headline = article.headline
        body = article.body
        org = article.news_org
        article_topic = extract_keywords.extract_keywords(headline)
        (NP_to_sentence, VP_to_sentence, NPs, VPs, NP_synsets, VP_synsets) = \
            get_article_phrases(body, org)
        n = len(NEWS_ORGS)
        with futures.ProcessPoolExecutor(max_workers=n) as executor:
            comparisons = executor.map(get_comparison, NEWS_ORGS,
                                       [article_topic] * n,
                                       [NP_to_sentence] * n,
                                       [VP_to_sentence] * n,
                                       [NPs] * n, [VPs] * n,
                                       [NP_synsets] * n, [VP_synsets] * n,
                                       [1] * n)
            compared_articles_by_org = list(comparisons)
        # flatten from list of lists of articles (separated by news org) to
        # list of articles
        compared_articles = [
            article
            for org_articles in compared_articles_by_org
            for article in org_articles
        ]
        return json.dumps(compared_articles)
    else:
        return json.dumps("Not a recognized article")
def run(socket, channels, cmds, nick, logfile):
    # buffer for commands received so far
    buff = ''
    num_workers = sum(len(v) for k, v in cmds.iteritems())
    # TODO: what happens if I use all the workers?
    # TODO: don't let commands run for more than one minute
    with futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        while len(channels):
            receive = socket.recv(4096)
            buff = buff + receive
            response = ''
            if receive:
                log_write(logfile, get_datetime()['time'], ' <> ', receive +
                          ('' if '\n' == receive[len(receive) - 1] else '\n'))
            if -1 != buff.find('\n'):
                # get a full command from the buffer
                command = buff[0:buff.find('\n')]
                buff = buff[buff.find('\n') + 1:]
                # command's components after parsing
                components = parser.parse_command(command)
                to = send_to(command)
                if 'PING' == components['action']:
                    response = []
                    response.append('PONG')
                    response.append(':' + components['arguments'])
                elif 'PRIVMSG' == components['action']:
                    if '!' == components['arguments'][0]:
                        # a command from a user only makes sense if it starts
                        # with an exclamation mark
                        pos = components['arguments'].find(' ')
                        if -1 == pos:
                            pos = len(components['arguments'])
                        # get the command issued to the bot without the "!"
                        cmd = components['arguments'][1:pos]
                        callable_cmd = get_cmd(cmd, cmds['user'], logfile)
                        if callable_cmd:
                            run_cmd(socket, executor, to, callable_cmd,
                                    components, logfile)
                        else:
                            callable_cmd = get_cmd(cmd, cmds['core'], logfile)
                            if callable_cmd:
                                try:
                                    response = callable_cmd(socket, components)
                                except Exception as e:
                                    response = err.C_EXCEPTION.format(
                                        callable_cmd.__name__)
                                    log_write(logfile, response, ' <> ',
                                              str(e) + '\n')
                    # run auto commands
                    for cmd in config.cmds['auto']:
                        callable_cmd = get_cmd(cmd, cmds['auto'], logfile)
                        if callable_cmd:
                            run_cmd(socket, executor, to, callable_cmd,
                                    components, logfile)
                elif 'KICK' == components['action'] and \
                        nick == components['action_args'][1]:
                    channels.remove(components['action_args'][0])
                elif 'QUIT' == components['action'] and \
                        -1 != components['arguments'].find('Ping timeout: '):
                    channels[:] = []
                # this call is still necessary in case a PONG response or a
                # core command response should be sent; every other response is
                # sent when the futures finish working from their respective
                # thread
                send_response(response, to, socket, logfile)
                buff = ''
V.add_plot({'type': 'raster',
            'ids': {0: neu_pub},
            # 'yticks': range(1, 1+len(neu_out)),
            # 'yticklabels': range(len(neu_out))
            },
           'Generic LPU %s' % i, 'Output')

V._update_interval = 50
V.rows = 3
V.cols = 1
V.fontsize = 18
V.out_filename = 'generic_output_%s.avi' % out_name
V.codec = 'libtheora'
V.dt = 0.0001
V.xlim = [0, 1.0]

V.run()

# Run the visualizations in parallel:
with futures.ProcessPoolExecutor() as executor:
    fs_dict = {}
    for out_name in ['un', 'co']:
        res = executor.submit(run, out_name)
        fs_dict[out_name] = res
    futures.wait(fs_dict.values())

# Report any exceptions that may have occurred:
for k in fs_dict:
    e = fs_dict[k].exception()
    if e:
        print '%s: %s' % (k, e)
def start_server(config, proxies):
    import copy

    check_config(config)
    config = copy.copy(config)

    folders_to_create = []
    buckets = [
        "tweets", "followers", "follower_ids", "friends", "friend_ids",
        "timelines"
    ]

    output_folder = os.path.abspath(config['output'])
    archive_output = os.path.abspath(
        config['archive_output']) if config['archive_output'] else output_folder
    archive_output = os.path.join(archive_output, 'archived')

    folders_to_create.append(output_folder)
    folders_to_create.append(archive_output)

    for bucket in buckets:
        folders_to_create.append(os.path.join(output_folder, bucket))
        folders_to_create.append(os.path.join(archive_output, bucket))

    for folder_to_create in folders_to_create:
        if not os.path.exists(folder_to_create):
            os.makedirs(folder_to_create)

    logger.info("output to %s" % (output_folder))
    logger.info("archived to %s" % (archive_output))

    this_node_id = node_id()
    node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
    node_queue.clear()

    scheduler = Scheduler(this_node_id, config=config, proxies=proxies)

    logger.info('starting node_id: %s' % this_node_id)

    node_coordinator = NodeCoordinator(config['redis_config'])
    # node_coordinator.clear()

    # The main event loop. Strictly speaking we don't need one, since we could
    # just join on the crawlers and not stop until a terminate command is
    # issued to each crawler, but we need one to report the status of each
    # crawler and to perform the tarball tasks...
    last_archive_ts = time.time() + 3600  # the first archive event starts 1 hour later...
    pre_time = time.time()
    last_load_balancing_task_ts = time.time()
    while True:
        if time.time() - pre_time > 120:
            logger.info(pprint.pformat(scheduler.crawler_status()))
            pre_time = time.time()
            if scheduler.is_alive():
                cmd = {'cmd': 'CRAWLER_FLUSH'}
                scheduler.enqueue(cmd)

        if time.time() - last_archive_ts > 3600:
            logger.info("start archive procedure...")
            with futures.ProcessPoolExecutor(max_workers=len(buckets)) as executor:
                future_proxies = {
                    executor.submit(tarball_results, output_folder, bucket,
                                    archive_output, int(time.time()) - 3600): bucket
                    for bucket in buckets
                }
                for future in future_proxies:
                    future.add_done_callback(lambda f: logger.info(
                        "archive created? %s: [%s]" % f.result()))
            last_archive_ts = time.time()

        if not scheduler.is_alive():
            logger.info("no crawler is alive... waiting to recreate all crawlers...")
            time.sleep(120)  # sleep for two minutes and retry
            continue

        if time.time() - last_load_balancing_task_ts > 1800:
            # try to balance the local queues every 30 mins
            last_load_balancing_task_ts = time.time()
            cmd = {'cmd': 'BALANCING_LOAD'}
            scheduler.enqueue(cmd)

        # block the main process, waiting for a command
        cmd = node_queue.get(block=True, timeout=360)
        if cmd:
            scheduler.enqueue(cmd)
def main():
    with futures.ProcessPoolExecutor(max_workers=3) as executor:
        list(executor.map(worker, range(10)))
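# ProcessPoolExecutor re-imports the main module in its worker processes (under
# the spawn start method), so a snippet like the one above is normally paired
# with a module-level worker function and an import guard. A minimal
# self-contained sketch; worker here is a placeholder, since the original
# worker is not shown:
from concurrent import futures

def worker(x):
    # placeholder standing in for the real per-item work
    return x * x

def main():
    with futures.ProcessPoolExecutor(max_workers=3) as executor:
        list(executor.map(worker, range(10)))

if __name__ == '__main__':
    main()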
async def startup(ctx):
    ctx['pool'] = futures.ProcessPoolExecutor()
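# The hook above only creates the pool. Assuming an arq-style worker where the
# same ctx dict is passed to the startup and shutdown hooks, a minimal sketch
# of the matching teardown and of handing CPU-bound work to the pool from a
# job (cpu_bound_task is a hypothetical placeholder):
import asyncio

async def shutdown(ctx):
    # release the worker processes created in startup
    ctx['pool'].shutdown()

async def some_job(ctx, data):
    loop = asyncio.get_running_loop()
    # run the blocking function in the process pool without blocking the
    # event loop
    return await loop.run_in_executor(ctx['pool'], cpu_bound_task, data)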