def _itergroundings(self, simplify=False, unsatfailure=False):
    """Ground every formula and record the resulting statistics.

    ``self`` is published in the module-level ``global_bpll_grounding`` name
    before any grounding starts.  Each element of ``self.formulas`` is then
    passed to ``create_formula_groundings`` -- through a process pool when
    ``self.multicore`` is set, in-process otherwise.  Every
    ``(varidx, validx, val)`` entry of the results is recorded in
    ``self._varidx2fidx`` and handed to ``self._addstat``.

    :param simplify: accepted for interface compatibility; not used here.
    :param unsatfailure: accepted for interface compatibility; not used here.
    :return: a generator that yields ``None`` once per processed formula.
    """
    global global_bpll_grounding
    global_bpll_grounding = self
    if self.multicore:
        pool = Pool(maxtasksperchild=1)
        try:
            for gndresult in pool.imap(with_tracing(create_formula_groundings), self.formulas):
                for fidx, stat in gndresult:
                    for (varidx, validx, val) in stat:
                        self._varidx2fidx[varidx].add(fidx)
                        self._addstat(fidx, varidx, validx, val)
                    checkmem()
                yield None
            pool.close()
        finally:
            # Runs on success, on any exception (including the Ctrl-C
            # exception raised by the workers) and when the consumer drops
            # the generator early, so worker processes are never left behind.
            # After a clean close() there is no outstanding work, which makes
            # terminate() harmless on the success path.
            pool.terminate()
            pool.join()
    else:
        for gndresult in imap(create_formula_groundings, self.formulas):
            for fidx, stat in gndresult:
                for (varidx, validx, val) in stat:
                    self._varidx2fidx[varidx].add(fidx)
                    self._addstat(fidx, varidx, validx, val)
            yield None
def work(host, port, processes, threads, times):
    """Run the workload on a process pool and return the elapsed seconds.

    The pool workers are initialised to ignore SIGINT, so an interrupt is
    handled by this process only.  ``divide(times, processes)`` supplies the
    chunks; each one is submitted as ``thread(host, port, threads, chunk)``.
    A daemon process running ``progress`` is started once all chunks have
    been submitted and is stopped when the pool has finished.
    """
    ignore_sigint = lambda: signal.signal(signal.SIGINT, signal.SIG_IGN)
    workers = Pool(processes, ignore_sigint)
    reporter = Process(target=progress)
    reporter.daemon = True
    began = time.time()
    try:
        for share in divide(times, processes):
            workers.apply_async(thread, (host, port, threads, share))
        reporter.start()
        workers.close()
        workers.join()
        reporter.terminate()
        reporter.join()
    except KeyboardInterrupt:
        # Interrupted: stop everything right away instead of draining the pool.
        workers.terminate()
        reporter.terminate()
        reporter.join()
        workers.join()
    return time.time() - began
def main(datadir, convert_dir, crop_size):
    """Convert every image found in *datadir* into *convert_dir*.

    The work is handed to a process pool in batches of 500 files.  Each task
    is the tuple ``(convert_size, (datadir, convert_dir, filename,
    crop_size))`` mapped through ``convert``.

    Raises:
        OSError: if *convert_dir* cannot be created and does not already
            exist as a directory.
    """
    try:
        os.mkdir(convert_dir)
    except OSError:
        # An existing output directory is fine; any other failure (missing
        # parent, permissions, a plain file in the way) must not be hidden.
        if not os.path.isdir(convert_dir):
            raise

    filenames = data_util.get_image_files(datadir)
    print('Resizing images in {} to {}'.format(datadir, convert_dir))

    batch_size = 500
    # Ceiling division: no trailing empty batch when the number of files is
    # an exact multiple of the batch size (or zero).
    batches = (len(filenames) + batch_size - 1) // batch_size
    args = [(convert_size, (datadir, convert_dir, f, crop_size))
            for f in filenames]

    p = Pool()
    try:
        for i in range(batches):
            print('batch {:>2} / {}'.format(i + 1, batches))
            p.map(convert, args[i * batch_size:(i + 1) * batch_size])
        p.close()
    finally:
        # map() is synchronous, so nothing is outstanding after a clean run;
        # on an error this stops the workers instead of leaking them.
        p.terminate()
        p.join()
    print('Done')
def _itergroundings(self, simplify=True, unsatfailure=True):
    """Yield the ground formulas of all formulas, computed batch by batch.

    ``self`` is published in the module-level ``global_fastConjGrounding``
    name, the formulas are split with ``rndbatches(self.formulas, 20)`` and
    every batch is passed to ``create_formula_groundings`` -- on a process
    pool when ``self.multicore`` is set, in-process otherwise.  With
    ``self.verbose`` a progress bar is advanced once per finished batch.

    :param simplify: accepted for interface compatibility; not used here.
    :param unsatfailure: accepted for interface compatibility; not used here.
    """
    # generate all groundings
    if not self.formulas:
        return
    global global_fastConjGrounding
    global_fastConjGrounding = self
    batches = list(rndbatches(self.formulas, 20))
    batchsizes = [len(batch) for batch in batches]
    if self.verbose:
        bar = ProgressBar(width=100, steps=sum(batchsizes), color='green')

    def report(finished):
        # Advance the bar past batch number `finished` (0-based) and show
        # the running total of processed formulas.
        bar.inc(batchsizes[finished])
        bar.label(str(cumsum(batchsizes, finished + 1)))

    if not self.multicore:
        for finished, gfs in enumerate(imap(create_formula_groundings, batches)):
            if self.verbose:
                report(finished)
            for gf in gfs:
                yield gf
        return

    pool = Pool()
    try:
        results = pool.imap(with_tracing(create_formula_groundings), batches)
        for finished, gfs in enumerate(results):
            if self.verbose:
                report(finished)
            for gf in gfs:
                yield gf
    except Exception as e:
        logger.error('Error in child process. Terminating pool...')
        pool.close()
        raise e
    finally:
        pool.terminate()
        pool.join()
def start(self):
    """Starts a server that controls local workers.

    Calling this function starts a pool of `num_workers` workers used to run
    targets sent to the server. The server will run indefinitely unless shut
    down by the user.

    On a keyboard interrupt the worker pool is closed and joined and
    ``self.manager`` is shut down.

    Raises:
        ServerError: if the port is already in use.
        OSError: any other operating-system error raised while setting up or
            serving.
    """
    import errno

    # Bound up front so the interrupt handler can tell whether the pool was
    # ever created.
    workers = None
    try:
        serv = Listener((self.hostname, self.port))
        workers = Pool(
            processes=self.num_workers,
            initializer=Worker,
            initargs=(self.status, self.queue, self.waiting),
        )
        logging.info(
            "Started %s workers, listening on port %s",
            self.num_workers,
            serv.address[1],
        )
        self.wait_for_clients(serv)
    except OSError as e:
        # The numeric value of "address in use" differs between platforms,
        # so compare against the symbolic constant.
        if e.errno == errno.EADDRINUSE:
            raise ServerError(
                (
                    "Could not start workers listening on port {}. "
                    "The port may already be in use."
                ).format(self.port)
            )
        # Anything else is unexpected; do not hide it from the caller.
        raise
    except KeyboardInterrupt:
        logging.info("Shutting down...")
        if workers is not None:
            workers.close()
            workers.join()
        self.manager.shutdown()
class Pool(object):
    '''
    Thin wrapper that prefers an ``MPIPool`` and falls back to a ``MultiPool``.

    ``self.MPI`` records which backend was selected.
    '''

    def __init__(self, **pool_kwargs):
        '''
        Build the backend pool from the keyword arguments it accepts.

        ``KwargsCheck`` is applied to ``pool_kwargs`` for the chosen backend
        before the backend is constructed.  When the MPI backend is active
        and this process is not the master, it waits on the pool and then
        exits the interpreter.
        '''
        try:
            kw = KwargsCheck(MPIPool, pool_kwargs)
            self._pool = MPIPool(**kw)
            self.MPI = True
        except (ImportError, ValueError):
            kw = KwargsCheck(MultiPool, pool_kwargs)
            self._pool = MultiPool(**kw)
            self.MPI = False

        if self.MPI and not self._pool.is_master():
            self._pool.wait()
            sys.exit(0)

    def map(self, f, x, args = (), kwargs = {}):
        '''
        Map ``f`` over ``x`` on the backend pool.

        When extra positional or keyword arguments are given, ``f`` is first
        bound to them with ``wrap``.
        '''
        has_extras = len(args) or len(kwargs)
        target = wrap(f, *args, **kwargs) if has_extras else f
        return self._pool.map(target, x)

    def close(self):
        '''
        Close the backend pool.
        '''
        self._pool.close()
def extract_all_labels(filenames, out_filepath=DATA_FOLDER+'labels.p', chunk_size=2000): print "EXTRACTING ALL LABELS INTO {0}".format(out_filepath) all_labels = [] label_dict = {} filenames_chunks = util.chunks(filenames, chunk_size) for i, chunk in enumerate(filenames_chunks): pool = Pool(processes=util.CPU_COUNT) chunk_labels = pool.map(extract_labels, chunk) pool.close() for filepath, labels in zip(chunk, chunk_labels): if labels is not None: file_id = util.filename_without_extension(filepath) label_dict[file_id] = labels all_labels += labels print i+1, '/', len(filenames_chunks) #Write labels to file with open(out_filepath,'w') as f: pickle.dump(label_dict, f) print '\nLabels:' print len(set(all_labels)) print Counter(all_labels)
def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21), enddate=None, poolsize=20, lang=''):
    """Run *query* over a date range, split across a pool of workers.

    The range ``begindate``..``enddate`` is divided into ``poolsize`` date
    intervals; each interval becomes one ``since:``/``until:`` query that is
    executed by ``query_tweets_once`` in a worker process.

    :param query: the search query.
    :param limit: overall tweet limit; each worker gets
        ``limit // poolsize + 1``.  A falsy value means no limit.
    :param begindate: first day of the range.
    :param enddate: last day of the range; ``None`` (the default) means the
        day on which the function is called.
    :param poolsize: maximum number of workers / date intervals.
    :param lang: language passed through to ``query_tweets_once``.
    :return: list of all tweets gathered.  After a keyboard interrupt, the
        tweets gathered up to that point.
    :raises ValueError: if *begindate* is later than *enddate*.
    """
    # A default of dt.date.today() in the signature would be evaluated once,
    # at import time, and go stale in a long-running process.
    if enddate is None:
        enddate = dt.date.today()

    no_days = (enddate - begindate).days
    if no_days < 0:
        raise ValueError('begindate must not be later than enddate.')

    # Since we are assigning each pool a range of dates to query,
    # the number of pools should not exceed the number of dates.  At least
    # one worker is kept so that a single-day range still works.
    poolsize = max(1, min(poolsize, no_days))

    dateranges = [begindate + dt.timedelta(days=elem)
                  for elem in linspace(0, no_days, poolsize+1)]

    if limit:
        limit_per_pool = (limit // poolsize)+1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    # Created outside the try block so the cleanup below never runs against
    # an unbound name.
    pool = Pool(poolsize)
    try:
        logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(partial(query_tweets_once, limit=limit_per_pool, lang=lang), queries):
                all_tweets.extend(new_tweets)
                logger.info('Got {} tweets ({} new).'.format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                        'gathered so far.')
            # Stop outstanding queries; otherwise join() below would wait
            # for all of them to finish.
            pool.terminate()
    finally:
        pool.close()
        pool.join()

    return all_tweets
def add_tree(self, iterations=-1, snapshot=False):
    """
    Grow new trees on a process pool and append them to ``self.trees``.

    Every tree is built by ``make_tree`` from its own ``self.data_copy()``
    together with ``self.depthlimit`` and ``self.weak_learner``.

    :param iterations: number of trees to make, -1 means use default setting
    :param snapshot: when true, ``self.sum_squares`` is called with the new
                     forest size after the trees were added
    :return: None
    """
    print("Adding trees:", iterations)
    tree_count = self.default_tree_count if iterations == -1 else iterations

    # One worker process per core of the machine (Pool default).
    workers = Pool()
    jobs = []
    for _ in range(tree_count):
        jobs.append((self.data_copy(), self.depthlimit, self.weak_learner))
    grown = workers.map(make_tree, jobs)
    workers.close()
    workers.join()

    # get the trees created and store them
    self.trees.extend(grown)

    # Single-process alternative, kept for reference:
    #   for i in range(iterations):
    #       tree = Tree(self.data, self.bagging, self.bag_ratio, self.depthlimit, self.weak_learner)
    #       self.trees.append(tree)

    if not snapshot:
        return
    # get error after each snapshot, if this command is run multiple times
    self.sum_squares(len(self.trees))
def get_correlation_parallel(s1,s2): """ params s1 - series 1 params s2 - series 2 NOTE : series are number 1 to 25 when giving in arguments returns the correlation between series """ start = time.time() offsets = [] #this will be the arguments to all the parallel jobs instances = (MAX_ROWS/BATCH_SIZE) mean,std = calculate_mean_std_parallel() stripped_mean,stripped_std = calculate_stripped_mean_std_parallel(mean,std) processes = Pool(processes=instances) for i in range(instances): offsets.append((s1,s2,mean,std,stripped_mean,stripped_std,i*BATCH_SIZE)) results = processes.map(get_correlation,offsets) processes.close() processes.join() pearson_corr = 0 total = 0 for result in results: pearson_corr += result[0]*result[1] total += result[1] pearson_corr = 1.0*pearson_corr / total t_value = abs(pearson_corr*math.sqrt( 1.0*(total - 2) / ( 1 - (pearson_corr*pearson_corr)))) p_value = t.sf(t_value,total-2) print "\n ######### CORRELATION BETWEEN SERIES ",s1," AND SERIES ",s2, " is ",pearson_corr , "t value is ", t_value ," and p value is ", p_value, "######### \n" end = time.time() print "EXECUTION TIME : ", end-start , " sec" return pearson_corr
def parse(document, pages, parse_refs=True, progress_monitor=NullProgressMonitor(), pool_size=DEFAULT_POOL_SIZE):
    """Parse the pages of *document* on a process pool and update its word count.

    Pages without a ``local_url`` are ignored.  The remaining pages are split
    with ``chunk_it(pages, pool_size)`` and each chunk is handed to
    ``sub_process_parse`` together with the document's parser, primary key
    and ``parse_refs``.  Afterwards ``document.word_count`` is set to the sum
    of the word counts of ``document.pages`` and the document is saved.

    :param document: the document whose pages are parsed.
    :param pages: mapping whose values are the page objects.
    :param parse_refs: forwarded to the worker function.
    :param progress_monitor: receives start/info/work/done notifications.
    :param pool_size: number of worker processes and of page chunks.
    """
    progress_monitor.start('Parsing Pages', pool_size + 1)

    # Prepare input
    pages = [(page.local_url, page.url) for page in pages.values()
             if page.local_url is not None]
    inputs = [(document.parser, document.pk, parse_refs, pages_chunk)
              for pages_chunk in chunk_it(pages, pool_size)]

    # Close connection to allow the new processes to create their own.
    connection.close()

    # Split work
    progress_monitor.info('Sending {0} chunks to worker pool'
                          .format(len(inputs)))
    pool = Pool(pool_size)
    try:
        for _ in pool.imap_unordered(sub_process_parse, inputs, 1):
            progress_monitor.work('Parsed 1/{0} of the pages'.
                                  format(pool_size), 1)
        pool.close()
    finally:
        # Stop and reap the workers whether parsing succeeded or failed;
        # close() alone never waits for the processes to exit.
        pool.terminate()
        pool.join()

    # Word Count
    document.word_count = sum(page.word_count
                              for page in document.pages.all())
    document.save()
    progress_monitor.work('Counted Total Words', 1)

    progress_monitor.done()
def stat_volume(stime,etime):
    """Compute the volume of every entry of the TGS info table and write a report.

    One ``stat_tgs_volume(stime, etime, int(cid))`` task is submitted per key
    of ``read_tgs_info()``.  The results are sorted by volume, largest first,
    and written to ``<root_dir>/result/volume.txt`` as
    ``"<cid>,<kkmc>: <volume>"`` lines; the total is printed at the end.
    """
    tgsinfo = read_tgs_info()
    from multiprocessing.pool import Pool

    # Snapshot the ids once: every result is paired with the id it was
    # submitted for, without rebuilding the key list for each lookup.
    cids = list(tgsinfo.keys())

    pool = Pool()
    pending = [pool.apply_async(stat_tgs_volume, args=(stime, etime, int(cid)))
               for cid in cids]
    pool.close()
    print('waiting to join....')
    pool.join()

    print('start to writing to file...')
    volume0 = [(cid, handle.get()) for cid, handle in zip(cids, pending)]
    volume0.sort(key=lambda x: x[1], reverse=True)

    total = 0
    with open(os.path.join(root_dir, "result", "volume.txt"), "w") as f:
        for cid, count in volume0:
            total += count
            f.write("%5s,%s: %d\n" % (cid, tgsinfo[cid]['kkmc'], count))
    print('totally %d records.' % (total))
class _MultiExecutor(_Executor):
    """Execute functions async in a process pool"""

    def __init__(self):
        super(_MultiExecutor, self).__init__()
        # Number of launched tasks whose result has not been collected yet.
        self._children = 0
        # Handles of every launched task, used to wait for completion.
        self._pending = []
        self.pool = Pool()

    def _collector(self, result):
        """Collect one successful result and mark its task as finished."""
        super(_MultiExecutor, self)._collector(result)
        self._children -= 1

    def execute(self, func, args):
        """Schedule ``func(*args)`` on the pool."""
        self._children += 1
        handle = self.pool.apply_async(func, args, callback=self._collector)
        self._pending.append(handle)

    def wait_for_results(self):
        """Block until every scheduled task has finished.

        Raises whatever exception a task raised, once the pool has been
        shut down.
        """
        self.pool.close()
        # One would have hoped joining the pool would take care of this, but
        # apparently you need to first make sure that all your launched tasks
        # has returned their results properly, before calling join, or you
        # risk a deadlock.  Waiting on the task handles (rather than polling
        # the counter) also terminates when a task failed: the collector
        # callback only fires for successful tasks, so the counter alone
        # would never reach zero in that case.
        for handle in self._pending:
            handle.wait()
        self.pool.join()
        # Surface task failures instead of dropping them silently.
        for handle in self._pending:
            handle.get()
def ingest(
        dataset,
        cls,
        skip_if_exists=True,
        multi_process=False,
        multi_threaded=False,
        cores=None):
    """Run ``ingest_one`` for every item of *dataset*.

    Each call receives the tuple ``(item, cls, skip_if_exists)``.  The work
    runs on a process pool when *multi_process* is set, on a thread pool when
    *multi_threaded* is set, and serially otherwise.  *cores* is the pool
    size; it defaults to ``cpu_count()``.

    The function returns only after every item has been processed.  An
    exception raised while ingesting an item propagates to the caller.
    """
    pool = None
    if multi_process:
        pool = Pool(cores or cpu_count())
    elif multi_threaded:
        pool = ThreadPool(cores or cpu_count())

    map_func = map if pool is None else pool.imap_unordered
    work = zip(dataset, repeat(cls), repeat(skip_if_exists))

    try:
        # The result iterator has to be drained: the builtin map is lazy on
        # Python 3 (nothing would run otherwise), and for the pools this is
        # where a failure inside a worker becomes visible.
        for _ in map_func(ingest_one, work):
            pass
        if pool is not None:
            pool.close()
    finally:
        if pool is not None:
            # if we're ingesting using multiple processes or threads, the
            # processing should be parallel, but this method should be
            # synchronous from the caller's perspective
            pool.terminate()
            pool.join()
def main():
    """Run the aggregate-interval experiments for densities 0.1 to 0.5.

    The module-level ``pool`` is created here; ``reset_pool()`` is called
    between two consecutive experiment runs and the pool is closed and
    joined at the end.
    """
    global pool
    pool = Pool(POOL_SIZE)
    nseeds = 100

    # Disabled experiment stages, kept for reference:
    #   print("== generating seeds...")
    #   generate_seeds(nseeds)
    #   print("running const density experiments...")
    #   run_constant_density(0.1, range(100, 1000, 100), nseeds)
    #   print("running const size experiments...")
    #   run_constant_size(50, range(100, 1000, 100), nseeds)
    #   run_aggregate_interval_constant_density(0.1, range(100, 1000, 100), nseeds, [100, 500] + list(range(1000, 4000, 1000)))

    print("== running aggregate interval experiments (const density)...")
    run_aggregate_interval_constant_density(
        0.1, range(100, 1000, 100), nseeds,
        [3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])

    # The remaining densities all use the same interval list; the pool is
    # reset before each of them.
    for density in (0.2, 0.3, 0.4, 0.5):
        reset_pool()
        run_aggregate_interval_constant_density(
            density, range(100, 1000, 100), nseeds,
            [100, 500, 1000, 2000, 4000, 5000, 6000, 7000, 8000, 9000,
             10000, 50000])

    pool.close()
    pool.join()
def main():
    """Run four ``long_time_task`` jobs on a process pool and wait for them."""
    print('Process (%s) start...' % os.getpid())
    workers = Pool()
    task_number = 0
    while task_number < 4:
        workers.apply_async(long_time_task, args=(task_number,))
        task_number += 1
    print('Waiting for all subprocesses done...')
    # No further tasks are accepted after close(); join() then blocks until
    # the submitted ones have finished.
    workers.close()
    workers.join()
    print('All subprocesses done.')
def load_images_uint(files):
    """Read all *files* with ``imread`` on a process pool.

    The results are stacked into one array whose axes are reordered with
    ``transpose(0, 3, 1, 2)``.
    # NOTE(review): presumably (N, H, W, C) -> (N, C, H, W); confirm against imread's output.
    """
    workers = Pool()
    decoded = workers.map(imread, files)
    workers.close()
    workers.join()
    stacked = np.array(decoded)
    return stacked.transpose(0, 3, 1, 2)
def get_data():
    """Run ``deatilsExtract`` on every line of ``app_links1.txt`` in parallel.

    Each line of the file (including its line ending) is one work item.  The
    results of the workers are not used.
    """
    nprocs = 500  # nprocs is the number of processes to run
    with open('app_links1.txt', 'r') as f2:
        ParsePool = Pool(nprocs)
        try:
            # map() reads the whole file and blocks until all lines are done,
            # so the file may be closed right afterwards.
            ParsePool.map(deatilsExtract, f2)
            ParsePool.close()
        finally:
            # Do not leave up to `nprocs` worker processes behind on error.
            ParsePool.terminate()
            ParsePool.join()
def get_word():
    """Run ``check_domain`` on every line of ``dic/newwords`` with two workers.

    Errors raised while checking are printed, not propagated.
    """
    with open('dic/newwords') as source:
        domains = source.readlines()
    try:
        pool = Pool(processes=2)
        try:
            pool.map(check_domain, domains)
            pool.close()
        finally:
            # Also reached when map() fails, so the workers are always
            # stopped and reaped.
            pool.terminate()
            pool.join()
    except Exception as e:
        # Best effort by design: report the problem and carry on.
        print(e)
def main():
    """Run twenty ``some_process_body`` tasks on a ten-process pool.

    Output redirection is set up first and is always stopped again, even if
    a task fails.
    """
    queue_logger = setup_redirection()
    queue_logger.write("ABCDEF\n")
    try:
        workers = Pool(10)
        pending = []
        for _ in xrange(20):
            pending.append(workers.apply_async(some_process_body))
        # get() blocks until the task is done and re-raises its exception.
        for handle in pending:
            handle.get()
        workers.close()
    finally:
        queue_logger.stop()
def calculate_stripped_mean_std_parallel(mean,std): """ params - mean params - std returns stripped up mean and std """ stripped_mean = [] stripped_squares = [] stripped_std = [] dirty_data = [] outliers = [] for i in range(0,NO_OF_SERIES): stripped_std.append(0) stripped_squares.append(0) stripped_mean.append(0) dirty_data.append(0) outliers.append(0) start = time.time() offsets = [] #this will be the arguments to all the parallel jobs instances = (MAX_ROWS/BATCH_SIZE) processes = Pool(processes=instances) for i in range(instances): offsets.append((mean,std,i*BATCH_SIZE)) results = processes.map(calculate_stripped_mean_std,offsets) processes.close() processes.join() total = 0 for result in results: for i in range(len(result[0])): count = result[2] - result[3][i] #actual - dirty data stripped_mean[i] += result[0][i]*count stripped_squares[i] += result[1][i]*count dirty_data[i] += result[3][i] outliers[i] += result[4][i] total += result[2] for i in range(len(mean)): stripped_mean[i] = 1.0*(stripped_mean[i])/(total - dirty_data[i]) stripped_squares[i] = 1.0*(stripped_squares[i]) / (total - dirty_data[i]) stripped_std[i] = math.sqrt(stripped_squares[i] - (stripped_mean[i]*stripped_mean[i])) end = time.time() print "######### STRIPPED MEAN ######### \n" print stripped_mean print "\n ######### STRIPPED STANDARD DEVIATION ######### \n" print stripped_std print "\n######### NAN ROWS COUNT #########\n" print dirty_data print "\n######### OUTLIERS ROWS COUNT #########\n" print outliers print "\n######### EXECUTION TIME #########\n" print (end-start) return stripped_mean,stripped_std
class TcpController(object):
    """Dispatch message bodies to per-header handlers on a process pool."""

    def __init__(self,handlers):
        """Store the header -> handler-factory mapping and create the pool."""
        self.handlers=handlers
        self.workers=Pool(MAX_PROCESS_POOL_SIZE)

    def process(self,header,body):
        """Instantiate the handler registered for *header* and run it
        asynchronously (through ``wrap``) on *body*."""
        handler_factory = self.handlers[header]
        task_args = (handler_factory(), body,)
        self.workers.apply_async(wrap, task_args)

    def destroy(self):
        """Drop the handlers and stop accepting new work on the pool."""
        self.handlers=None
        self.workers.close()
def run(self):
    """Run ``self.init_driver`` for every test case, one worker per case.

    The asynchronous map handle is appended to the module-level ``result``
    list.  Once the pool has finished, every item waiting in ``q`` is passed
    to ``comm.Template.set_middle``.
    """
    cases = self.get_test_case()
    # Define a process pool
    worker_pool = Pool(processes=len(cases))
    pending = worker_pool.map_async(self.init_driver, cases.values())
    result.append(pending)
    worker_pool.close()
    worker_pool.join()
    while True:
        if q.empty():
            break
        comm.Template.set_middle(q.get())
def _get(self, args):
    """Return either one player with optimisation results, or the ranked players of a draft.

    ``args[0]`` is the draft id; an optional ``args[1]`` is a player id.

    With a player id, the returned dict is ``player.to_dict(['core'])``
    extended by ``'max_starters_points'`` and ``'max_bench_points'``, two
    dicts that are filled by ``wrap_optimizer`` tasks run on a process pool.

    Without a player id, the result is ``{'players': [...]}`` holding every
    player of the draft whose core has both a rank and a target price.
    """
    draft_id = args[0]
    id = args[1] if len(args) > 1 else None
    q = self.db.query(Player)
    if id is not None:
        player = q.filter(Player.id == int(id)).first()
        # The team of this draft that is flagged as owner.
        team = self.db.query(Team).filter(and_(Team.is_owner == True, Team.draft_id == draft_id)).first()
        # Unassigned players of this draft (other than the requested one) that
        # have a rank, a target price and positive points, ordered by rank.
        available_players = self.db.query(Player).join(Player.core).filter(and_(PlayerCore.rank != None, PlayerCore.target_price != None, PlayerCore.points > 0, Player.draft_id == draft_id, Player.team_id == None, Player.id != player.id)).order_by(PlayerCore.rank).all()
        min_price = 1
        # Price range to evaluate: capped by the team's money and by the
        # player's target price plus 21.
        max_price = min(player.core.target_price + 21, team.money)
        # Manager-backed dicts are handed to the worker tasks as arguments.
        # NOTE(review): presumably wrap_optimizer stores its result in the
        # given dict under the price `m` -- confirm in wrap_optimizer.
        manager = Manager()
        max_starters_points = manager.dict()
        max_bench_points = manager.dict()
        pool = Pool(processes=8)
        starters, bench = get_starters_and_bench(self.db, team.id)
        # Entry 0 is computed in this process from the current starters; the
        # money passed in is reduced by the number of open bench slots.
        max_starters_points[0] = optimizer.optimize_roster(starters, available_players, team.money - (constants.BENCH_SIZE - len(bench)))[1]
        # One task per price 1..9, writing into max_bench_points.
        for m in range(min_price, 10):
            pool.apply_async(wrap_optimizer, args=(starters, available_players, team.money - m - (constants.BENCH_SIZE - len(bench)) + 1, max_bench_points, m))
        # The starters are "full" only if no slot is None.
        full_starters = True
        for s in starters:
            if s is None:
                full_starters = False
        if not full_starters:
            # Work on copies so the original line-up stays untouched.
            starters_clone = list(starters)
            bench_clone = list(bench)
            place_player(player, starters_clone, bench_clone)
            # One task per candidate price, writing into max_starters_points.
            for m in range(min_price, max_price):
                pool.apply_async(wrap_optimizer, args=(starters_clone, available_players, team.money - m - (constants.BENCH_SIZE - len(bench_clone)), max_starters_points, m))
        # Wait for all submitted tasks before reading the shared dicts.
        pool.close()
        pool.join()
        ret = player.to_dict(['core'])
        # Copy the manager proxies into plain dicts for the response.
        ret['max_starters_points'] = dict(max_starters_points)
        ret['max_bench_points'] = dict(max_bench_points)
        return ret
    else:
        players = q.join(PlayerCore).filter(and_(Player.draft_id == int(draft_id), PlayerCore.rank != None, PlayerCore.target_price != None)).all()
        return {'players': [p.to_dict(['core']) for p in players]}
def parallel_augment(images, normalize=None, test=False):
    """Augment every image on a process pool and return a float32 array.

    :param images: array of images; iterated along its first axis.
    :param normalize: optional ``(mean, std)`` pair.  When given, the images
        are shifted by ``mean`` and scaled by ``std`` before augmentation.
    :param test: forwarded to ``augment`` as keyword argument.
    """
    if normalize is not None:
        mean, std = normalize
        # assuming channel-wise normalization
        centered = images - mean[:, np.newaxis, np.newaxis]
        images = centered / std[:, np.newaxis, np.newaxis]

    workers = Pool()
    augment_one = partial(augment, test=test)
    outputs = workers.map(augment_one, images)
    workers.close()
    workers.join()

    return np.array(outputs, dtype=np.float32)
def calculate_mean_std_parallel(): """ call this function to compute the mean, standard deviation and NaNs for each seies the file name, no of jobs can be changed in the settings file """ start = time.time() offsets = [] instances = (MAX_ROWS/BATCH_SIZE) processes = Pool(processes=instances) for i in range(instances): offsets.append(i*BATCH_SIZE) print offsets result = processes.map(calculate_mean_std,offsets) processes.close() processes.join() mean = [] std = [] squares = [] dirty_data = [] #initializing for i in range(0,NO_OF_SERIES): mean.append(0) std.append(0) squares.append(0) dirty_data.append(0) total = 0 ### here we combine the results from different processes / threads for r in result: for i in range(len(r[0])): ### update for each time series count = (r[2] - r[3][i]) ### actual count - the count with missing value mean[i] += r[0][i]*count squares[i] += r[1][i]*count dirty_data[i] += r[3][i] total += r[2] for i in range(len(mean)): mean[i] = 1.0*(mean[i])/(total - dirty_data[i]) squares[i] = 1.0*(squares[i]) / (total - dirty_data[i]) std[i] = math.sqrt(squares[i] - (mean[i]*mean[i])) end = time.time() print "######### MEAN ######### \n" print mean print "\n ######### STANDARD DEVIATION ######### \n" print std print "\n######### NAN ROWS COUNT #########\n" print dirty_data print "\n######### EXECUTION TIME #########\n" print (end-start) return mean,std
def extract_all_plaintext(filenames, out_folder=PLAINTEXT_FOLDER):
    """Extract the plain text of all files into *out_folder* in parallel.

    Every file is paired with *out_folder* and passed to
    ``__extract_plaintext_as_tuple`` on a process pool.  The fraction of
    finished files is written to stderr as a percentage.
    """
    print("EXTRACTING PLAINTEXT FROM {0} FILES INTO {1}".format(len(filenames),out_folder))
    #Zip the filename input with the output folder
    tuple_input = zip(filenames, [out_folder]*len(filenames))
    num_tasks = len(filenames)

    pool = Pool(processes=util.CPU_COUNT)
    try:
        for i, _ in enumerate(pool.imap_unordered(__extract_plaintext_as_tuple, tuple_input), 1):
            # float(): with integer division the ratio would stay at 0 until
            # the very last file.
            sys.stderr.write('\rdone {0:%}'.format(float(i)/num_tasks))
        pool.close()
    finally:
        # Stop and reap the workers on success and on failure alike.
        pool.terminate()
        pool.join()
    print("\nDONE")
def main():
    """
    Build all the models.  Spin off a new process for each participant because the ANN library is not
    multithreaded.  Process is used instead of thread to leverage multiple cores.

    Command line: ``inputFilename`` (pickled data set) and ``outputDirectory``
    (where the per-participant result files are written).  Builds that
    raised an exception are reported once all builds have finished.
    """
    parser = ArgumentParser()
    parser.add_argument("inputFilename")
    parser.add_argument("outputDirectory")
    args = parser.parse_args()

    inputFilename = args.inputFilename
    outputDirectory = args.outputDirectory

    # NOTE: unpickling can execute arbitrary code; only use trusted input files.
    with open(inputFilename, 'rb') as inputFile:
        data = pickle.load( inputFile )

    tasks = [ 'matb', 'rantask' ]
    participantIds = [ '001', '002', '003', '004', '005', '006', '007' ]

    # Cut off first row header for each data set
    for task in tasks:
        for participantId in participantIds:
            data[participantId][task] = data[participantId][task][1:]

    splits = performSplit( data )

    # Record start time so that the elapsed time can be determined
    start_time = time.time()

    # Create a multicore processing pool with 7 processes ( 7 so that one core stays free
    # for system processes )
    pool = Pool( processes = 7 )

    # Build models for participants in a task
    pending = []
    for task in tasks:
        for participantId in participantIds:
            outputFilename = path.join( outputDirectory, 'testingOn-' + participantId + '-' + task + '.txt' )

            # Spin off a process for the building
            handle = pool.apply_async( tuneANN, ( splits[participantId][task], outputFilename ) )
            pending.append( ( outputFilename, handle ) )

    # Close down the pool so that we can wait on all the processes
    pool.close()
    pool.join()

    # apply_async keeps a worker's exception inside its result handle; ask
    # every handle so that a failed build does not go unnoticed.
    for outputFilename, handle in pending:
        try:
            handle.get()
        except Exception as error:
            print( "Model build failed for " + outputFilename + ": " + str(error) )

    # Calculate and print the elapsed time
    elapsed_time = time.time() - start_time
    print( "Elapsed time: " + str(elapsed_time) )
def main(directory, convert_directory, test, crop_size, extension):
    """Convert every ``jpeg``/``tiff`` file below *directory* into *convert_directory*.

    With *test* set, the converted and the original version of every image
    labelled ``1`` are shown first, waiting for the user between images.
    The conversion itself runs on a pool of ``N_PROC`` processes in batches
    of 500 files; each task is ``(convert, (directory, convert_directory,
    filename, crop_size, extension))`` mapped through ``process``.

    Raises:
        OSError: if *convert_directory* cannot be created and does not
            already exist as a directory.
    """
    try:
        os.mkdir(convert_directory)
    except OSError:
        # An existing output directory is fine; any other failure must not
        # be hidden.
        if not os.path.isdir(convert_directory):
            raise

    filenames = sorted(os.path.join(dp, f)
                       for dp, dn, fn in os.walk(directory)
                       for f in fn
                       if f.endswith(('jpeg', 'tiff')))

    if test:
        names = data.get_names(filenames)
        y = data.get_labels(names)
        for f, level in zip(filenames, y):
            if level == 1:
                try:
                    img = convert(f, crop_size)
                    img.show()
                    Image.open(f).show()
                    # raw_input on Python 2, input on Python 3
                    real_raw_input = vars(__builtins__).get('raw_input',input)
                    real_raw_input('enter for next')
                except KeyboardInterrupt:
                    exit(0)

    print("Resizing images in {} to {}, this takes a while."
          "".format(directory, convert_directory))

    # process in batches, sometimes weird things happen with Pool on my machine
    batchsize = 500
    # Ceiling division: no trailing empty batch when the number of files is
    # an exact multiple of the batch size (or zero).
    batches = (len(filenames) + batchsize - 1) // batchsize
    args = [(convert, (directory, convert_directory, f, crop_size, extension))
            for f in filenames]

    pool = Pool(N_PROC)
    try:
        for i in range(batches):
            print("batch {:>2} / {}".format(i + 1, batches))
            pool.map(process, args[i * batchsize: (i + 1) * batchsize])
        pool.close()
    finally:
        # map() is synchronous, so nothing is outstanding after a clean run;
        # on an error this stops the workers instead of leaking them.
        pool.terminate()
        pool.join()

    print('done')
def multi_proc5(self, batch):
    """Push all record ids through the input and output worker pools.

    The ids of the ``records`` table are split into at most four contiguous
    chunks (one per worker).  The first pool runs
    ``add_batch_ids_to_queue(chunk, batch_size=int(batch))`` for every chunk;
    the second pool then runs four ``read_id_from_queue`` workers.

    :param batch: batch size, converted with ``int()``.
    :return: elapsed wall-clock time in seconds.
    """
    start_time = datetime.datetime.now()

    result = db_connection.execute("select id from records")
    all_ids = [row[0] for row in result]

    workers = 4
    # Ceiling division keeps the number of chunks at or below the number of
    # workers and puts every id -- including the last one -- into a chunk.
    # max(1, ...) keeps the slicing step valid for fewer than four rows.
    chunk_size = max(1, -(-len(all_ids) // workers))
    record_ids = [all_ids[first:first + chunk_size]
                  for first in range(0, len(all_ids), chunk_size)]

    input_pool = Pool(workers)
    #Add id messages to input queue
    input_pool.map(partial(add_batch_ids_to_queue, batch_size=int(batch)), record_ids)
    input_pool.close()
    input_pool.join()

    output_pool = Pool(workers)
    #Read ids from input_queue, read message from DB and write it to output_queue
    worker_results = [output_pool.apply_async(read_id_from_queue, ())
                      for _ in range(workers)]
    output_pool.close()
    for r in worker_results:
        r.get() # This reports results, including errors, of workers
    output_pool.join() # This blocks until all the processes have finished

    end_time = datetime.datetime.now()
    time_taken = (end_time - start_time).total_seconds()
    return time_taken
def capture(interface,database_output_file,redraw_frequency,arp_resolve,
        dns_resolve,sender_lists,target_lists,color_profile,
        output_columns,display_false,pcap_output_file,force_sender,
        *args,**kwargs):
    """Sniff on *interface* and keep redrawing the result table until interrupted.

    Sniffing and (optionally) reverse DNS / ARP resolution run as tasks on a
    three-process pool; this process polls their results every 0.2 seconds
    and reprints the table built by ``get_output_table``.  The loop only ends
    through an exception, normally CTRL^C.  On exit the captured packets are
    written to *pcap_output_file* (when given) and the pool is shut down.

    ``database_output_file`` is passed to ``create_db`` and to every worker
    task.  The remaining display options are forwarded to
    ``get_output_table``.  ``*args``/``**kwargs`` are accepted and ignored.
    """

    dbfile = database_output_file

    # SIGINT is ignored only while the pool is being created; the previous
    # handler is restored right afterwards.
    # NOTE(review): presumably so that the workers ignore CTRL^C while this
    # process still receives it -- confirm.
    osigint = signal.signal(signal.SIGINT,signal.SIG_IGN)
    pool = Pool(3)
    signal.signal(signal.SIGINT, osigint)

    # Everything the except/finally clauses read is bound before the try
    # block, so a failure during setup surfaces as itself instead of being
    # masked by a NameError during cleanup.
    sess = None
    # Cache packets that will be written to output file
    pkts = []
    sniff_result = None
    arp_resolve_result, dns_resolve_result = None, None

    def build_table():
        # Render the current table from the database session.
        return get_output_table(
            sess,
            sender_lists=sender_lists,
            target_lists=target_lists,
            dns_resolve=dns_resolve,
            color_profile=color_profile,
            arp_resolve=arp_resolve,
            columns=output_columns,
            display_false=display_false,
            force_sender=force_sender)

    try:

        # ==============
        # START SNIFFING
        # ==============
        #
        # The sniffer is started in a distinct process because Scapy
        # will block forever when scapy.all.sniff is called. This allows
        # us to interrupt execution of the sniffer by terminating the
        # process.
        #
        # TODO: It may be easier to use threading. Pool methods were fresh
        # to me at the time of original development.

        ptable = None
        pcount = 0

        # Handle new database file. When verbose, alert user that a new
        # capture must occur prior to printing results.

        arp_resolution = ('disabled','enabled')[arp_resolve]
        dns_resolution = ('disabled','enabled')[dns_resolve]

        print('\x1b[2J\x1b[H\33[F')
        print(logo+'\n')
        print(f'Capture interface: {interface}')
        print(f'ARP resolution:    {arp_resolution}')
        print(f'DNS resolution:    {dns_resolution}')

        sess = create_db(dbfile)

        # ======================================
        # CREATE AN IP FOR THE CURRENT INTERFACE
        # ======================================

        iface_mac, iface_ips = get_interfaces()[interface]

        for ip in iface_ips:
            ip = get_or_create_ip(ip,
                    sess,
                    mac_address=iface_mac)

        if not Path(dbfile).exists():
            print('- Initializing capture\n- This may take time depending '
                'on network traffic and filter configurations')
        else:

            print(f'Requests analyzed: {pcount}\n')
            ptable = build_table()
            print(ptable)

        # Loop eternally
        while True:

            # Handle sniff results
            if sniff_result and sniff_result.ready():

                packets = sniff_result.get()
                sniff_result = None

                # Capture packets for the output file
                if pcap_output_file and packets:
                    pkts += packets

                if packets:
                    pcount += packets.__len__()

                # Clear the previous table from the screen using
                # escape sequences screen
                # https://stackoverflow.com/questions/5290994/remove-and-replace-printed-items/5291044#5291044
                if ptable:
                    lcount = ptable.split('\n').__len__()+2
                    stdout.write('\033[F\033[K'*lcount)

                ptable = build_table()

                print(f'Requests analyzed: {pcount}\n')
                print(ptable)

            # Do sniffing
            elif not sniff_result:

                sniff_result = pool.apply_async(
                    async_sniff,
                    (
                        interface,
                        redraw_frequency,
                        sender_lists,
                        target_lists,
                        database_output_file,
                    )
                )

            # ==================
            # DNS/ARP RESOLUTION
            # ==================

            # Do reverse resolution
            if dns_resolve:

                # Reset dns resolution results
                if not dns_resolve_result or dns_resolve_result.ready():

                    to_resolve = sess.query(IP) \
                            .filter(IP.reverse_dns_attempted != True) \
                            .count()

                    if to_resolve:

                        dns_resolve_result = pool.apply_async(
                            reverse_dns_resolve_ips,
                            (database_output_file,)
                        )

            # Do ARP resolution
            if arp_resolve:

                if not arp_resolve_result or arp_resolve_result.ready():

                    to_resolve = sess.query(IP) \
                            .filter(IP.arp_resolve_attempted != True) \
                            .count()

                    if to_resolve:

                        arp_resolve_result = pool.apply_async(
                            arp_resolve_ips,
                                (interface, database_output_file,)
                            )

            sleep(.2)

    except KeyboardInterrupt:

        print('\n- CTRL^C Caught...')
        if sess is not None:
            sess.close()

    finally:

        # ===================
        # HANDLE OUTPUT FILES
        # ===================

        if pcap_output_file:
            wrpcap(pcap_output_file,pkts)

        # =====================
        # CLOSE CHILD PROCESSES
        # =====================

        try:

            pool.close()

            # Give each running task up to five seconds to finish.
            if sniff_result:
                print('- Waiting for the sniffer process...',end='')
                sniff_result.wait(5)
                print('done')

            if dns_resolve_result:
                print('- Waiting for the DNS resolver process...',end='')
                dns_resolve_result.wait(5)
                print('done')

            if arp_resolve_result:
                print('- Waiting for the ARP resolver ocess...',end='')
                arp_resolve_result.wait(5)
                print('done')

        except KeyboardInterrupt:

            # A second CTRL^C while waiting: stop the workers immediately.
            pool.terminate()

        pool.join()
def evaluate_csv_right(self):
    """
    Evaluate the URLs listed in the input file.

    The input lines are read with ``read_file``; when there are more than
    10000 of them a fixed-seed random sample of 10000 is used.  Every line is
    treated as a URL with angle 0 and submitted to
    ``OnlineEvaluation.process_thread_right`` on a pool of 100 processes,
    together with the path of the CSV output file.
    """
    # Input files used with this routine before (all commented out in the
    # original): test_400_right, test_1000_right, random_1w_urls (read from
    # 'test_urls_files' as .csv); full-page photocopies, pure handwriting,
    # full-page queries, HW_TRAIN.out, biaozhu_fix.check, biaozhu_csv_out,
    # ordinary queries, and several small-image lists.
    in_file_name = "zjw_imgs_20210427_urls"  # small images
    # input file
    in_file = os.path.join(DATA_DIR, 'page_dataset_files', in_file_name + ".txt")
    print('[Info] in_file: {}'.format(in_file))

    data_lines = read_file(in_file)
    line_count = len(data_lines)
    print('[Info] 样本总量: {}'.format(line_count))
    if line_count == 0:
        print('[Info] 文件路径错误: {}'.format(in_file))
        return

    # test file: cap the number of samples
    n = 10000
    if len(data_lines) > n:
        random.seed(47)
        # random.seed(89)
        random.shuffle(data_lines)  # random sample
        data_lines = data_lines[:n]
    print('[Info] 样本数量: {}'.format(len(data_lines)))

    # test file: output location
    time_str = get_current_time_str()
    out_dir = os.path.join(DATA_DIR, "check_dir_20210329")
    mkdir_if_not_exist(out_dir)
    out_name = 'check_{}.{}.csv'.format(in_file_name, time_str)
    out_file = os.path.join(out_dir, out_name)

    # Filtering variant (disabled in the original): write '<name>_good.txt'
    # into 'xiaotu_dir' and the images into 'write_dir_<time>'.
    write_dir = None

    pool = Pool(processes=100)
    for idx, data_line in enumerate(data_lines):
        # variant 1 (disabled): skip the header row and split "url,angle"
        # variant 2: the line is the URL, the angle is 0
        url = data_line
        r_angle = 0
        task_args = (idx, url, r_angle, out_file, write_dir)
        try:
            pool.apply_async(OnlineEvaluation.process_thread_right, task_args)
            # image filtering variant (disabled):
            # pool.apply_async(OnlineEvaluation.process_save_img_url, (idx, url, r_angle, out_file, write_dir))
        except Exception as e:
            print('[Info] Error URL: {}'.format(url))
            continue

    pool.close()
    pool.join()

    print('[Info] 写入文件: {}'.format(out_file))
dftr = pd.DataFrame({'id': ids, 'train': 'train'}) tdftr = pd.DataFrame({'id': ids, 'train': 'test'}) train, test = DataProcess.train_test_between_subject( gdata, pd.concat((dftr, tdftr)), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) DLogger.logger().debug("total points: " + str(get_total_pionts(train))) worker = GQL.get_instance(2, 10, {}) train = DataProcess.merge_data(train) OptML.optimise(worker, output_path, train, test, global_iters=1000, learning_rate=learning_rate) if __name__ == '__main__': if len(sys.argv) == 2: n_proc = int(sys.argv[1]) elif len(sys.argv) == 1: n_proc = 1 else: raise Exception('invalid argument') p = Pool(n_proc) p.map(run_BD, range(len(configs))) p.close() # no more tasks p.join() # wrap up current tasks
def main(args):
    """Do it all.

    Runs every build returned by ``gather_builds(args)`` through three
    stages: "configure" (in a process pool), "build" (one worker process)
    and "test" (one worker process), then reports the failures of each stage
    and prints a final summary.

    Raises ``Fail`` when ``args.logs`` is not a directory.
    """
    if not os.path.isdir(args.logs):
        raise Fail("Logs location '%s' is not a directory." % args.logs)
    builds = gather_builds(args)
    if args.verbose:
        print("Lined up %d builds." % len(builds))

    # The "configure" step is single-threaded.  We can run many at the same
    # time, even when we're also running a "build" step at the same time.
    # This means we may run a lot more processes than we have CPUs, but there's
    # no law against that.  There's also I/O time to be covered.
    configure_pool = Pool()

    # Builds which have failed the "configure" stage, with their errors.  This
    # queue must never stall, so that we can let results pile up here while the
    # work continues.
    configure_fails = Queue(len(builds))

    # Waiting list for the "build" stage.  It contains Build objects,
    # terminated by a final None to signify that there are no more builds to be
    # done.
    build_queue = JoinableQueue(10)

    # Builds that have failed the "build" stage.
    build_fails = Queue(len(builds))

    # Waiting list for the "test" stage.  It contains Build objects, terminated
    # by a final None.
    test_queue = JoinableQueue(10)

    # The "build" step tries to utilise all CPUs, and it may use a fair bit of
    # memory.  Run only one of these at a time, in a single worker process.
    build_worker = Process(
        target=service_builds, args=(build_queue, build_fails, test_queue))
    build_worker.start()

    # Builds that have failed the "test" stage.
    test_fails = Queue(len(builds))

    # Completed builds.  This must never stall.
    done_queue = JoinableQueue(len(builds))

    # The "test" step can not run concurrently (yet).  So, run tests serially
    # in a single worker process.  It takes its jobs directly from the "build"
    # worker.
    test_worker = Process(
        target=service_tests, args=(test_queue, test_fails, done_queue))
    test_worker.start()

    # Feed all builds into the "configure" pool.  Each build which passes this
    # stage goes into the "build" queue.
    for build in builds:
        configure_pool.apply_async(
            build.do_configure, callback=partial(enqueue, build_queue, build),
            error_callback=partial(enqueue_error, configure_fails, build))
    if args.verbose:
        print("All jobs are underway.")
    # Wait for every configure job; only then can the end-of-queue marker be
    # appended to the build queue below.
    configure_pool.close()
    configure_pool.join()

    # TODO: Async reporting for faster feedback.
    configure_fail_count = report_failures(configure_fails, "CONFIGURE FAIL")
    if args.verbose:
        print("Configure stage done.")

    # Mark the end of the build queue for the build worker.
    build_queue.put(None)

    build_worker.join()
    # TODO: Async reporting for faster feedback.
    build_fail_count = report_failures(build_fails, "BUILD FAIL")
    if args.verbose:
        print("Build step done.")

    # Mark the end of the test queue for the test worker.
    test_queue.put(None)

    test_worker.join()
    # TODO: Async reporting for faster feedback.
    test_fail_count = report_failures(test_fails, "TEST FAIL")
    if args.verbose:
        print("Test step done.")

    # All done.  Clean up.
    for build in builds:
        build.clean_up()

    # A run counts as fully successful only if every build reached done_queue.
    ok_count = count_entries(done_queue)
    if ok_count == len(builds):
        print("All tests OK.")
    else:
        print(
            "Failures during configure: %d - build: %d - test: %d. OK: %d." % (
                configure_fail_count,
                build_fail_count,
                test_fail_count,
                ok_count,
            ))
def evaluate_regions(folder_predicted: str, folder_gt: str, regions: dict, processes=default_num_threads):
    """
    Score every predicted ``.nii.gz`` file against its ground truth and write ``summary.csv``.

    :param folder_predicted: folder with the predictions; ``summary.csv`` is written here
    :param folder_gt: folder with the ground-truth files (same file names)
    :param regions: mapping region name -> value passed to ``evaluate_case`` for that region
    :param processes: number of worker processes for the pool
    :raises AssertionError: if a predicted file has no ground-truth file of the same name

    The CSV has one row per case (one column per region), followed by the rows
    ``mean``, ``median``, ``mean (nan is 1)`` and ``median (nan is 1)``.
    """
    region_names = list(regions.keys())
    files_in_pred = subfiles(folder_predicted, suffix='.nii.gz', join=False)
    files_in_gt = subfiles(folder_gt, suffix='.nii.gz', join=False)

    # Sets make the two cross-checks linear instead of quadratic.
    gt_lookup = set(files_in_gt)
    pred_lookup = set(files_in_pred)
    have_no_gt = [i for i in files_in_pred if i not in gt_lookup]
    assert len(have_no_gt) == 0, "Some files in folder_predicted have not ground truth in folder_gt"
    have_no_pred = [i for i in files_in_gt if i not in pred_lookup]
    if len(have_no_pred) > 0:
        print("WARNING! Some files in folder_gt were not predicted (not present in folder_predicted)!")

    files_in_gt.sort()
    files_in_pred.sort()

    # run for all cases (only the predicted ones; unpredicted ground truth is ignored)
    full_filenames_gt = [folder_gt + "/" + i for i in files_in_pred]
    full_filenames_pred = [folder_predicted + "/" + i for i in files_in_pred]
    region_labels = list(regions.values())

    p = Pool(processes)
    try:
        res = p.starmap(evaluate_case,
                        zip(full_filenames_pred, full_filenames_gt, [region_labels] * len(files_in_pred)))
    finally:
        # Release the workers even when a case fails.
        p.close()
        p.join()

    all_results = {r: [] for r in region_names}
    with open(folder_predicted + "/" + 'summary.csv', 'w') as f:

        def write_row(label, values):
            # One CSV line: label followed by the per-region values.
            f.write(label)
            for value in values:
                f.write(",%02.4f" % value)
            f.write("\n")

        def nan_as_one(values):
            # Copy of the values with every NaN replaced by 1.
            tmp = np.array(values)
            tmp[np.isnan(tmp)] = 1
            return tmp

        f.write("casename")
        for r in region_names:
            f.write(",%s" % r)
        f.write("\n")

        for casename, result_here in zip(files_in_pred, res):
            dice_values = [result_here[k] for k in range(len(region_names))]
            for r, dc in zip(region_names, dice_values):
                all_results[r].append(dc)
            write_row(casename[:-7], dice_values)  # strip ".nii.gz"

        write_row('mean', [np.nanmean(all_results[r]) for r in region_names])
        write_row('median', [np.nanmedian(all_results[r]) for r in region_names])
        write_row('mean (nan is 1)', [np.mean(nan_as_one(all_results[r])) for r in region_names])
        write_row('median (nan is 1)', [np.median(nan_as_one(all_results[r])) for r in region_names])
def depth_first(root, n, tofind, best, top=False, master=None, lock=None):
    """
    Depth-first search for a candidate shorter than ``best``.

    Returns ``root`` when ``tofind`` is empty, ``None`` when
    ``len(root) + len(tofind)`` is already at least ``len(best)``, and
    otherwise the shortest result found among the branches produced by
    ``try_add`` -- or ``None`` if none of them is shorter than ``best``.

    When ``top`` is true and there is more than one branch, the branches are
    pickled and handed to ``depth_wrapper`` in a process pool, together with a
    Manager-backed value and lock.  Otherwise the branches are explored
    recursively in this process; if ``master`` is given it is read and
    updated under ``lock``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.
    """
    # If tofind is empty
    if not tofind:
        return root
    # If longer than the current best (assuming all unfound patterns can be
    # included with just one additional character each, which is the best case)
    elif (len(root) + len(tofind)) >= len(best):
        return None

    # Generate potential branches based on the current root
    potential = try_add(root, n, tofind, best)

    # Potential branches collected, explore each one
    new = best
    # If more than one branch and has not previously done so,
    # do multiprocessing
    if top and (len(potential) > 1):
        # Attach the search parameters to each branch before pickling it.
        for p in potential:
            p.n = n
            p.best = best
        args = [pickle.dumps(p) for p in potential]
        # Start processes and get results
        with Manager() as manager:
            # The pool is run to completion inside the Manager context, so
            # the proxies stay valid while the workers use them.
            master = manager.Value(str, best)
            lock = manager.RLock()
            func = partial(depth_wrapper, master=master, lock=lock)
            pool = Pool(processes=min(len(args), os.cpu_count()))
            async_result = pool.map_async(func, args)
            pool.close()
            pool.join()
            results = async_result.get()
        # Find shortest
        for r in results:
            if r is None:
                continue
            # Else, compare result to current best
            else:
                if len(r) < len(new):
                    new = r
    else:
        for p in potential:
            r = depth_first(p.root, n, p.tofind, new, top=top, master=master, lock=lock)
            # If None, branch is discarded as not being a better solution
            if r is None:
                continue
            # Else, compare result to current best
            else:
                if len(r) < len(new):
                    new = r
                    if master is not None:
                        # Publish the improvement, or adopt the shared value
                        # when it is at least as short as ours.
                        with lock:
                            if len(new) < len(master.get()):
                                master.set(new)
                            else:
                                new = master.get()
    # If a better solution was found, return it
    return new if len(new) < len(best) else None
class Sampler(object):
    """
    ABC population monte carlo sampler

    :param N: number of particles
    :param Y: observed data set
    :param postfn: model function (a callable), which creates a new dataset x for a given theta
    :param dist: distance function rho(X, Y) (a callable)
    :param threads: (optional) number of threads. If >1 and no pool is given <threads> multiprocesses will be started
    :param pool: (optional) a pool instance which has a <map> function
    """

    # Class used to propose particles in every iteration after the first,
    # and the extra argument passed to its constructor.
    particle_proposal_cls = ParticleProposal
    particle_proposal_kwargs = {}

    def __init__(self, N, Y, postfn, dist, threads=1, pool=None):
        self.N = N
        self.Y = Y
        self.postfn = postfn
        self.dist = dist
        self._random = np.random.mtrand.RandomState()

        # Always define the attribute so that it exists in single-thread mode too.
        self.pool = None
        if pool is not None:
            self.pool = pool
            self.mapFunc = self.pool.map
        elif threads == 1:
            self.mapFunc = map
        else:
            self.pool = Pool(threads)
            self.mapFunc = self.pool.map

    def _map_particles(self, func):
        """
        Map ``func`` over N fresh random seeds and unpack the (theta, dist, cnt) results.

        :returns: tuple (thetas, dists, cnts); cnts is the sum of the per-particle counts
        """
        seeds = self._random.randint(0, np.iinfo(np.uint32).max, self.N)
        res = list(self.mapFunc(func, seeds))
        thetas = np.array([theta for (theta, _, _) in res])
        dists = np.array([dist for (_, dist, _) in res])
        cnts = np.sum([cnt for (_, _, cnt) in res])
        return thetas, dists, cnts

    def sample(self, prior, eps_proposal, pool=None):
        """
        Launches the sampling process. Yields the intermediate results per iteration.

        :param prior: instance of a prior definition (or an other callable)  see :py:class:`sampler.GaussianPrior`
        :param eps_proposal: an instance of a threshold proposal (or an other callable) see :py:class:`sampler.ConstEps`
        :param pool: (optional) a PoolSpec instance,if not None the initial rejection sampling
            will be skipped and the pool is used for the further sampling

        :yields pool: yields a namedtuple representing the values of one iteration
        """
        if pool is None:
            # Initial iteration: rejection sampling with uniform weights.
            eps = eps_proposal.next()
            wrapper = _RejectionSamplingWrapper(self, eps, prior)
            thetas, dists, cnts = self._map_particles(wrapper)
            ws = np.ones(self.N) / self.N

            pool = PoolSpec(0, eps, self.N/cnts, thetas, dists, ws)
            yield pool

        for t, eps in enumerate(eps_proposal, pool.t + 1):
            particleProposal = self.particle_proposal_cls(self, eps, pool, self.particle_proposal_kwargs)
            thetas, dists, cnts = self._map_particles(particleProposal)

            # Re-weight the new particles against the previous population.
            sigma = 2 * weighted_cov(pool.thetas, pool.ws)
            wrapper = _WeightWrapper(prior, sigma, pool.ws, pool.thetas)

            wt = np.array(list(self.mapFunc(wrapper, thetas)))
            ws = wt/np.sum(wt)

            pool = PoolSpec(t, eps, self.N/cnts, thetas, dists, ws)
            yield pool

    def close(self):
        """
        Tries to close the pool (avoid hanging threads)
        """
        pool = getattr(self, "pool", None)
        if pool is not None:
            try:
                pool.close()
            except Exception:
                # Best effort: a user-supplied pool may not support close().
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                pass
cursor.rollback() cursor.close() if __name__ == '__main__': #开启进程,与逻辑核保持一致 connect_db = connect_db() filepath = r'D:\filename' table = 'table_name' t1 = time.time() pro_num = 10 #进程数 pool = Pool(processes=pro_num) job_result = [] #遍历文件夹读取所有文件 for file in os.listdir(filepath): filename = filepath + '\\' + file res = pool.apply_async(read_data, (filename, )) job_result.append(res) pool.close() #关闭进程池 pool.join() #合并所有读取的文件 get_result = pd.DataFrame() for tmp in job_result: get_result = get_result.append(tmp.get()) t2 = time.time() insert_data(connect_db, get_result, table) print('It took a total of %0.2f seconds.' % (t2 - t1))
else: print('Already Downloaded', file_path) except requests.ConnectionError: print('Failed to Save Image,item %s' % item) def main(offset): json=get_page(offset) for item in get_images(json): print(item) save_image(item) GROUP_START=1 GROUP_END=10 if __name__=='__main__': pool=Pool() groups=([x*20 for x in range(GROUP_START,GROUP_END+1)]) pool.map(main,groups) pool.close() pool.join() print('suc4')
df.to_csv( f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210621_prediction_CV{i}_Gamma_{round(best_gamma,4)}_C_{round(best_c)}_ACC_{test_accuracy}_F1_{test_f1}_AUC_{test_auc}.csv" ) df2 = pd.DataFrame(y_test) df2.to_csv( f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210324_GT_CV{i}.csv" ) print(f">>>>>>> CV = {i}/10, Over Training >>>>>>>\n") logger.info(f">>>>>>> CV = {i}/10,Over Training >>>>>>>") return [test_accuracy, test_f1, test_auc] if __name__ == '__main__': pool = Pool(int(os.getenv('N_PROC', os.cpu_count()))) futures = [pool.apply_async(func=svm, args=[i]) for i in range(1, 11)] pool.close() # 关闭pool,使其不在接受新的(主进程)任务 average_acc_test, average_f1_test, average_auc_test = [], [], [] for item in futures: result = item.get() average_acc_test.append(result[0]) average_f1_test.append(result[1]) average_auc_test.append(result[2]) print( f"Vggish Classification Average Results: Acc.= {mean(average_acc_test)}, F1 = {mean(average_f1_test)}, AUC = {mean(average_auc_test)}" ) print( f"average_acc_test = {average_acc_test},/n average_f1_test={average_f1_test},/n average_auc_test = {average_auc_test}" ) logger.info( f"Vggish Classification Average Results: Acc.= {mean(average_acc_test)}, F1 = {mean(average_f1_test)}, AUC = {mean(average_auc_test)}" )
local_image_url = item.get('image') new_image_url = local_image_url.replace('list', 'large') r = requests.get('http:' + new_image_url) if r.status_code == 200: file_path = img_path + os.path.sep + '{0}.{1}'.format( md5(r.content).hexdigest(), 'jpg') if not os.path.exists(file_path): with open(file_path, 'wb') as f: f.write(r.content) def saveToMongo(item): if db[MONGO_TABLE].insert(item): print('储存到MONGODB成功', item) return False def main(offset): json = getPage(offset) for item in getImage(json): saveImage(item) saveToMongo(item) if __name__ == '__main__': pool = Pool() groups = [x * 20 for x in range(2)] #爬取五页 pool.map(main, groups) pool.close() #关闭进程池(pool),使其不在接受新的任务 pool.join() #主进程阻塞等待子进程的退出
import random
from multiprocessing.pool import Pool
from time import sleep, time
import os


def run(name):
    """Simulate a task: announce start, sleep 1-4 seconds, announce end with the elapsed time."""
    print("%s子进程开始,进程ID:%d" % (name, os.getpid()))
    start = time()
    sleep(random.choice([1, 2, 3, 4]))
    end = time()
    # The specifier was written "0.2%f", which printed a literal "0.2"
    # followed by the unrounded duration; "%0.2f" gives two decimals.
    print("%s子进程结束,进程ID:%d。耗时%0.2f" % (name, os.getpid(), end - start))


if __name__ == "__main__":
    print("父进程开始")
    # The argument is the number of worker processes that may run at the
    # same time; it defaults to the number of CPU cores.
    p = Pool(8)
    for i in range(10):
        # Queue the task; the pool schedules it on a free worker.
        p.apply_async(run, args=(i, ))
    # close() must be called before join(), and no task may be added afterwards.
    p.close()
    # join() waits for all queued tasks to finish before the parent continues.
    p.join()
    print("父进程结束。")
def validate(self, do_mirroring: bool = True, use_sliding_window: bool = True, step_size: float = 0.5,
             save_softmax: bool = True, use_gaussian: bool = True, overwrite: bool = True,
             validation_folder_name: str = 'validation_raw', debug: bool = False, all_in_gpu: bool = False,
             segmentation_export_kwargs: dict = None):
    """
    Predict every validation case, export the segmentations and evaluate them.

    For each key of ``self.dataset_val`` the preprocessed data is combined with
    the one-hot encoded segmentation of the previous stage, predicted, and
    exported as ``<validation_folder_name>/<case>.nii.gz`` by a small process
    pool.  Afterwards ``aggregate_scores`` writes ``summary.json``,
    ``determine_postprocessing`` is run, and the ground-truth niftis are copied
    to ``<output_folder_base>/gt_niftis``.

    :param do_mirroring: use the mirror axes from ``self.data_aug_params`` when predicting
    :param use_sliding_window: forwarded to the prediction call
    :param step_size: forwarded to the prediction call
    :param save_softmax: also export the softmax as ``<case>.npz``
    :param use_gaussian: forwarded to the prediction call
    :param overwrite: not used by this implementation
    :param validation_folder_name: sub-folder of ``self.output_folder`` to write to
    :param debug: forwarded to ``determine_postprocessing``
    :param all_in_gpu: forwarded to the prediction call
    :param segmentation_export_kwargs: optional dict with ``force_separate_z``,
        ``interpolation_order`` and ``interpolation_order_z``; if None the values
        come from ``self.plans['segmentation_export_params']`` or built-in defaults
    """
    current_mode = self.network.training
    self.network.eval()

    assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
    if self.dataset_val is None:
        self.load_dataset()
        self.do_split()

    if segmentation_export_kwargs is None:
        if 'segmentation_export_params' in self.plans.keys():
            force_separate_z = self.plans['segmentation_export_params']['force_separate_z']
            interpolation_order = self.plans['segmentation_export_params']['interpolation_order']
            interpolation_order_z = self.plans['segmentation_export_params']['interpolation_order_z']
        else:
            force_separate_z = None
            interpolation_order = 1
            interpolation_order_z = 0
    else:
        force_separate_z = segmentation_export_kwargs['force_separate_z']
        interpolation_order = segmentation_export_kwargs['interpolation_order']
        interpolation_order_z = segmentation_export_kwargs['interpolation_order_z']

    output_folder = join(self.output_folder, validation_folder_name)
    maybe_mkdir_p(output_folder)

    if do_mirroring:
        mirror_axes = self.data_aug_params['mirror_axes']
    else:
        mirror_axes = ()

    pred_gt_tuples = []

    export_pool = Pool(2)
    results = []

    transpose_backward = self.plans.get('transpose_backward')

    for k in self.dataset_val.keys():
        properties = load_pickle(self.dataset[k]['properties_file'])
        data = np.load(self.dataset[k]['data_file'])['data']

        # concat segmentation of previous step
        seg_from_prev_stage = np.load(join(self.folder_with_segs_from_prev_stage,
                                           k + "_segFromPrevStage.npz"))['data'][None]

        print(data.shape)
        data[-1][data[-1] == -1] = 0
        data_for_net = np.concatenate((data[:-1], to_one_hot(seg_from_prev_stage[0], range(1, self.num_classes))))

        softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax(
            data_for_net, do_mirroring, mirror_axes, use_sliding_window, step_size, use_gaussian,
            all_in_gpu=all_in_gpu)[1]

        if transpose_backward is not None:
            softmax_pred = softmax_pred.transpose([0] + [i + 1 for i in transpose_backward])

        fname = properties['list_of_data_files'][0].split("/")[-1][:-12]

        if save_softmax:
            softmax_fname = join(output_folder, fname + ".npz")
        else:
            softmax_fname = None

        # There is a problem with python process communication that prevents us from communicating objects
        # larger than 2 GB between processes (basically when the length of the pickle string that will be sent is
        # communicated by the multiprocessing.Pipe object then the placeholder (%i I think) does not allow for
        # long enough strings). This could be fixed by changing i to l (for long) but that would require manually
        # patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that
        # will then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take
        # either filename or np.ndarray and will handle this automatically.
        if np.prod(softmax_pred.shape) > (2e9 / 4 * 0.85):  # *0.85 just to be safe
            # Write the temporary file next to the other outputs instead of
            # into whatever the current working directory happens to be.
            npy_file = join(output_folder, fname + ".npy")
            np.save(npy_file, softmax_pred)
            softmax_pred = npy_file

        results.append(export_pool.starmap_async(save_segmentation_nifti_from_softmax,
                                                 ((softmax_pred, join(output_folder, fname + ".nii.gz"),
                                                   properties, interpolation_order, self.regions_class_order,
                                                   None, None,
                                                   softmax_fname, None, force_separate_z,
                                                   interpolation_order_z),
                                                  )
                                                 )
                       )

        pred_gt_tuples.append([join(output_folder, fname + ".nii.gz"),
                               join(self.gt_niftis_folder, fname + ".nii.gz")])

    # Wait for all exports; get() re-raises any exception from a worker.
    _ = [i.get() for i in results]

    task = self.dataset_directory.split("/")[-1]
    job_name = self.experiment_name
    _ = aggregate_scores(pred_gt_tuples, labels=list(range(self.num_classes)),
                         json_output_file=join(output_folder, "summary.json"), json_name=job_name,
                         json_author="Fabian", json_description="",
                         json_task=task)

    # in the old nnunet we would stop here. Now we add a postprocessing. This postprocessing can remove everything
    # except the largest connected component for each class. To see if this improves results, we do this for all
    # classes and then rerun the evaluation. Those classes for which this resulted in an improved dice score will
    # have this applied during inference as well
    self.print_to_log_file("determining postprocessing")
    determine_postprocessing(self.output_folder, self.gt_niftis_folder, validation_folder_name,
                             final_subf_name=validation_folder_name + "_postprocessed", debug=debug)
    # after this the final predictions for the validation set can be found in validation_folder_name_base +
    # "_postprocessed". They are always in that folder, even if no postprocessing was applied!

    # determining postprocessing on a per-fold basis may be OK for this fold but what if another fold finds another
    # postprocessing to be better? In this case we need to consolidate. At the time the consolidation is going to be
    # done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a separate folder to
    # be used later
    gt_nifti_folder = join(self.output_folder_base, "gt_niftis")
    maybe_mkdir_p(gt_nifti_folder)
    for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"):
        # Retry the copy up to 10 times, one second apart, on OSError.
        success = False
        attempts = 0
        while not success and attempts < 10:
            try:
                shutil.copy(f, gt_nifti_folder)
                success = True
            except OSError:
                attempts += 1
                sleep(1)

    self.network.train(current_mode)
    export_pool.close()
    export_pool.join()
def encodeHVLAD_(images, encoder, dmd_options, level):
    """
    Encode each image as one descriptor row built from three per-cluster statistics.

    ``computeSDMD(img, dmd_options, level)`` is evaluated for every image in a
    process pool.  Each transposed feature matrix is quantised against
    ``encoder['centers']`` with ``kmeans_quantize``; for every cluster three
    vectors are accumulated (Vm, Vc and Vs: first-, second- and third-order
    differences between the statistics around the cluster mean and around the
    cluster centre), power-normalised, stacked and flattened.

    :param images: iterable of images accepted by ``computeSDMD``
    :param encoder: object whose ``tolist()`` yields a mapping with key ``'centers'``
        # presumably a 0-d object array wrapping a dict -- confirm against caller
    :param dmd_options: forwarded to ``computeSDMD``
    :param level: forwarded to ``computeSDMD``
    :returns: float32 array with one row per image; zero rows if ``images`` is empty
    """
    encoder = encoder.tolist()
    centers = encoder['centers']

    pool = Pool(processes=8)
    features = [pool.apply_async(computeSDMD, args=(img, dmd_options, level)) for img in images]
    pool.close()
    pool.join()

    print('转到encodeHvlad')
    n_cluster = centers.shape[0]
    descrs = []
    for feature in features:
        # float32, C-contiguous copy of the transposed feature matrix
        new_features = np.ascontiguousarray(feature.get().T, dtype=np.float32)
        predicted_labels = kmeans_quantize(data=new_features, centers=centers)
        n_feature = new_features.shape[1]

        Vm = np.zeros([n_cluster, n_feature], dtype=np.float32)
        Vc = np.zeros([n_cluster, n_feature], dtype=np.float32)
        Vs = np.zeros([n_cluster, n_feature], dtype=np.float32)
        for i in range(n_cluster):
            members = predicted_labels == i
            Ni = np.sum(members)
            if Ni > 0:
                i_features = new_features[members, :]
                mi = np.mean(i_features, axis=0)
                dev_mean = i_features - mi
                dev_center = i_features - centers[i]
                var_mean = (1 / Ni) * np.sum(dev_mean ** 2, axis=0)
                var_center = (1 / Ni) * np.sum(dev_center ** 2, axis=0)
                Vm[i] = Ni * (mi - centers[i])
                Vc[i] = var_mean - var_center
                Vs[i] = ((1 / Ni) * np.sum(dev_mean ** 3, axis=0)) / np.maximum(var_mean ** 1.5, 1e-12) - \
                        ((1 / Ni) * np.sum(dev_center ** 3, axis=0)) / np.maximum(var_center ** 1.5, 1e-12)

        # power normalization, also called square-rooting normalization
        Vm = np.sign(Vm) * np.sqrt(np.abs(Vm))
        Vc = np.sign(Vc) * np.sqrt(np.abs(Vc))
        Vs = np.sign(Vs) * np.sqrt(np.abs(Vs))

        # Vs was computed and normalised but Vc was stacked twice in its
        # place; the descriptor length is the same either way.
        descrs.append(np.vstack((Vm, Vc, Vs)).flatten()[None, :])

    if not descrs:
        # No images: return an empty matrix of the usual width instead of crashing.
        return np.zeros((0, 3 * centers.size), dtype=np.float32)
    # Concatenate once instead of re-copying the result for every image.
    return np.concatenate(descrs, axis=0).astype(np.float32)
class PyRAMmp():
    '''
    The PyRAMmp class sets up and runs a multiprocessing pool to enable
    parallel PyRAM model runs.

    Outputs of finished runs are collected in ``results``; exceptions raised
    by failed runs are collected in ``errors``.
    '''

    def __init__(self, processes=None, maxtasksperchild=None):
        '''
        Initialise the pool and variable lists.
        processes and maxtasksperchild are passed to the pool.
        '''
        self.pool = Pool(processes=processes, maxtasksperchild=maxtasksperchild)
        self.results = []  # Results from PyRAM.run()
        self.errors = []  # Exceptions raised by failed runs
        self._outputs = []  # New outputs from PyRAM.run() for transfer to self.results
        self._failures = []  # New exceptions for transfer to self.errors
        self._waiting = []  # Waiting runs
        self._num_waiting = 0  # Number of waiting runs
        self._num_active = 0  # Number of active runs
        self._sleep_time = 1e-2  # Minimum sleep time between adding runs to pool
        self._new = True  # Flag to indicate ready for new set of runs

    def submit_runs(self, runs):
        '''
        Submit new runs to the pool as resources become available
        runs is a list of PyRAM input tuples (args, kwargs)
        '''
        # Add to waiting list
        self._waiting.extend(runs)
        self._num_waiting = len(self._waiting)

        # Check how many active runs have finished
        for _ in range(len(self._outputs)):
            self.results.append(self._outputs.pop(0))
            self._num_active -= 1
        # Failed runs also free a slot; without this, _wait() never returns
        # once a run has raised.
        for _ in range(len(self._failures)):
            self.errors.append(self._failures.pop(0))
            self._num_active -= 1

        num_start = self.pool._processes - self._num_active
        num_start = min(num_start, self._num_waiting)

        # Start new runs if processes are free
        for _ in range(num_start):
            run = self._waiting.pop(0)
            self.pool.apply_async(run_pyram, args=(run, ),
                                  callback=self._get_output,
                                  error_callback=self._get_error)
            self._num_active += 1
        self._num_waiting = len(self._waiting)

        # Only the outermost call waits; the calls made from _wait() see
        # _new == False and return immediately.
        if self._new:
            self._new = False
            self._wait()

    def _wait(self):
        '''
        Wait for all submitted runs to complete.
        '''
        while self._num_active > 0:
            self.submit_runs([])
            sleep(self._sleep_time)

        self._new = True

    def close(self):
        '''
        Close the pool and wait for all processes to finish.
        '''
        self.pool.close()
        self.pool.join()

    def _get_output(self, output):
        '''
        Get a PyRAM output.
        '''
        self._outputs.append(output)

    def _get_error(self, error):
        '''
        Get the exception raised by a failed PyRAM run.
        '''
        self._failures.append(error)

    def __del__(self):
        # The pool attribute is missing if __init__ failed before creating it.
        if getattr(self, 'pool', None) is not None:
            self.close()
def authorate(arguments):
    """Main function which delegates to fabric tasks.

    Dispatches on the sub-command flag set in ``arguments`` (``load``,
    ``process``, ``classify`` or ``test``) and returns a status code:
    0 on success, 1 when no sub-command is given, 2 when the ``--prefix``
    path does not exist, 3 when loading one of the paths fails.
    """
    global engine
    global VERBOSE

    engine = create_engine('sqlite:///' + arguments['--db'])
    create_db(engine)
    VERBOSE = arguments['--verbose']
    multi_thread = not arguments['--one']

    if arguments['-C']:
        classify.classifiers_dir = arguments['-C']

    # Assume successful return value
    status = 0

    if arguments['load']:
        # Import the words and word counts if the table is still empty.
        session = get_session(engine)
        if not session.query(Word_Count).all():
            subprocess.call('sqlite3 ' + arguments['--db'] + ' < import_words.sql', shell=True)

        prefix = arguments['--prefix']
        if not os.path.exists(prefix):
            display_error(
                "The given prefix does not exist: {path}".format(path=prefix))
            status = 2
        else:
            # Determine how many snippets to get per path.
            snippets_count = arguments['<snippets-per-path>'] or DEFAULT_SNIPPETS_COUNT

            pool = Pool(cpu_count() if multi_thread else 1)
            with open(arguments['<paths-file>'], 'r') as paths_file:
                for raw_path in paths_file.readlines():
                    loaded = load_path(pool, raw_path.rstrip(), prefix=prefix,
                                       multi_thread=multi_thread)
                    if not loaded:
                        status = 3

            # Join the pool
            pool.close()
            pool.join()

    elif arguments['process']:
        # Cleanup the classifier dir
        classify.clean_classifier_dir()

        # Get and scale data from snippets
        session = get_session(engine)
        book_snippets = session.query(Book, Snippet).join(Snippet).all()
        data = [text_to_vector(snip.text, session) for _, snip in book_snippets]
        scaler = classify.create_and_save_scaler(data)
        scaled_data = scaler.transform(data)
        targets = [book.path_id for book, _ in book_snippets]

        # Train the classifiers
        for classifier_cls, cls_kwargs in classify.classifier_types:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                model = classifier_cls(**cls_kwargs)
                model.fit(scaled_data, targets)
                classify.save_classifier(model)

    elif arguments['classify']:
        # Read the snippet from the given file, or from stdin ('-').
        snip_file = arguments['<snippet-file>']
        input_files = [snip_file or '-']
        lines = [unicode(line.rstrip(), errors='ignore')
                 for line in fileinput.input(input_files)]
        classify.classify_all(engine, " ".join(lines))

    elif arguments['test']:
        session = get_session(engine)
        book_snippets = session.query(Book, Snippet).join(Snippet).all()
        if VERBOSE:
            print("Converting raw data to vectors. . .")
        data = [text_to_vector(snip.text, session) for _, snip in book_snippets]
        targets = [book.path_id for book, _ in book_snippets]
        classify.test_all(engine, data, targets)

    else:
        display_error("No subcommand given.")
        status = 1

    return status
def ensemble(training_output_folder1, training_output_folder2, output_folder, task, validation_folder, folds,
             allow_ensembling: bool = True):
    """
    Ensemble the validation predictions of two trained networks and evaluate the result.

    For every fold the ``.npz`` softmax files of both networks are paired and
    merged (``merge``) into ``<output_folder>/ensembled_raw``.  If no
    ``summary.json`` exists there yet, the merged segmentations are scored with
    ``aggregate_scores``.  If ``allow_ensembling`` is true and no
    ``postprocessing.json`` exists, postprocessing is determined and the
    resulting summary is copied to the shared ``summary_jsons`` folder.

    :raises AssertionError: if a validation folder or its ``summary.json`` is
        missing, if ``.npz`` files are missing, or if the file names of the two
        networks do not match
    """
    print("\nEnsembling folders\n", training_output_folder1, "\n", training_output_folder2)

    output_folder_base = output_folder
    output_folder = join(output_folder_base, "ensembled_raw")

    # only_keep_largest_connected_component is the same for all stages
    dataset_directory = join(preprocessing_output_dir, task)
    plans = load_pickle(join(training_output_folder1, "plans.pkl"))  # we need this only for the labels

    files1 = []
    files2 = []
    property_files = []
    out_files = []
    gt_segmentations = []

    folder_with_gt_segs = join(dataset_directory, "gt_segmentations")
    # in the correct shape and we need the original geometry to restore the niftis

    for f in folds:
        validation_folder_net1 = join(training_output_folder1, "fold_%d" % f, validation_folder)
        validation_folder_net2 = join(training_output_folder2, "fold_%d" % f, validation_folder)

        if not isdir(validation_folder_net1):
            raise AssertionError("Validation directory missing: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net1)
        if not isdir(validation_folder_net2):
            raise AssertionError("Validation directory missing: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net2)

        # we need to ensure the validation was successful. We can verify this via the presence of the summary.json file
        if not isfile(join(validation_folder_net1, 'summary.json')):
            raise AssertionError("Validation directory incomplete: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net1)
        if not isfile(join(validation_folder_net2, 'summary.json')):
            # The directory exists at this point, so it is incomplete, not missing.
            raise AssertionError("Validation directory incomplete: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net2)

        patient_identifiers1_npz = [i[:-4] for i in subfiles(validation_folder_net1, False, None, 'npz', True)]
        patient_identifiers2_npz = [i[:-4] for i in subfiles(validation_folder_net2, False, None, 'npz', True)]

        # we don't do postprocessing anymore so there should not be any of that noPostProcess
        patient_identifiers1_nii = [i[:-7] for i in subfiles(validation_folder_net1, False, None, suffix='nii.gz', sort=True) if not i.endswith("noPostProcess.nii.gz") and not i.endswith('_postprocessed.nii.gz')]
        patient_identifiers2_nii = [i[:-7] for i in subfiles(validation_folder_net2, False, None, suffix='nii.gz', sort=True) if not i.endswith("noPostProcess.nii.gz") and not i.endswith('_postprocessed.nii.gz')]

        # Every exported segmentation needs its softmax (.npz) file.
        if not set(patient_identifiers1_nii).issubset(patient_identifiers1_npz):
            raise AssertionError("Missing npz files in folder %s. Please run the validation for all models and folds with the '--npz' flag." % (validation_folder_net1))
        if not set(patient_identifiers2_nii).issubset(patient_identifiers2_npz):
            raise AssertionError("Missing npz files in folder %s. Please run the validation for all models and folds with the '--npz' flag." % (validation_folder_net2))

        patient_identifiers1_npz.sort()
        patient_identifiers2_npz.sort()

        assert all([i == j for i, j in zip(patient_identifiers1_npz, patient_identifiers2_npz)]), "npz filenames do not match. This should not happen."

        os.makedirs(output_folder, exist_ok=True)

        for patient in patient_identifiers1_npz:
            files1.append(join(validation_folder_net1, patient + '.npz'))
            files2.append(join(validation_folder_net2, patient + '.npz'))
            property_files.append(join(validation_folder_net1, patient) + ".pkl")
            out_files.append(join(output_folder, patient + ".nii.gz"))
            gt_segmentations.append(join(folder_with_gt_segs, patient + ".nii.gz"))

    merge_pool = Pool(default_num_threads)
    try:
        merge_pool.map(merge, zip(files1, files2, property_files, out_files))
    finally:
        # Release the workers even when merging a case fails.
        merge_pool.close()
        merge_pool.join()

    if not isfile(join(output_folder, "summary.json")) and len(out_files) > 0:
        aggregate_scores(tuple(zip(out_files, gt_segmentations)), labels=plans['all_classes'],
                         json_output_file=join(output_folder, "summary.json"), json_task=task,
                         json_name=task + "__" + os.path.basename(output_folder_base), num_threads=default_num_threads)

    if allow_ensembling and not isfile(join(output_folder_base, "postprocessing.json")):
        # now lets also look at postprocessing. We cannot just take what we determined in cross-validation and apply it
        # here because things may have changed and may also be too inconsistent between the two networks
        determine_postprocessing(output_folder_base, folder_with_gt_segs, "ensembled_raw", "temp",
                                 "ensembled_postprocessed", default_num_threads, dice_threshold=0)

        out_dir_all_json = join(network_training_output_dir, "summary_jsons")
        json_out = load_json(join(output_folder_base, "ensembled_postprocessed", "summary.json"))

        json_out["experiment_name"] = os.path.basename(output_folder_base)
        save_json(json_out, join(output_folder_base, "ensembled_postprocessed", "summary.json"))

        os.makedirs(out_dir_all_json, exist_ok=True)
        shutil.copy(join(output_folder_base, "ensembled_postprocessed", "summary.json"),
                    join(out_dir_all_json, "%s__%s.json" % (task, os.path.basename(output_folder_base))))
# If the pool is not full yet, a new process is created to run the request;
# otherwise the request waits until a process in the pool finishes before a
# new one is created.
import os
import time
from multiprocessing.pool import Pool
from random import random


def task(task_name):
    """Simulate a task that takes up to three seconds and return a completion message."""
    print("开始我的新任务啦....", task_name, os.getpid())
    began = time.time()
    time.sleep(random() * 3)
    elapsed = time.time() - began
    message = "我的任务--{}--完成啦...耗时{},进程{}".format(task_name, elapsed, os.getpid())
    return message


def callback_func(n):
    """Print the value returned by a finished task."""
    print(n)


if __name__ == "__main__":
    # Process pool with five workers
    pool = Pool(5)
    # The same eight task names are queued twice.
    tasks = ["听音乐", "吃饭", "打游戏", "看孩子", "做饭", "跑步", "学习", "打架"] * 2
    for t in tasks:
        # Asynchronous, non-blocking submission
        pool.apply_async(task, args=(t,), callback=callback_func)

    pool.close()  # no more tasks will be added to the pool
    pool.join()  # block the main process until the workers are done
os.mkdir(item.get('title')) try: response = requests.get(item.get('image')) # 请求图片的网址链接 if response.status_code == 200: file_path = '{0}/{1}.{2}'.format(item.get('title'),md5(response.content).hexdigest(),'jpg') # 0表示文件夹名,1表示文件名,用图片内容的md5值,2表示jpg格式 if not os.path.exists(file_path): with open(file_path,'wb') as f: f.write(response.content) else: print("Already downloaded",file_path) except requests.ConnectionError: print('Failed to Save Image') from multiprocessing.pool import Pool # python进程池 def main(offset): json = get_page(offset) for item in get_images(json): print(item) save_image(item) GROUP_START = 0 GROUP_END = 20 if __name__ == '__main__': pool = Pool() groups = ([x * 20 for x in range(GROUP_START, GROUP_END+1)]) pool.map(main,groups) pool.close() # 关闭进程池,表示不能在往进程池中添加进程 pool.join() # 等待进程池中的所有进程执行完毕,必须在close()之后调用
def _encode_pyramid_level(pending_features, centers):
    """Encode every image of one Gaussian-pyramid level as one descriptor row.

    For each image the (transposed) feature matrix is quantized against
    ``centers``.  For every cluster that received at least one feature two
    vectors are computed:

    * ``Vm`` - count of assigned features times (their mean - cluster center);
    * ``Vc`` - mean squared deviation around the feature mean minus the mean
      squared deviation around the cluster center.

    Both are power-normalized (signed square root), stacked and flattened.

    :param pending_features: iterable of ``AsyncResult`` objects, one per
        image, whose ``get()`` returns a 2-D feature array
        (features x patches according to the original author's notes).
    :param centers: 2-D array of cluster centers (clusters x features).
    :return: float32 array with one row per image; an empty
        ``(0, 2 * centers.size)`` array when there are no images.
    """
    n_cluster = centers.shape[0]
    rows = []
    for pending in pending_features:
        # Transpose so that each row is one patch descriptor.
        features = pending.get().T
        # kmeans_quantize gets a fresh float32 copy of the features.
        new_features = np.zeros(features.shape, dtype=np.float32)
        new_features[:, :] = features
        # Index of the nearest cluster center for every patch.
        predicted_labels = kmeans_quantize(data=new_features, centers=centers)

        n_feature = features.shape[1]
        Vm = np.zeros([n_cluster, n_feature], dtype=np.float32)
        Vc = np.zeros([n_cluster, n_feature], dtype=np.float32)
        for i in range(n_cluster):
            assigned = predicted_labels == i
            Ni = np.sum(assigned)
            if Ni > 0:
                i_features = features[assigned, :]
                mi = np.mean(i_features, axis=0)
                Vm[i] = Ni * (mi - centers[i])
                Vc[i] = (1 / Ni) * np.sum((i_features - mi) ** 2, axis=0) - (1 / Ni) * np.sum(
                    (i_features - centers[i]) ** 2, axis=0)

        # Power normalization, also called square-rooting normalization.
        Vm = np.sign(Vm) * np.sqrt(np.abs(Vm))
        Vc = np.sign(Vc) * np.sqrt(np.abs(Vc))

        rows.append(np.vstack((Vm, Vc)).flatten()[None, :])

    if not rows:
        return np.zeros((0, 2 * centers.size), dtype=np.float32)
    # One concatenation instead of growing the array image by image.
    return np.concatenate(rows, axis=0)


def encodeHVLAD(images, encoder, dmd_options):
    """Compute the hierarchical VLAD descriptors of ``images``.

    ``computeSDMD`` is evaluated in a pool of four worker processes for every
    image at pyramid levels 0, 1 and 2.  Each level is encoded against
    ``encoder['centers']`` and the three per-level encodings are averaged.

    :param images: re-iterable collection of images (iterated once per level).
    :param encoder: mapping that provides the cluster centers under
        ``'centers'``.
    :param dmd_options: options forwarded unchanged to ``computeSDMD``.
    :return: float32 array with one descriptor row per image.
    """
    pool = Pool(processes=4)
    # Submission order is kept: all images of level 0, then level 1, then 2.
    pending_by_level = [
        [pool.apply_async(computeSDMD, args=(img, dmd_options, level)) for img in images]
        for level in range(3)
    ]
    pool.close()
    pool.join()

    centers = encoder['centers']
    print('进入HVLAD')

    l_descrs, m_descrs, s_descrs = [
        _encode_pyramid_level(pending, centers) for pending in pending_by_level
    ]

    descrs = (l_descrs + m_descrs + s_descrs) / 3
    return descrs.astype(np.float32)
def _log_command_output(cmd):
    """Start ``cmd`` and forward its combined stdout/stderr to the project log."""
    # Solution for adding subprocess output to log is from
    # https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging
    process = Popen(cmd, stdout=PIPE, stderr=STDOUT)
    with process.stdout:
        uu.log_subprocess_output(process.stdout)


def mp_plantation_preparation(gadm_index_shp, planted_index_shp):
    """Create 10x10 degree planted-forest gain rate, type and stdev tiles.

    The work can start from three entry points, selected by the tile index
    arguments:

    1. no index shapefile at all: 1x1 country extent tiles are rasterized
       from GADM first;
    2. a GADM 1x1 tile index shapefile is supplied: the 1x1 country tile list
       is read from it;
    3. a planted forest 1x1 tile index shapefile is supplied: the 1x1 growth
       rate, type and standard deviation tiles are created from it.

    All entry points then mosaic the 1x1 tiles into 10x10 degree tiles.

    :param gadm_index_shp: name (prefix) of the GADM 1x1 tile index shapefile.
    :param planted_index_shp: name (prefix) of the planted forest 1x1 tile
        index shapefile.
    """
    os.chdir(cn.docker_base_dir)

    # ## Not actually using this but leaving it here in case I want to add this functionality eventually. This
    # # was to allow users to run plantations for a select (contiguous) area rather than for the whole planet.
    # # List of bounding box coordinates
    # bound_list = args.bounding_box
    # # Checks if bounding box coordinates are in multiples of 10 (10 degree tiles). If they're not, the script stops.
    # for bound in bound_list:
    #     if bound%10:
    #         uu.exception_log(bound, 'not a multiple of 10. Please make bounding box coordinates are multiples of 10.')

    # NOTE(review): `gadm_index_path`, `planted_index_path` and `args` are not
    # parameters of this function and are not defined in the visible code;
    # presumably they are module-level globals - confirm against the caller.

    # Checks the validity of the two arguments. If either one is invalid, the script ends.
    if (gadm_index_path not in cn.gadm_plant_1x1_index_dir
            or planted_index_path not in cn.gadm_plant_1x1_index_dir):
        uu.exception_log(
            'Invalid inputs. Please provide None or s3 shapefile locations for both arguments.'
        )

    # List of all possible 10x10 Hansen tiles except for those at very extreme latitudes (not just WHRC biomass tiles)
    total_tile_list = uu.tile_list_s3(cn.pixel_area_dir)
    uu.print_log("Number of possible 10x10 tiles to evaluate:", len(total_tile_list))

    # Removes the latitude bands that don't have any planted forests in them according to Liz Goldman.
    # i.e., Liz Goldman said by Slack on 1/2/19 that the nothernmost planted forest is 69.5146 and the southernmost is -46.938968.
    # This creates a more focused list of 10x10 tiles to iterate through (removes ones that definitely don't have planted forest).
    # NOTE: If the planted forest gdb is updated, the list of latitudes to exclude below may need to be changed to not exclude certain latitude bands.
    excluded_lat_bands = ('90N', '80N', '50S', '60S', '70S', '80S')
    planted_lat_tile_list = [
        tile for tile in total_tile_list
        if not any(band in tile for band in excluded_lat_bands)
    ]
    # planted_lat_tile_list = ['10N_080W']

    uu.print_log(planted_lat_tile_list)
    uu.print_log(
        "Number of 10x10 tiles to evaluate after extreme latitudes have been removed:",
        len(planted_lat_tile_list))

    # If a planted forest extent 1x1 tile index shapefile isn't supplied
    if 'None' in args.planted_tile_index:

        ### Entry point 1:
        # If no shapefile of 1x1 tiles for countries with planted forests is supplied, 1x1 tiles of country extents will be created.
        # This runs the process from the very beginning and will take a few days.
        if 'None' in args.gadm_tile_index:

            uu.print_log(
                "No GADM 1x1 tile index shapefile provided. Creating 1x1 planted forest country tiles from scratch..."
            )

            # Downloads and unzips the GADM shapefile, which will be used to create 1x1 tiles of land areas
            uu.s3_file_download(cn.gadm_path, cn.docker_base_dir)
            _log_command_output(['unzip', cn.gadm_zip])

            # Creates a new GADM shapefile with just the countries that have planted forests in them.
            # This limits creation of 1x1 rasters of land area on the countries that have planted forests rather than on all countries.
            # NOTE: If the planted forest gdb is updated and has new countries added to it, the planted forest country list
            # in constants_and_names.py must be updated, too.
            uu.print_log(
                "Creating shapefile of countries with planted forests...")
            os.system(
                '''ogr2ogr -sql "SELECT * FROM gadm_3_6_adm2_final WHERE iso IN ({0})" {1} gadm_3_6_adm2_final.shp'''
                .format(str(cn.plantation_countries)[1:-1], cn.gadm_iso))

            # Creates 1x1 degree tiles of countries that have planted forests in them.
            # I think this can handle using 50 processors because it's not trying to upload files to s3 and the tiles are small.
            # This takes several days to run because it iterates through at least 250 10x10 tiles.
            # For multiprocessor use.
            processes = 50
            uu.print_log('Rasterize GADM 1x1 max processors=', processes)
            pool = Pool(processes)
            pool.map(plantation_preparation.rasterize_gadm_1x1,
                     planted_lat_tile_list)
            pool.close()
            pool.join()

            # # Creates 1x1 degree tiles of countries that have planted forests in them.
            # # For single processor use.
            # for tile in planted_lat_tile_list:
            #
            #     plantation_preparation.rasterize_gadm_1x1(tile)

            # Creates a shapefile of the boundaries of the 1x1 GADM tiles in countries with planted forests
            os.system('''gdaltindex {0}_{1}.shp GADM_*.tif'''.format(
                cn.pattern_gadm_1x1_index, uu.date_time_today))
            _log_command_output([
                'aws', 's3', 'cp', cn.docker_base_dir,
                cn.gadm_plant_1x1_index_dir, '--exclude', '*', '--include',
                '{}*'.format(cn.pattern_gadm_1x1_index), '--recursive'
            ])

            # # Saves the 1x1 country extent tiles to s3
            # # Only use if the entire process can't run in one go on the spot machine
            # cmd = ['aws', 's3', 'cp', cn.docker_base_dir, 's3://gfw2-data/climate/carbon_model/temp_spotmachine_output/', '--exclude', '*', '--include', 'GADM_*.tif', '--recursive']
            # process = Popen(cmd, stdout=PIPE, stderr=STDOUT)
            # with process.stdout:
            #     uu.log_subprocess_output(process.stdout)

            # Delete the aux.xml files
            os.system('''rm GADM*.tif.*''')

            # List of all 1x1 degree countey extent tiles created
            gadm_list_1x1 = uu.tile_list_spot_machine(".", "GADM_")
            uu.print_log(
                "List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:",
                gadm_list_1x1)
            uu.print_log(len(gadm_list_1x1))

        ### Entry point 2:
        # If a shapefile of the boundaries of 1x1 degree tiles of countries with planted forests is supplied,
        # a list of the 1x1 tiles is created from the shapefile.
        # This avoids creating the 1x1 country extent tiles all over again because the relevant tile extent are supplied
        # in the shapefile.
        elif cn.gadm_plant_1x1_index_dir in args.gadm_tile_index:

            uu.print_log(
                "Country extent 1x1 tile index shapefile supplied. Using that to create 1x1 planted forest tiles..."
            )

            uu.print_log('{}/'.format(gadm_index_path))

            # Copies the shapefile of 1x1 tiles of extent of countries with planted forests
            _log_command_output([
                'aws', 's3', 'cp', '{}/'.format(gadm_index_path),
                cn.docker_base_dir, '--recursive', '--exclude', '*',
                '--include', '{}*'.format(gadm_index_shp)
            ])

            # Gets the attribute table of the country extent 1x1 tile shapefile
            gadm = glob.glob('{}*.dbf'.format(cn.pattern_gadm_1x1_index))[0]

            # Converts the attribute table to a dataframe
            dbf = Dbf5(gadm)
            df = dbf.to_dataframe()

            # Converts the column of the dataframe with the names of the tiles (which contain their coordinates) to a list
            gadm_list_1x1 = df['location'].tolist()
            gadm_list_1x1 = [str(y) for y in gadm_list_1x1]
            uu.print_log(
                "List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:",
                gadm_list_1x1)
            uu.print_log("There are", len(gadm_list_1x1),
                         "1x1 country extent tiles to iterate through.")

        # In case some other arguments are provided
        else:
            uu.exception_log(
                'Invalid GADM tile index shapefile provided. Please provide a valid shapefile.'
            )

        # Creates 1x1 degree tiles of plantation growth wherever there are plantations.
        # Because this is iterating through all 1x1 tiles in countries with planted forests, it first checks
        # whether each 1x1 tile intersects planted forests before creating a 1x1 planted forest tile for that
        # 1x1 country extent tile.
        # 55 processors seems to use about 350 GB of memory, which seems fine. But there was some error about "PQconnectdb failed-- sorry, too many clients already".
        # So, moved the number of processors down to 48.
        # For multiprocessor use
        processes = 48
        uu.print_log('Create 1x1 plantation from 1x1 gadm max processors=',
                     processes)
        pool = Pool(processes)
        pool.map(plantation_preparation.create_1x1_plantation_from_1x1_gadm,
                 gadm_list_1x1)
        pool.close()
        pool.join()

        # # Creates 1x1 degree tiles of plantation growth wherever there are plantations
        # # For single processor use
        # for tile in gadm_list_1x1:
        #
        #     plantation_preparation.create_1x1_plantation(tile)

        # Creates a shapefile in which each feature is the extent of a plantation extent tile.
        # This index shapefile can be used the next time this process is run if starting with Entry Point 3.
        os.system('''gdaltindex {0}_{1}.shp plant_gain_*.tif'''.format(
            cn.pattern_plant_1x1_index, uu.date_time_today))
        _log_command_output([
            'aws', 's3', 'cp', cn.docker_base_dir,
            cn.gadm_plant_1x1_index_dir, '--exclude', '*', '--include',
            '{}*'.format(cn.pattern_plant_1x1_index), '--recursive'
        ])

    ### Entry point 3
    # If a shapefile of the extents of 1x1 planted forest tiles is provided.
    # This is the part that actually creates the sequestration rate and forest type tiles.
    if cn.pattern_plant_1x1_index in args.planted_tile_index:

        uu.print_log(
            "Planted forest 1x1 tile index shapefile supplied. Using that to create 1x1 planted forest growth rate and forest type tiles..."
        )

        # Copies the shapefile of 1x1 tiles of extent of planted forests.
        # ('--recursive' used to be passed twice; once is enough.)
        _log_command_output([
            'aws', 's3', 'cp', '{}/'.format(planted_index_path),
            cn.docker_base_dir, '--recursive', '--exclude', '*', '--include',
            '{}*'.format(planted_index_shp)
        ])

        # Gets the attribute table of the planted forest extent 1x1 tile shapefile
        gadm = glob.glob('{}*.dbf'.format(cn.pattern_plant_1x1_index))[0]

        # Converts the attribute table to a dataframe
        dbf = Dbf5(gadm)
        df = dbf.to_dataframe()

        # Converts the column of the dataframe with the names of the tiles (which contain their coordinates) to a list
        planted_list_1x1 = df['location'].tolist()
        planted_list_1x1 = [str(y) for y in planted_list_1x1]
        uu.print_log(
            "List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:",
            planted_list_1x1)
        uu.print_log("There are", len(planted_list_1x1),
                     "1x1 planted forest extent tiles to iterate through.")

        # Creates 1x1 degree tiles of plantation growth and type wherever there are plantations.
        # Because this is iterating through only 1x1 tiles that are known to have planted forests (from a previous run
        # of this script), it does not need to check whether there are planted forests in this tile. It goes directly
        # to intersecting the planted forest table with the 1x1 tile.

        # For single processor use
        # for tile in planted_list_1x1:
        #     plantation_preparation.create_1x1_plantation_growth_from_1x1_planted(tile)

        # For multiprocessor use
        # processes=40 uses about 360 GB of memory. Works on r4.16xlarge with space to spare
        # processes=52 uses about 465 GB of memory (quite stably), so this is basically the max.
        num_of_processes = 52
        pool = Pool(num_of_processes)
        pool.map(
            plantation_preparation.create_1x1_plantation_growth_from_1x1_planted,
            planted_list_1x1)
        pool.close()
        pool.join()

        # This works with 50 processors on an r4.16xlarge marchine. Uses about 430 GB out of 480 GB.
        # (A second, 52-process pool used to be created here and immediately
        # overwritten, leaking its workers; only the pool that is used remains.)
        processes = 50
        uu.print_log('Create 1x1 plantation type max processors=', processes)
        pool = Pool(processes)
        pool.map(
            plantation_preparation.create_1x1_plantation_type_from_1x1_planted,
            planted_list_1x1)
        pool.close()
        pool.join()

        # This rasterizes the plantation removal factor standard deviations
        # processes=50 peaks at about 450 GB
        num_of_processes = 50
        pool = Pool(num_of_processes)
        pool.map(
            plantation_preparation.create_1x1_plantation_stdev_from_1x1_planted,
            planted_list_1x1)
        pool.close()
        pool.join()

    ### All script entry points meet here: creation of 10x10 degree planted forest gain rate and rtpe tiles
    ### from 1x1 degree planted forest gain rate and type tiles

    # Name of the vrt of 1x1 planted forest gain rate tiles
    plant_gain_1x1_vrt = 'plant_gain_1x1.vrt'

    # Creates a mosaic of all the 1x1 plantation gain rate tiles
    uu.print_log("Creating vrt of 1x1 plantation gain rate tiles")
    os.system('gdalbuildvrt {} plant_gain_*.tif'.format(plant_gain_1x1_vrt))

    # Creates 10x10 degree tiles of plantation gain rate by iterating over the set of pixel area tiles supplied
    # at the start of the script that are in latitudes with planted forests.
    # For multiprocessor use
    processes = 20
    uu.print_log('Create 10x10 plantation gain rate max processors=',
                 processes)
    pool = Pool(processes)
    pool.map(
        partial(plantation_preparation.create_10x10_plantation_gain,
                plant_gain_1x1_vrt=plant_gain_1x1_vrt),
        planted_lat_tile_list)
    pool.close()
    pool.join()

    # Creates 10x10 degree tiles of plantation gain rate by iterating over the set of pixel area tiles supplied
    # at the start of the script that are in latitudes with planted forests.
    # For single processor use
    # for tile in planted_lat_tile_list:
    #     plantation_preparation.create_10x10_plantation_gain(tile, plant_gain_1x1_vrt)

    # Name of the vrt of 1x1 planted forest type tiles
    plant_type_1x1_vrt = 'plant_type_1x1.vrt'

    # Creates a mosaic of all the 1x1 plantation type tiles
    uu.print_log("Creating vrt of 1x1 plantation type tiles")
    os.system('gdalbuildvrt {} plant_type_*.tif'.format(plant_type_1x1_vrt))

    # Creates 10x10 degree tiles of plantation type by iterating over the set of pixel area tiles supplied
    # at the start of the script that are in latitudes with planted forests.
    # For multiprocessor use
    num_of_processes = 26
    pool = Pool(num_of_processes)
    # Log the size of the pool that is actually used for this step.
    uu.print_log('Create 10x10 plantation type max processors=',
                 num_of_processes)
    pool.map(
        partial(plantation_preparation.create_10x10_plantation_type,
                plant_type_1x1_vrt=plant_type_1x1_vrt),
        planted_lat_tile_list)
    pool.close()
    pool.join()

    # # Creates 10x10 degree tiles of plantation type by iterating over the set of pixel area tiles supplied
    # at the start of the script that are in latitudes with planted forests.
    # # For single processor use
    # for tile in planted_lat_tile_list:
    #
    #     plantation_preparation.create_10x10_plantation_type(tile, plant_type_1x1_vrt)

    # Name of the vrt of 1x1 planted forest gain rate standard deviation tiles
    plant_stdev_1x1_vrt = 'plant_stdev_1x1.vrt'

    # Creates a mosaic of all the 1x1 plantation gain rate standard deviation tiles
    uu.print_log(
        "Creating vrt of 1x1 plantation gain rate standard deviation tiles")
    os.system('gdalbuildvrt {} plant_stdev_*.tif'.format(plant_stdev_1x1_vrt))

    # Creates 10x10 degree tiles of plantation gain rate standard deviation by iterating over the set of pixel area tiles supplied
    # at the start of the script that are in latitudes with planted forests.
    # For multiprocessor use
    num_of_processes = 26
    pool = Pool(num_of_processes)
    pool.map(
        partial(plantation_preparation.create_10x10_plantation_gain_stdev,
                plant_stdev_1x1_vrt=plant_stdev_1x1_vrt),
        planted_lat_tile_list)
    pool.close()
    pool.join()
def _read_obs(self, stns_ids=None):
    """Read GHCN-D daily observations for the requested stations.

    The per-station ``.dly`` data are parsed either in a process pool (when
    more than one process is useful) or sequentially in this process, and the
    per-station frames are concatenated.

    :param stns_ids: labels used to select rows of ``self.stns`` via ``.loc``;
        ``None`` selects every station.
    :return: DataFrame indexed by ``['station_id', 'elem', 'time']`` and
        sorted on that index.
    """
    # Saw extreme decreased performance due to garbage collection when
    # pandas ran checks for a chained assignment. Turn off this check
    # temporarily.
    opt_val = pd.get_option('mode.chained_assignment')
    pd.set_option('mode.chained_assignment', None)

    try:

        if stns_ids is None:
            stns_obs = self.stns
        else:
            stns_obs = self.stns.loc[stns_ids]

        # Never start more worker processes than there are stations.
        nstns = len(stns_obs.station_id)
        nprocs = min(self.nprocs, nstns)

        if self.has_start_end_dates:
            start_end = (self.start_date, self.end_date)
        else:
            start_end = None

        if nprocs > 1:

            # http://stackoverflow.com/questions/24171725/
            # scikit-learn-multicore-attributeerror-stdin-instance-
            # has-no-attribute-close
            if not hasattr(sys.stdin, 'close'):

                def dummy_close():
                    pass

                sys.stdin.close = dummy_close

            iter_stns = [(None, a_id, self.elems, start_end)
                         for a_id in stns_obs.station_id]

            pool = Pool(processes=nprocs)
            try:
                obs = pool.map(_parse_ghcnd_dly_star_remote, iter_stns)
            finally:
                # Release the workers even when a worker raised.
                pool.close()
                pool.join()

        else:

            obs = []

            for a_id in stns_obs.station_id:

                abuf = open_remote_file('https://www1.ncdc.noaa.gov/'
                                        'pub/data/ghcn/daily/all/%s.dly' % a_id)

                obs_stn = _parse_ghcnd_dly(abuf, a_id, self.elems, start_end)
                obs.append(obs_stn)

        df_obs = pd.concat(obs, ignore_index=True)

    finally:

        pd.set_option('mode.chained_assignment', opt_val)

    df_obs = df_obs.set_index(['station_id', 'elem', 'time'])
    # DataFrame.sortlevel was removed in pandas 1.0; sort_index(level=...) is
    # the equivalent call.
    df_obs = df_obs.sort_index(level=0, sort_remaining=True)

    return df_obs
def train(dataset, learn_rate=1e-4, prior_type='uniform', pretrained_ae_ckpt_path=None, pretrained_aae_ckpt_path=None): config = tf.ConfigProto() config.gpu_options.allow_growth = True logger.info("using prior: {}".format(prior_type)) pool_ = Pool(4) if dataset == 'MNIST': data = MNIST() data_name = "MNIST" w_init = "kaiming_uniform" encoder_dims = [500, 500, 1000, 10] discriminator_dims = [1000, 1] stack_ae = True update_interval = 100 update_aae_mu_interval = 10000 aae_finetune_iteration = 30000 initialize_iteration = 50000 finetune_iteration = 100000 finetune_epoch = 200 aae_finetune_epoch = 40 batch_size = 256 aae_ae_enhance = 1 elif dataset == "StackOverflow": data = StackOverflow() data_name = dataset encoder_dims = [500, 500, 2000, 20] discriminator_dims = [1000, 1] w_init = "glorot_uniform" stack_ae = False update_interval = 500 aae_finetune_iteration = 5000 update_aae_mu_interval = 5000 finetune_epoch = 15 aae_finetune_epoch = None batch_size = 64 aae_ae_enhance = 1 finetune_iteration = finetune_epoch * (data.train_y.shape[0] / batch_size) else: assert False, "Undefined dataset." logger.info("running on data set: {}".format(dataset)) dec_aae_model = DEC_AAE( params={ "encoder_dims": encoder_dims, "n_clusters": data.num_classes, "input_dim": data.feature_dim, "alpha": 1.0, "discriminator_dims": discriminator_dims, "learn_rate": learn_rate, "w_init": w_init }) if dataset == 'MNIST': # learning_rate = tf.train.exponential_decay(learning_rate=0.1, # global_step=tf.train.get_or_create_global_step(), # decay_steps=20000, # decay_rate=0.1, # staircase=True) # dec_aae_model.dec.ae.optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9).\ # minimize(dec_aae_model.dec.ae.loss) dec_aae_model.dec.ae.optimizer = tf.train.AdamOptimizer(0.0001). 
\ minimize(dec_aae_model.dec.ae.loss) elif dataset == "StackOverflow": dec_aae_model.dec.ae.optimizer = tf.train.AdamOptimizer(0.001, beta1=0.9, beta2=0.999, epsilon=1e-8).\ minimize(dec_aae_model.dec.ae.loss) ae_saver = tf.train.Saver(var_list=dec_aae_model.ae_vars, max_to_keep=None) aae_saver = tf.train.Saver(var_list=dec_aae_model.d_vars + dec_aae_model.ae_vars, max_to_keep=None) dec_saver = tf.train.Saver(var_list=dec_aae_model.dec_vars, max_to_keep=None) saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=None) # phase 1: ae parameter initialization log_interval = 500 if pretrained_ae_ckpt_path is None: logger.info("pre training auto encoder") sae = StackedAutoEncoder(encoder_dims=encoder_dims, input_dim=data.feature_dim) ae_ckpt_path = os.path.join('ae_ckpt', 'model{}.ckpt'.format(data_name)) with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) if stack_ae: # initialize sae next_ = data.gen_next_batch(batch_size=batch_size, is_train_set=True, iteration=initialize_iteration) cur_ae_data = data.train_x for i, sub_ae in enumerate(sae.layerwise_autoencoders): # train sub_ae for iter_, (batch_x, _, _) in enumerate(next_): _, loss = sess.run([sub_ae.optimizer, sub_ae.loss], feed_dict={ sub_ae.input_: batch_x, sub_ae.keep_prob: 0.8 }) if iter_ % log_interval == 0: logger.info("[SAE-{}] iter: {}\tloss: {}".format( i, iter_, loss)) # assign pretrained sub_ae's weight encoder_w_assign_op, encoder_b_assign_op = dec_aae_model.dec.ae.layers[ i].get_assign_ops(sub_ae.layers[0]) decoder_w_assign_op, decoder_b_assign_op = dec_aae_model.dec.ae.layers[ (i + 1) * -1].get_assign_ops(sub_ae.layers[1]) _ = sess.run([ encoder_w_assign_op, encoder_b_assign_op, decoder_w_assign_op, decoder_b_assign_op ]) # get next sub_ae's input cur_ae_data = sess.run(sub_ae.encoder, feed_dict={ sub_ae.input_: cur_ae_data, sub_ae.keep_prob: 1.0 }) embedding = Dataset(train_x=cur_ae_data, train_y=cur_ae_data) next_ = embedding.gen_next_batch( 
batch_size=batch_size, is_train_set=True, iteration=initialize_iteration) # finetune AE for iter_, (batch_x, _, _) in enumerate( data.gen_next_batch( batch_size=batch_size, is_train_set=True, # iteration=finetune_iteration, epoch=finetune_epoch)): _, loss = sess.run( [ dec_aae_model.dec.ae.optimizer, dec_aae_model.dec.ae.loss ], feed_dict={ dec_aae_model.dec.ae.input_: batch_x, dec_aae_model.dec.ae.keep_prob: 1.0 }) if iter_ % log_interval == 0: logger.info("[AE-finetune] iter: {}\tloss: {}".format( iter_, loss)) if iter_ % (10 * log_interval) == 0: xmlr_x = data.train_x[:10000, :] xmlr_id = data.train_y[:10000] z = sess.run(dec_aae_model.z, feed_dict={ dec_aae_model.input_: xmlr_x, dec_aae_model.keep_prob: 1.0 }) pool_.apply_async( pu.save_scattered_image, (z, xmlr_id, "./results/z_ae_map_{}.jpg".format(iter_))) # pu.save_scattered_image(z, xmlr_id, "./results/z_ae_map_{}.jpg".format(iter_)) ae_saver.save(sess, ae_ckpt_path) pool_.close() # 关闭进程池,表示不能在往进程池中添加进程 pool_.join() # 等待进程池中的所有进程执行完毕,必须在close()之后调用 exit() else: ae_ckpt_path = pretrained_ae_ckpt_path # exit() # phase 2: aae parameter initialization if pretrained_aae_ckpt_path is None: logger.info("pre training adversarial auto encoder") aae_ckpt_path = os.path.join('aae_ckpt', 'model{}.ckpt'.format(data_name)) # aae_ckpt_path = os.path.join('aae_ckpt', 'model.ckpt-100000') with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) ae_saver.restore(sess, ae_ckpt_path) # aae_saver.restore(sess, aae_ckpt_path) z = sess.run(dec_aae_model.z, feed_dict={ dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0 }) assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op(z) _ = sess.run(assign_mu_op) mu = sess.run(dec_aae_model.dec.mu) total_y = data.train_y total_pred = sess.run(dec_aae_model.dec.pred, feed_dict={ dec_aae_model.input_: data.train_x, dec_aae_model.batch_size: data.train_x.shape[0], dec_aae_model.keep_prob: 1.0 }) logger.info("[Total DEC] epoch: {}\tacc: 
{}".format( -1, dec_aae_model.dec.cluster_acc(total_y, total_pred))) for iter_, (batch_x, batch_y, batch_idxs) in enumerate( data.gen_next_batch( batch_size=batch_size, is_train_set=True, # iteration=aae_finetune_iteration, epoch=aae_finetune_epoch, )): # if iter_ % update_aae_mu_interval == 0 and iter_ != 0: # z = sess.run(dec_aae_model.z, # feed_dict={dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0}) # assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op(z) # _ = sess.run(assign_mu_op) # mu = sess.run(dec_aae_model.dec.mu) z_sample, z_id_one_hot, z_id_ = \ prior.get_sample(prior_type, batch_size, dec_aae_model.z_dim, n_labels=data.num_classes, mu=mu) train_dec_feed = { dec_aae_model.input_: batch_x, dec_aae_model.batch_size: batch_x.shape[0], dec_aae_model.keep_prob: 1, dec_aae_model.z_sample: z_sample, } # if iter_ < 100: # # discriminator loss # _, d_loss = sess.run( # (dec_aae_model.train_op_d, dec_aae_model.D_loss), feed_dict=train_dec_feed) # logger.info("[ADVER] epoch %d: d_loss %03.2f" % ( # iter_, d_loss)) # continue for _ in range(aae_ae_enhance): # reconstruction loss _, ae_loss = sess.run( (dec_aae_model.train_op_ae, dec_aae_model.ae_loss), feed_dict=train_dec_feed) # # discriminator loss _, d_loss = sess.run( (dec_aae_model.train_op_d, dec_aae_model.D_loss), feed_dict=train_dec_feed) # # generator loss _, g_loss = sess.run( (dec_aae_model.train_op_g, dec_aae_model.G_loss), feed_dict=train_dec_feed) # tot_loss = ae_loss + d_loss + g_loss # if iter_ % 500 == 0: # logger.info cost every epoch logger.info( "[ADVER] epoch %d: L_tot %03.4f L_likelihood %03.4f d_loss %03.2f g_loss %03.4f" % (iter_, tot_loss, ae_loss, d_loss, g_loss)) if iter_ % 2500 == 0: # logger.info cost every epoch xmlr_x = data.train_x[:10000, :] xmlr_id = data.train_y[:10000] z = sess.run(dec_aae_model.z, feed_dict={ dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0 }) # pu.save_scattered_image(z, xmlr_id, 
"./results/z_map_{}.jpg".format(iter_)) # pred_y = sess.run(dec_aae_model.dec.pred, # feed_dict={dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0, # dec_aae_model.batch_size: data.train_x.shape[0] # }) # logger.info("[Total DEC] iteration: {}\targ_acc: {}". # format(iter_, dec_aae_model.dec.cluster_acc(data.train_y, pred_y))) # # kmeans = KMeans(n_clusters=data.num_classes, n_init=20) # pred_y = kmeans.fit_predict(z) # logger.info("[Total DEC] iteration: {}\tkmeans_acc: {}". # format(iter_, dec_aae_model.dec.cluster_acc(data.train_y, pred_y))) z = z[:10000] pool_.apply_async( pu.save_scattered_image, (z, xmlr_id, "./results/z_aae_map_{}.jpg".format(iter_))) aae_saver.save(sess, aae_ckpt_path) pool_.close() # 关闭进程池,表示不能在往进程池中添加进程 pool_.join() # 等待进程池中的所有进程执行完毕,必须在close()之后调用 exit() else: aae_ckpt_path = pretrained_aae_ckpt_path # phase 3: parameter optimization dec_ckpt_path = os.path.join('dec_ckpt', 'model{}.ckpt'.format(data_name)) t_ckpt_path = os.path.join('adver_ckpt', 'model{}.ckpt'.format(data_name)) with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) retrain = False dec_mode = True idec_mode = False adec_mode = False best_score = 0. 
if dec_mode or idec_mode: if retrain: logger.info("retraining the dec") saver.restore(sess, t_ckpt_path) bais = 100 else: logger.info("training the dec") ae_saver.restore(sess, ae_ckpt_path) bais = 0 # initialize mu z = sess.run(dec_aae_model.z, feed_dict={ dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0 }) assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op( z) _ = sess.run(assign_mu_op) # xmlr_x = data.train_x[:10000, :] # xmlr_id = data.train_y[:10000] # z, xmlr_pred_id = sess.run([dec_aae_model.z, dec_aae_model.dec.pred], # feed_dict={dec_aae_model.input_: xmlr_x, dec_aae_model.keep_prob: 1.0, # dec_aae_model.batch_size: xmlr_x.shape[0]}) # pool_.apply_async(pu.save_scattered_image, # (z, xmlr_id, "./results/z_init_map_{}.jpg".format(0 + bais), xmlr_pred_id)) # pool_.close() # 关闭进程池,表示不能在往进程池中添加进程 # pool_.join() # 等待进程池中的所有进程执行完毕,必须在close()之后调用 # exit() total_y = data.train_y total_pred = sess.run(dec_aae_model.dec.pred, feed_dict={ dec_aae_model.input_: data.train_x, dec_aae_model.batch_size: data.train_x.shape[0], dec_aae_model.keep_prob: 1.0 }) logger.info("[Total DEC] epoch: {}\tacc: {}".format( -1, dec_aae_model.dec.cluster_acc(total_y, total_pred))) # print("sstart") # total_y = total_y[:10000] # z = z[:10000] # from sklearn.manifold import TSNE # z = TSNE(n_components=2, learning_rate=100).fit_transform(z) # kmeans = KMeans(n_clusters=data.num_classes, n_init=20) # pred_y = kmeans.fit_predict(z) # print(pu.cluster_acc(total_y, pred_y)) # exit() else: if retrain: logger.info("retraining the adec") bais = 100 saver.restore(sess, t_ckpt_path) else: logger.info("training the adec") aae_saver.restore(sess, aae_ckpt_path) # ae_saver.restore(sess, ae_ckpt_path) bais = 0 # initialize mu z = sess.run(dec_aae_model.z, feed_dict={ dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0 }) assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op( z) _ = sess.run(assign_mu_op) total_y = data.train_y total_pred = 
sess.run(dec_aae_model.dec.pred, feed_dict={ dec_aae_model.input_: data.train_x, dec_aae_model.batch_size: data.train_x.shape[0], dec_aae_model.keep_prob: 1.0 }) logger.info("[Total ADEC] epoch: {}\tacc: {}".format( -1, dec_aae_model.dec.cluster_acc(total_y, total_pred))) pool_.apply_async(pu.save_scattered_image, (z[:10000, ], total_y[:10000], "./results/z_adec_map_{}.jpg".format(-1), total_pred[:10000])) mu = sess.run(dec_aae_model.dec.mu) p = None for cur_epoch in range(100): for iter_, (batch_x, batch_y, batch_idxs) in enumerate( data.gen_next_batch( batch_size=batch_size, is_train_set=True, epoch=1, # iteration=50000 )): if cur_epoch % 10 == 0 and iter_ == 0: q = sess.run(dec_aae_model.dec.q, feed_dict={ dec_aae_model.input_: data.train_x, dec_aae_model.batch_size: data.train_x.shape[0], dec_aae_model.keep_prob: 1.0 }) p = dec_aae_model.dec.target_distribution(q) # if (iter_+1) % 10000 == 0: # z = sess.run(dec_aae_model.z, # feed_dict={dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0}) # assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op(z) # _ = sess.run(assign_mu_op) # mu = sess.run(dec_aae_model.dec.mu) batch_p = p[batch_idxs] train_dec_feed = { dec_aae_model.input_: batch_x, dec_aae_model.batch_size: batch_x.shape[0], dec_aae_model.dec.p: batch_p, dec_aae_model.keep_prob: 1., } # ==========================adversial part ============================ z_sample, z_id_one_hot, z_id_ = \ prior.get_sample(prior_type, batch_size, dec_aae_model.z_dim, n_labels=data.num_classes, mu=mu) train_dec_feed.update({ dec_aae_model.z_sample: z_sample, }) # ==========================adversial part ============================ if dec_mode: # logger.info("DEC mode") _, loss, pred = sess.run([ dec_aae_model.train_op_dec, dec_aae_model.dec_loss, dec_aae_model.dec.pred ], feed_dict=train_dec_feed) elif idec_mode: # logger.info("IDEC mode") _, loss, pred = sess.run([ dec_aae_model.train_op_idec, dec_aae_model.idec_loss, dec_aae_model.dec.pred ], 
feed_dict=train_dec_feed) elif adec_mode: # logger.info("ADEC mode") _, loss, pred = sess.run([ dec_aae_model.train_op_adec, dec_aae_model.adec_loss, dec_aae_model.dec.pred ], feed_dict=train_dec_feed) ae_loss, g_loss, d_loss = \ sess.run([dec_aae_model.ae_loss, dec_aae_model.G_loss, dec_aae_model.D_loss], feed_dict=train_dec_feed) tot_loss = ae_loss + g_loss + d_loss else: raise ValueError("没有这个模式!") # if iter_ % 100 == 0: # logger.info cost every epoch # logger.info("[ADVER] epoch %d: L_tot %03.2f L_likelihood %03.2f d_loss %03.2f g_loss %03.2f" % ( # cur_epoch, tot_loss, ae_loss, d_loss, g_loss)) # ==========================adversial part ============================ # logger.info("[DEC] epoch: {}\tloss: {}\tacc: {}".format(cur_epoch+bais, loss, # dec_aae_model.dec.cluster_acc(batch_y, pred))) if iter_ % 2500 == 0: total_y = data.train_y total_pred = sess.run(dec_aae_model.dec.pred, feed_dict={ dec_aae_model.input_: data.train_x, dec_aae_model.batch_size: data.train_x.shape[0], dec_aae_model.keep_prob: 1.0 }) now_score = pu.cluster_acc(total_y, total_pred) now_nmi = pu.cluster_nmi(total_y, total_pred) if adec_mode: logger.info( "[ADVER] epoch %d: L_tot %03.4f L_likelihood %03.4f d_loss %03.2f g_loss %03.4f" % (cur_epoch, tot_loss, ae_loss, d_loss, g_loss)) logger.info( "[Total DEC] iteration: {}\tloss: {}\tacc: {}\tnmi: {}" .format(iter_, loss, now_score, now_nmi)) if now_score > best_score: best_score = now_score saver.save(sess, t_ckpt_path) if iter_ % 5000 == 0: xmlr_x = data.train_x[:10000, :] xmlr_id = data.train_y[:10000] z, xmlr_pred_id = sess.run( [dec_aae_model.z, dec_aae_model.dec.pred], feed_dict={ dec_aae_model.input_: xmlr_x, dec_aae_model.keep_prob: 1.0, dec_aae_model.batch_size: xmlr_x.shape[0] }) pool_.apply_async( pu.save_scattered_image, (z, xmlr_id, "./results/z_adec_map_{}.jpg".format(iter_), xmlr_pred_id)) total_y = data.train_y total_pred = sess.run(dec_aae_model.dec.pred, feed_dict={ dec_aae_model.input_: data.train_x, 
dec_aae_model.batch_size: data.train_x.shape[0], dec_aae_model.keep_prob: 1.0 }) logger.info("[Total DEC] epoch: {}\tloss: {}\tacc: {}".format( cur_epoch + bais, loss, dec_aae_model.dec.cluster_acc(total_y, total_pred))) # dec_saver.save(sess, dec_ckpt_path) pool_.close() # 关闭进程池,表示不能在往进程池中添加进程 pool_.join() # 等待进程池中的所有进程执行完毕,必须在close()之后调用
def peak__partition(v,
                    s1,
                    s2,
                    find_maxima=True,
                    partition_op=None,
                    multiprocessing_process_num=0):
    """
    Partition the volume, then detect peaks in each partition.

    Note that this will result in redundant peaks!! Clean up must be done
    afterwards!!

    :param v: volume; it is sliced along three axes and its ``shape`` is
        used to generate the partitions
    :param s1: forwarded unchanged to ``peak__partition__single_job``
    :param s2: forwarded unchanged to ``peak__partition__single_job``
    :param find_maxima: forwarded to the single job; also selects the final
        ordering (descending ``val`` when True, ascending otherwise)
    :param partition_op: dict with ``nonoverlap_width`` and ``overlap_width``
        and optionally ``save_vg``; when None a single partition covering the
        whole volume is generated
    :param multiprocessing_process_num: when > 0, jobs are run in a process
        pool of at most this many workers (capped at the CPU count);
        otherwise they are run serially in this process
    :return: list of peak records collected from every partition, sorted by
        their ``val`` entry
    """
    import aitom.image.vol.partition as IVP

    if partition_op is None:
        # in this case, just generate a single partition
        siz_max = max(v.shape)
        partition_op = {
            'nonoverlap_width': siz_max * 2,
            'overlap_width': siz_max * 2
        }

    b = IVP.gen_bases(v.shape,
                      nonoverlap_width=partition_op['nonoverlap_width'],
                      overlap_width=partition_op['overlap_width'])
    print('partition num', b.shape)

    save_vg = partition_op.get('save_vg', False)

    def iter_jobs():
        # one keyword-argument dict per partition, in (i0, i1, i2) order
        for i0 in range(b.shape[0]):
            for i1 in range(b.shape[1]):
                for i2 in range(b.shape[2]):
                    bp = N.squeeze(b[i0, i1, i2, :, :])
                    yield {
                        'v': v[bp[0, 0]:bp[0, 1],
                               bp[1, 0]:bp[1, 1],
                               bp[2, 0]:bp[2, 1]],
                        's1': s1,
                        's2': s2,
                        'base': bp,
                        'find_maxima': find_maxima,
                        'partition_id': (i0, i1, i2),
                        'save_vg': save_vg
                    }

    ps = []

    def collect(ppsj):
        # merge one job's peaks and report which partition just finished
        ps.extend(ppsj['ps'])
        print('\r', ppsj['partition_id'], '           ')
        sys.stdout.flush()

    if multiprocessing_process_num > 0:
        pool = Pool(processes=min(multiprocessing_process_num,
                                  multiprocessing.cpu_count()))
        finished = False
        try:
            pool_re = [
                pool.apply_async(func=peak__partition__single_job, kwds=job)
                for job in iter_jobs()
            ]
            for pool_re_t in pool_re:
                # the (very long) timeout is kept from the original code
                collect(pool_re_t.get(9999999))
            finished = True
        finally:
            # never leave worker processes behind: shut down cleanly on
            # success, kill outstanding work if a job failed
            if finished:
                pool.close()
            else:
                pool.terminate()
            pool.join()
    else:
        for job in iter_jobs():
            collect(peak__partition__single_job(**job))

    # order peaks in ps according to values
    if find_maxima:
        ps = sorted(ps, key=lambda peak: (-peak['val']))
    else:
        ps = sorted(ps, key=lambda peak: peak['val'])

    return ps
import time
import os
from multiprocessing.pool import Pool


def action1(a, b=50):
    """Print ``a``, this process's pid and a counter ``b`` times.

    A 0.1 second pause follows every printed line.
    """
    for step in range(b):
        # os.getpid(): the pid is, simply put, the "ID card" of each process
        print(a, os.getpid(), ' ', step)
        time.sleep(0.1)


if __name__ == '__main__':  # this guard is required, otherwise errors may occur
    # create a process pool with a capacity of 3 processes
    ci = Pool(3)

    # Submitting work to a Pool looks quite different from starting a plain
    # process; this is the most basic Pool pattern.
    job_arguments = (
        ('进程一', ),
        ('进程二', 50),
        ('进程三', 60),
    )
    for job_args in job_arguments:
        ci.apply_async(action1, args=job_args)

    # Note: four processes are now running -- the three child processes above
    # plus the most important one, the main process.
    ci.close()  # close the pool (children already started keep running)
    ci.join()  # wait until every child process in the pool has finished

    print('比如说这最后的一行输出就是主进程执行任务打印出来的')
def _execute_sub_tasks(task_id, params, sig_content, verbosity, runmode,
                       sigmode, monitor_interval, resource_monitor_interval):
    '''If this is a master task, execute as individual tasks

    Every entry of ``params.task_stack`` is run through ``_execute_task``,
    either in a process pool (``params.num_workers > 1``) or one after
    another. The ``.out``/``.err`` files of each subtask are merged into the
    master task's ``~/.sos/tasks/<task_id>.out`` / ``.err`` and removed.

    Returns a dict with keys ``ret_code`` (sum of subtask return codes, with
    a failed subtask counting as 1), ``output``, ``subtasks`` (per-subtask
    results keyed by subtask id), ``shared``, ``signature``, optionally
    ``exception`` (of the last failed subtask), and ``skipped`` -- the
    latter is only kept when no subtask failed and either none or all of
    them were skipped.
    '''
    m = ProcessMonitor(
        task_id,
        monitor_interval=monitor_interval,
        resource_monitor_interval=resource_monitor_interval,
        max_walltime=params.sos_dict['_runtime'].get('max_walltime', None),
        max_mem=params.sos_dict['_runtime'].get('max_mem', None),
        max_procs=params.sos_dict['_runtime'].get('max_procs', None),
        sos_dict=params.sos_dict)
    m.start()

    env.logger.info(f'{task_id} ``started``')

    task_dir = os.path.join(os.path.expanduser('~'), '.sos', 'tasks')
    master_out = os.path.join(task_dir, task_id + '.out')
    master_err = os.path.join(task_dir, task_id + '.err')

    # if this is a master task, calling each sub task
    with open(master_out, 'wb') as out, open(master_err, 'wb') as err:

        def absorb(path, sink):
            # append the content of a subtask file to sink, then delete it
            if os.path.isfile(path):
                with open(path, 'rb') as src:
                    sink.write(src.read())
                try:
                    os.remove(path)
                except Exception as e:
                    env.logger.warning(f'Failed to remove {path}: {e}')

        def copy_out_and_err(result):
            tid = result['task']
            status = f'{tid}: {"completed" if result["ret_code"] == 0 else "failed"}\n'.encode()

            out.write(status)
            if 'output' in result:
                out.write(f'output: {result["output"]}\n'.encode())
            absorb(os.path.join(task_dir, tid + '.out'), out)

            if 'exception' in result:
                err.write(str(result['exception']).encode())
            err.write(status)
            absorb(os.path.join(task_dir, tid + '.err'), err)

            # remove other files as well
            try:
                remove_task_files(tid, ['.out', '.err'])
            except Exception as e:
                env.logger.debug(f'Failed to remove files {tid}: {e}')

        def guarded_copy_out_and_err(result):
            # Pool callbacks run in the pool's result-handler thread; an
            # exception escaping from here would kill that thread and make
            # the pending get() calls wait forever, so only warn (this
            # mirrors what the sequential branch does).
            try:
                copy_out_and_err(result)
            except Exception as e:
                env.logger.warning(
                    f'Failed to copy result of subtask {result.get("task", "")}: {e}'
                )

        if params.num_workers > 1:
            from multiprocessing.pool import Pool
            p = Pool(params.num_workers)
            finished = False
            try:
                pending = [
                    p.apply_async(_execute_task, ((*t, {
                        t[0]: sig_content.get(t[0], {})
                    }), verbosity, runmode, sigmode, None, None),
                                  callback=guarded_copy_out_and_err)
                    for t in params.task_stack
                ]
                # we wait for all results to be ready
                results = [r.get() for r in pending]
                finished = True
            finally:
                # do not leave worker processes behind if a subtask raised
                if finished:
                    p.close()
                else:
                    p.terminate()
                p.join()
        else:
            results = []
            for tid, tdef in params.task_stack:
                # no monitor process for subtasks
                res = _execute_task((tid, tdef, {
                    tid: sig_content.get(tid, {})
                }),
                                    verbosity=verbosity,
                                    runmode=runmode,
                                    sigmode=sigmode,
                                    monitor_interval=None,
                                    resource_monitor_interval=None)
                try:
                    copy_out_and_err(res)
                except Exception as e:
                    env.logger.warning(
                        f'Failed to copy result of subtask {tid}: {e}')
                results.append(res)

    #
    # now we collect result
    all_res = {
        'ret_code': 0,
        'output': None,
        'subtasks': {},
        'shared': {},
        'skipped': 0,
        'signature': {}
    }
    for tid, x in zip(params.task_stack, results):
        all_res['subtasks'][tid[0]] = x

        if 'exception' in x:
            # a failed subtask counts as one failure and contributes nothing
            # else to the aggregated result
            all_res['exception'] = x['exception']
            all_res['ret_code'] += 1
            continue
        all_res['ret_code'] += x['ret_code']
        if all_res['output'] is None:
            all_res['output'] = x['output']
        else:
            try:
                all_res['output'].extend(x['output'], keep_groups=True)
            except Exception:
                env.logger.warning(
                    f"Failed to extend output {all_res['output']} with {x['output']}"
                )
        all_res['shared'].update(x['shared'])
        # does not care if one or all subtasks are executed or skipped.
        all_res['skipped'] += x.get('skipped', 0)
        if 'signature' in x:
            all_res['signature'].update(x['signature'])

    if all_res['ret_code'] != 0:
        if all_res['ret_code'] == len(results):
            env.logger.info(
                f'All {len(results)} tasks in {task_id} ``failed``')
        else:
            env.logger.info(
                f'{all_res["ret_code"]} of {len(results)} tasks in {task_id} ``failed``'
            )
        # if some failed, some skipped, not skipped
        all_res.pop('skipped', None)
    elif all_res['skipped']:
        if all_res['skipped'] == len(results):
            env.logger.info(
                f'All {len(results)} tasks in {task_id} ``ignored`` or skipped'
            )
        else:
            # if only partial skip, we still save signature and result etc
            env.logger.info(
                f'{all_res["skipped"]} of {len(results)} tasks in {task_id} ``ignored`` or skipped'
            )
            all_res.pop('skipped')
    else:
        env.logger.info(f'All {len(results)} tasks in {task_id} ``completed``')
    return all_res
# Parameters
process_num = 24
image_size = (512, 512)
url = 'http://v18.proteinatlas.org/images/'
csv_path = "../input/HPAv18RBGY_wodpl.csv"
save_dir = "./external_data"

# Make sure the directory the images are saved to exists; an "already
# exists" error is fine, any other OSError is re-raised.
try:
    os.makedirs(save_dir)
except OSError as exc:
    already_exists = exc.errno == errno.EEXIST
    if not already_exists:
        raise

print('Parent process %s.' % os.getpid())
img_list = pd.read_csv(csv_path)['Id']
list_len = len(img_list)

# Split the id list into process_num contiguous slices: slice i covers
# [bounds[i], bounds[i + 1]).
bounds = [int(k * list_len / process_num) for k in range(process_num + 1)]

p = Pool(process_num)
for i, (start, end) in enumerate(zip(bounds, bounds[1:])):
    process_images = img_list[start:end]
    p.apply_async(
        download,
        args=(str(i), process_images, url, save_dir, image_size)
    )
print('Waiting for all subprocesses done...')
p.close()  # no new tasks can be submitted once close() has been called
p.join()  # block until every worker process has finished
print('All subprocesses done.')
def pos_type_classify(bamfile, chrom, start, end, is_single, read_length, temp_dir, extension=None, center=True, maxsize=None, process=20, minmapq=0, is_multmapfilter=False): print bamfile, chrom, start, end, is_single, read_length, temp_dir, extension, center if is_single: total_reads_type6_left = [ ] # 6. in left place of del and second read is on the breakpoint total_reads_type6_right = [ ] # 6. in right place of del and first read is on the breakpoint total_reads_type7 = [] # 7. reads within the del # temp_prefix = "%s/classify_%s" % (temp_dir, sub_num) if extension: rel_start = start - extension rel_end = end + extension else: rel_start = start rel_end = end if center: reads_type6_left, reads_type6_right, reads_type7, filtered_reads_num = posType_sub_single( bamfile, chrom, rel_start, rel_end, start, end, minmapq, is_multmapfilter) else: rel_start_left = rel_start rel_end_left = start + maxsize rel_start_right = end - maxsize rel_end_right = rel_end reads_type6_left_1, reads_type6_right_1, reads_type7_1, filtered_reads_num_1 = posType_sub_single( bamfile, chrom, rel_start_left, rel_end_left, start, end, minmapq, is_multmapfilter) reads_type6_left_2, reads_type6_right_2, reads_type7_2, filtered_reads_num_2 = posType_sub_single( bamfile, chrom, rel_start_right, rel_end_right, start, end, minmapq, is_multmapfilter) reads_type6_left = reads_type6_left_1 + reads_type6_right_1 reads_type6_right = reads_type6_right_1 + reads_type6_right_2 reads_type7 = reads_type7_1 + reads_type7_2 filtered_reads_num = filtered_reads_num_1 + filtered_reads_num_2 total_reads_type6_left.extend(reads_type6_left) total_reads_type6_right.extend(reads_type6_right) total_reads_type7.extend(reads_type7) total_filtered_reads = filtered_reads_num print total_reads_type6_left, total_reads_type6_right, total_reads_type7, total_filtered_reads return total_reads_type6_left, total_reads_type6_right, total_reads_type7, total_filtered_reads else: total_reads_type1_left = [ ] # 1. 
in left place of del and second read is on the breakpoint total_reads_type1_right = [ ] # 1. in right place of del and first read is on the breakpoint total_reads_type2_left = [ ] # 2. in left place of del and first read is on the breakpoint total_reads_type2_right = [ ] # 2. in right place of del and second read is on the breakpoint total_reads_type3_left = [ ] # 3. in left place of del and first read and right read is crossover breakpoint with no intersection total_reads_type3_right = [ ] # 3. in right place of del and first read and right read is crossover breakpoint with no intersection total_reads_type4 = [] # 4. reads within the del total_reads_type5_left = [ ] # 5. in left place of del and first read and right read are all has intersection total_reads_type5_right = [ ] # 3. in right place of del and first read and right read are all has intersection total_filtered_reads = 0 length = end - start + 1 sub_num = length / read_length # when start = end, translocation of chromosome if start == end: rel_start = start - maxsize rel_end = end + maxsize print rel_start, rel_end # temp_prefix = "%s/classify_%s" % (temp_dir, "whole") (reads_type1_left, reads_type1_right, reads_type2_left, reads_type2_right, reads_type3_left, reads_type3_right, reads_type4, reads_type5_left, reads_type5_right, filtered_reads_num) \ = posType_sub_paired(bamfile, chrom, rel_start, rel_end, start, end, read_length, minmapq, is_multmapfilter, extension=extension) total_reads_type1_left.extend(reads_type1_left) total_reads_type1_right.extend(reads_type1_right) total_reads_type2_left.extend(reads_type2_left) total_reads_type2_right.extend(reads_type2_right) total_reads_type3_left.extend(reads_type3_left) total_reads_type3_right.extend(reads_type3_right) total_reads_type4.extend(reads_type4) total_reads_type5_left.extend(reads_type5_left) total_reads_type5_right.extend(reads_type5_right) total_filtered_reads = filtered_reads_num # end - start < read_length and there is no need to extend its 
scope elif sub_num == 0 and not extension: # temp_prefix = "%s/classify_%s" % (temp_dir, sub_num) (reads_type1_left, reads_type1_right, reads_type2_left, reads_type2_right, reads_type3_left, reads_type3_right, reads_type4, reads_type5_left, reads_type5_right, filtered_reads_num) \ = posType_sub_paired(bamfile, chrom, start, end, start, end, read_length, minmapq, is_multmapfilter, extension=extension) total_reads_type1_left.extend(reads_type1_left) total_reads_type1_right.extend(reads_type1_right) total_reads_type2_left.extend(reads_type2_left) total_reads_type2_right.extend(reads_type2_right) total_reads_type3_left.extend(reads_type3_left) total_reads_type3_right.extend(reads_type3_right) total_reads_type4.extend(reads_type4) total_reads_type5_left.extend(reads_type5_left) total_reads_type5_right.extend(reads_type5_right) total_filtered_reads = filtered_reads_num # there should be more than one process to calculate. else: run_pool = Pool(process) result_list = [] # extension the range to cover whole reads if extension: rel_start = start - extension rel_end = end + extension length = rel_end - rel_start + 1 sub_num = length / read_length else: rel_start = start rel_end = end # if center should be consider or center is no need to consider, but the center size is too less if center or (not center and maxsize is not None and length < maxsize * 2): for i in range(sub_num): sub_start = i * read_length + rel_start if i == sub_num - 1: sub_end = rel_end else: sub_end = sub_start + 1 print "Sub Process: %s" % i, sub_start, sub_end result_list.append( run_pool.apply_async( posType_sub_paired, args=(bamfile, chrom, sub_start, sub_end, start, end, read_length, minmapq, is_multmapfilter, extension))) run_pool.close() run_pool.join() # if center is no need to consider else: rel_start_left = rel_start rel_end_left = start + maxsize rel_start_right = end - maxsize rel_end_right = rel_end # print rel_start_left, rel_end_left, rel_start_right, rel_end_right length = rel_end_left - 
rel_start_left + 1 sub_num = length / read_length for i in range(sub_num): sub_start = i * read_length + rel_start_left if i == sub_num - 1: sub_end = rel_end_left else: sub_end = sub_start + 1 print "Sub Process: %s" % i, sub_start, sub_end # temp_prefix = "%s/classify_%s" % (temp_dir, i) result_list.append( run_pool.apply_async( posType_sub_paired, args=(bamfile, chrom, sub_start, sub_end, start, end, read_length, minmapq, is_multmapfilter, extension))) length = rel_end_right - rel_start_right + 1 sub_num = length / read_length for i in range(sub_num): sub_start = i * read_length + rel_start_right if i == sub_num - 1: sub_end = rel_end_right else: sub_end = sub_start + 1 print "Sub Process: %s" % i, sub_start, sub_end # temp_prefix = "%s/classify_%s" % (temp_dir, i) result_list.append( run_pool.apply_async( posType_sub_paired, args=(bamfile, chrom, sub_start, sub_end, start, end, read_length, minmapq, is_multmapfilter, extension))) run_pool.close() run_pool.join() for res in result_list: reads_type1_left, reads_type1_right, reads_type2_left, reads_type2_right, reads_type3_left, reads_type3_right, reads_type4, reads_type5_left, reads_type5_right, filtered_reads_num = res.get( ) total_reads_type1_left.extend(reads_type1_left) total_reads_type1_right.extend(reads_type1_right) total_reads_type2_left.extend(reads_type2_left) total_reads_type2_right.extend(reads_type2_right) total_reads_type3_left.extend(reads_type3_left) total_reads_type3_right.extend(reads_type3_right) total_reads_type4.extend(reads_type4) total_reads_type5_left.extend(reads_type5_left) total_reads_type5_right.extend(reads_type5_right) total_filtered_reads += filtered_reads_num print "type1_left: %s; type1_right: %s, type2_left: %s; type2_right: %s, type3_left: %s; " \ "type3_right: %s, type4: %s; type5_left: %s; type5_right: %s" % ( len(total_reads_type1_left), len(total_reads_type1_right), len(total_reads_type2_left), len(total_reads_type2_right), len(total_reads_type3_left), 
len(total_reads_type3_right), len(total_reads_type4), len(total_reads_type5_left), len(total_reads_type5_right)) print "total_filtered_reads: %s" % total_filtered_reads return total_reads_type1_left, total_reads_type1_right, total_reads_type2_left, total_reads_type2_right, total_reads_type3_left, total_reads_type3_right, total_reads_type4, total_reads_type5_left, total_reads_type5_right, total_filtered_reads
def aggregate_scores(test_ref_pairs,
                     evaluator=NiftiEvaluator,
                     labels=None,
                     nanmean=True,
                     json_output_file=None,
                     json_name="",
                     json_description="",
                     json_author="Fabian",
                     json_task="",
                     num_threads=2,
                     **metric_kwargs):
    """
    Evaluate every (test, reference) pair and aggregate the scores.

    test = predicted image

    :param test_ref_pairs: sequence of (test, reference) pairs
    :param evaluator: evaluator instance, or an evaluator class which is
        instantiated without arguments
    :param labels: must be a dict of int-> str or a list of int
    :param nanmean: use np.nanmean instead of np.mean for the mean scores
    :param json_output_file: if not None, the results are also written to
        this file via save_json
    :param json_name: stored as "name" in the json output
    :param json_description: stored as "description" in the json output
    :param json_author: stored as "author" in the json output
    :param json_task: stored as "task" in the json output
    :param num_threads: number of worker processes of the pool that runs
        run_evaluation
    :param metric_kwargs: handed to run_evaluation together with each pair
    :return: OrderedDict with "all" (list of the per-pair results) and
        "mean" (label -> score name -> mean over all pairs)
    """

    # isinstance also recognises classes with a custom metaclass (e.g.
    # ABCMeta), which ``type(evaluator) == type`` does not
    if isinstance(evaluator, type):
        evaluator = evaluator()

    if labels is not None:
        evaluator.set_labels(labels)

    all_scores = OrderedDict()
    all_scores["all"] = []
    all_scores["mean"] = OrderedDict()

    test = [i[0] for i in test_ref_pairs]
    ref = [i[1] for i in test_ref_pairs]
    p = Pool(num_threads)
    try:
        all_res = p.map(
            run_evaluation,
            zip(test, ref, [evaluator] * len(ref),
                [metric_kwargs] * len(ref)))
    finally:
        # release the workers even when an evaluation raised
        p.close()
        p.join()

    mean_scores = all_scores["mean"]
    for res in all_res:
        all_scores["all"].append(res)

        # append score list for mean
        for label, score_dict in res.items():
            if label in ("test", "reference"):
                continue
            if label not in mean_scores:
                mean_scores[label] = OrderedDict()
            for score, value in score_dict.items():
                mean_scores[label].setdefault(score, []).append(value)

    # collapse every collected score list into its mean
    average = np.nanmean if nanmean else np.mean
    for label in mean_scores:
        for score in mean_scores[label]:
            mean_scores[label][score] = float(
                average(mean_scores[label][score]))

    # save to file if desired
    # we create a hopefully unique id by hashing the entire output dictionary
    if json_output_file is not None:
        json_dict = OrderedDict()
        json_dict["name"] = json_name
        json_dict["description"] = json_description
        timestamp = datetime.today()
        json_dict["timestamp"] = str(timestamp)
        json_dict["task"] = json_task
        json_dict["author"] = json_author
        json_dict["results"] = all_scores
        json_dict["id"] = hashlib.md5(
            json.dumps(json_dict).encode("utf-8")).hexdigest()[:12]
        save_json(json_dict, json_output_file)

    return all_scores
def run(self,
        dimension,
        stage_idx,
        prev_stage_value=0,
        num_population=100,
        num_generations=100,
        elite_ratio=0.05,
        parents_ratio=0.15,
        ratio_decay=1,
        num_finetune=1,
        best_sol_1st=None):
    """Run the genetic search for one stage and return a checkpoint dict.

    Each generation selects parents, keeps the best ``elite_ratio`` share of
    them as elite, breeds/mutates the population, evaluates every genome
    with ``self.thread_fun`` in a process pool and tracks the best genome
    according to reward component ``stage_idx``. From the second half of the
    generations on, ``num_finetune`` passes are made per generation; passes
    after the first only apply the fine-tune tile mutation.

    A reward that is ``None`` or has any non-negative component is treated
    as invalid; for ``stage_idx > 0`` so is a reward that falls below
    ``prev_stage_value`` in any component. When no valid parent is left the
    population is re-initialised.

    :param dimension: stored in the checkpoint
    :param stage_idx: index of the reward component being optimised
    :param prev_stage_value: per-component lower bounds used when
        ``stage_idx > 0``; also passed to ``select_parents``
    :param num_population: population size
    :param num_generations: number of generations; the checkpoint is built
        inside the loop, so at least one pass must run
    :param elite_ratio: share of the population carried over unchanged
    :param parents_ratio: share of the population used as parents,
        multiplied by ``ratio_decay`` after every pass
    :param ratio_decay: decay factor for ``parents_ratio``
    :param num_finetune: passes per generation in the second half
    :param best_sol_1st: when given and ``stage_idx != 0``, the population is
        seeded with this genome instead of random genomes
    :return: dict with the best reward/solution found, the reward history
        and the search settings
    """
    num_elite = int(num_population * elite_ratio)
    seed_randomly = (stage_idx == 0) or (best_sol_1st is None)

    def fresh_population():
        # random genomes for the first stage, otherwise copies (references)
        # of the first-stage solution
        if seed_randomly:
            return [
                self.create_genome_fixedSL() for _ in range(num_population)
            ]
        return [best_sol_1st for _ in range(num_population)]

    best_reward_list = []
    best_reward = [-float("Inf") for _ in range(len(self.fitness))]
    best_sol = None
    population = fresh_population()
    fitness = np.ones((num_population, len(self.fitness)), float)
    num_parents = num_population

    pool = Pool(min(num_population + num_elite, cpu_count()))
    try:
        for g in range(num_generations):
            finetine_iter = 1 if g < num_generations // 2 else num_finetune
            for f in range(finetine_iter):
                is_finetune = f > 0
                gen_best = -float("Inf")
                gen_best_idx = 0
                count_non_valid = 0
                if num_parents < 1:  # restart
                    population = fresh_population()
                    fitness = np.ones((num_population, len(self.fitness)),
                                      float)
                    print("Reinitialize population")
                    num_parents = num_population
                population, fitness, parents = self.select_parents(
                    population,
                    fitness,
                    num_parents,
                    num_population,
                    stage_idx,
                    first_stage_value=prev_stage_value)
                elite = copy.deepcopy(parents[:num_elite])
                elite_fitness = copy.deepcopy(fitness[:(len(elite))])
                if is_finetune:
                    self.mutate_tile(population,
                                     num_mu_loc=3,
                                     range_alpha=0.1,
                                     alpha=0.52,
                                     is_finetune=True)
                else:
                    self.crossover_tile(parents, population, alpha=0.57)
                    self.mutate_tile(population,
                                     num_mu_loc=3,
                                     range_alpha=0.53,
                                     alpha=0.52,
                                     is_finetune=False)
                    self.swap_order(population, alpha=0.47)
                    self.born_cluster(population, alpha=0.57)
                    self.kill_cluster(population, alpha=0.27)
                # the elite always goes first, matching its fitness rows
                population = elite + population
                fitness = np.concatenate((elite_fitness, fitness))
                reward_list = pool.map(self.thread_fun, population)
                for i, reward in enumerate(reward_list):
                    if reward is None or any(np.array(reward) >= 0):
                        reward = [
                            float("-Inf") for _ in range(len(best_reward))
                        ]
                        count_non_valid += 1
                    elif stage_idx > 0:
                        if any(reward[kk] < prev_stage_value[kk]
                               for kk in range(len(prev_stage_value))):
                            reward = [
                                float("-Inf")
                                for _ in range(len(best_reward))
                            ]
                            count_non_valid += 1
                    judging_reward = reward[stage_idx]
                    fitness[i] = reward
                    if gen_best < judging_reward:
                        gen_best = judging_reward
                        gen_best_idx = i
                judging_best_reward = best_reward[stage_idx]
                if judging_best_reward < gen_best:
                    best_reward = copy.deepcopy(fitness[gen_best_idx])
                    best_sol = copy.deepcopy(population[gen_best_idx])
                num_parents = int(num_population * parents_ratio)
                # never ask for more parents than there are valid genomes
                num_parents = min(num_parents,
                                  len(population) - count_non_valid)
                parents_ratio *= ratio_decay
                best_reward_list.append(best_reward)
                chkpt = {
                    "best_reward": best_reward,
                    "best_reward_list": best_reward_list,
                    "best_sol": best_sol,
                    "num_population": num_population,
                    "num_generations": num_generations,
                    "fitness_use": self.fitness,
                    "num_pe": self.num_pe,
                    "l1_size": self.l1_size,
                    "l2_size": self.l2_size,
                    "NocBW": self.NocBW,
                    "dimension": dimension
                }
                if self.log_level == 2:
                    print(
                        "[Stage {}]Gen {}: Gen reward: {:3e}, 1st stage Reward: {}, Best reward: {}, Non_valid: {}"
                        .format(stage_idx + 1, (g + 1), gen_best,
                                np.abs(prev_stage_value),
                                np.abs(best_reward), count_non_valid))
                elif self.log_level == 1:
                    if stage_idx == 0:
                        print("[Stage {}]Gen {}: Best reward: {}".format(
                            stage_idx + 1, (g + 1),
                            np.abs(best_reward)[0]))
                    else:
                        print(
                            "[Stage {}]Gen {}: 1st stage Reward: {}, Best reward: {}"
                            .format(stage_idx + 1, (g + 1),
                                    np.abs(prev_stage_value),
                                    np.abs(best_reward)))
    finally:
        # release the workers on every exit path and wait for them to stop
        pool.close()
        pool.join()
    return chkpt