def test_error():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        assert raises(TypeError,
                      lambda: c.get({'x': 1, 'y': (inc, 'x', 'x')}, 'y'))
        assert 'y' not in s.data
        c.close()
def test_get_with_dill():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        dsk = {'x': 1, 'y': (partial(add, 1), 'x')}
        keys = 'y'

        assert c.get(dsk, keys) == 2
        c.close()
def test_get():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        dsk = {'x': 1, 'y': (add, 'x', 'x'), 'z': (inc, 'y')}
        keys = ['y', 'z']

        assert c.get(dsk, keys) == [2, 3]
        c.close()
def test_multiple_clients():
    with scheduler_and_workers(1) as (s, (a,)):
        c = Client(s.address_to_clients)
        d = Client(s.address_to_clients)

        assert c.get({'x': (inc, 1)}, 'x') == d.get({'x': (inc, 1)}, 'x')

        pool = ThreadPool(2)
        future1 = pool.apply_async(c.get,
                                   args=({'x': 1, 'y': (inc, 'x')}, 'y'))
        future2 = pool.apply_async(d.get,
                                   args=({'a': 1, 'b': (inc, 'a')}, 'b'))

        while not (future1.ready() and future2.ready()):
            sleep(1e-6)

        assert future1.get() == future2.get()

        c.close()
        d.close()
def test_multiple_clients():
    with scheduler_and_workers(1) as (s, (a,)):
        c = Client(s.address_to_clients)
        d = Client(s.address_to_clients)

        assert c.get({'x': (inc, 1)}, 'x') == d.get({'x': (inc, 1)}, 'x')

        def sleep_inc(x):
            sleep(0.5)
            return x + 1

        pool = ThreadPool(2)
        future1 = pool.apply_async(c.get,
                                   args=({'x': 1, 'y': (sleep_inc, 'x')}, 'y'))
        future2 = pool.apply_async(d.get,
                                   args=({'a': 1, 'b': (sleep_inc, 'a')}, 'b'))

        assert future1.get() == future2.get()

        c.close()
        d.close()
def test_register_collections():
    try:
        import dask.bag as db
    except ImportError:
        return

    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        # Use a name that does not shadow the worker `b` from the fixture.
        bag = db.from_sequence(range(5), npartitions=2).map(inc)
        assert not s.collections
        c.set_collection('mybag', bag)
        assert 'mybag' in s.collections

        d = Client(s.address_to_clients)
        bag2 = d.get_collection('mybag')

        assert (type(bag) == type(bag2)
                and bag.npartitions == bag2.npartitions)
        assert list(bag) == list(bag2)

        c.close()
        d.close()
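# The tests above target an early dask.distributed prototype
# (scheduler_and_workers(), Client(s.address_to_clients), c.get(...)).
# For comparison, a roughly equivalent check against the current
# dask.distributed API might look like the sketch below; it is illustrative
# only and not part of the original test suite.
from operator import add

from dask.distributed import Client, LocalCluster


def test_get_local_cluster():
    # Spin up an in-process cluster, run a tiny graph through Client.get,
    # and tear everything down via the context managers.
    with LocalCluster(n_workers=2, processes=False, dashboard_address=None) as cluster:
        with Client(cluster) as c:
            dsk = {'x': 1, 'y': (add, 'x', 'x')}
            assert c.get(dsk, 'y') == 2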
def run_JK_distributed(df, param, randomize=True): '''Receives the pandas dataframe with the objects containing the temperature decrements and the parameter object and run the kSZ statistic and generate Jack Knifes. Everything runs in the cluster, so current terminal does not need to request many cpus. df: dataframe object containing the variables for the calculation params: param file for this calculation NJK: how many subgroups we will make to run the calculation randomize: shuffle data before running the JK''' Ncores = envVars.Ncores NWorkers = envVars.NWorkers Ngroups = param.JK_NGROUPS resampling_method = param.JK_RESAMPLING_METHOD.lower() #setup cluster cluster = SGECluster(walltime='172800', processes=1, cores=1, env_extra=['#$-pe sge_pe %i' % Ncores, '-l m_core=%i' % Ncores, 'mkdir -p /tmp/pag227/dask/dask-scratch', 'export NUMBA_NUM_THREADS=%i' % Ncores, 'export OMP_NUM_THREADS=%i' % Ncores # 'export OMP_NUM_THREADS=1', # noqa ]) cluster.scale(NWorkers) client = Client(cluster) time.sleep(30) #end setting up cluster #send full dataset to the cluster future_fullDataset = client.scatter(df) future_params = client.scatter(param) res_fullDataset = client.submit(pairwiser.get_pairwise_ksz, future_fullDataset, future_params, multithreading=True) #done with the full dataset #iterate over partial dataset for the JK if JK == resampling_method: indices_toDrop = JK_tools.indicesToDrop(df, Ngroups, randomize=randomize) jk_results = [] futureData = [] #data to be sent in jk or bootstrap in galaxy space if (JK == resampling_method) or (BS == resampling_method): for j in range(Ngroups): # submit data to the cluster if JK in resampling_method: # if method jk dataJK = df.drop(indices_toDrop[j], inplace=False) futureData.append(client.scatter(dataJK)) elif BS in resampling_method: dataBS = df.sample(len(df), replace=True) futureData.append(client.scatter(dataBS)) #Now do the JK calculation for j in range(Ngroups): jk_results.append(client.submit(pairwiser.get_pairwise_ksz, futureData[j], future_params, multithreading=True)) if BS_PW == resampling_method: # submit the same dataset futureData = client.scatter(df, broadcast=True) for j in range(Ngroups): jk_results.append(client.submit(bs_pw.get_bootstrap_pairwise, futureData, future_params, multithreading=True, pure=False)) if resampling_method == BS_DT: for j in range(Ngroups): df_bs = df.copy() choose = np.random.choice(len(df), len(df)) df_bs['dT'] = df.dT.values[choose] futureData.append(client.scatter(df_bs)) for j in range(Ngroups): jk_results.append(client.submit(pairwiser.get_pairwise_ksz, futureData[j], future_params, multithreading=True)) if resampling_method == TL_JK: tiled_JK.classify_grid(df) df = tiled_JK.remove_edge_galaxies(df, tol_sigma=1.5) Ntiles = tiled_JK.how_many_tiles(df) for j in range(Ntiles): df_tosubmit = tiled_JK.remove_tile(df, j) futureData.append(client.scatter(df_tosubmit)) for j in range(Ntiles): jk_results.append(client.submit(pairwiser.get_pairwise_ksz, futureData[j], future_params, multithreading=True)) #extract results fullDataset_results = res_fullDataset.result() jk_results = client.gather(jk_results) client.close() # cluster.close() return fullDataset_results, jk_results
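# run_JK_distributed above follows a scatter/submit/gather pattern: scatter
# each resampled dataframe to the cluster, submit one statistic per replicate,
# then gather all results at the end. A stripped-down sketch of that pattern is
# shown below; `statistic` stands in for pairwiser.get_pairwise_ksz and the
# helper name is an assumption, not part of the original module.
import numpy as np


def jackknife_submit(client, df, n_groups, statistic):
    # Drop a different chunk of rows for each jackknife replicate, scatter the
    # replicate once, and run the statistic on the worker holding it.
    drop_indices = np.array_split(np.random.permutation(len(df)), n_groups)
    futures = []
    for idx in drop_indices:
        replicate = df.drop(df.index[idx])
        futures.append(client.submit(statistic, client.scatter(replicate)))
    return client.gather(futures)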
class Loader(object):
    """
    A loader to populate a Dask Dataframe with data from a Cassandra table.
    """

    def __init__(self):
        """
        Initialization of DaskCassandraLoader
        > DaskCassandraLoader()
        """
        self.logger = logging.getLogger(__name__)
        self.error = None
        self.warning = None
        self.cassandra_con = None
        self.dask_client = None
        return

    def connect_to_local_dask(self):
        """
        Connects to a local Dask cluster.
        > connect_to_local_dask()
        """
        self.logger.info("Connecting to Dask")
        self.logger.info('Create and connect to a local Dask cluster.')
        dask_cluster = LocalCluster(
            scheduler_port=0,
            silence_logs=True,
            processes=False,
            asynchronous=False,
        )
        self.dask_client = Client(dask_cluster, asynchronous=False)
        self.logger.info("Connected to Dask")
        return

    def connect_to_dask(self, dask_cluster):
        """
        Connects to an existing Dask cluster.
        > connect_to_dask('127.0.0.1:8786')
        or
        > connect_to_dask(cluster)
        :param dask_cluster: A string with format url:port or an instance of Cluster.
        """
        self.logger.info("Connecting to Dask")
        self.logger.info('Connect to an existing Dask cluster.')
        self.dask_client = Client(dask_cluster, asynchronous=False)
        self.logger.info("Connected to Dask")
        return

    def disconnect_from_dask(self):
        """
        Ends the established Dask connection.
        > disconnect_from_dask()
        """
        self.dask_client.close()
        return

    def connect_to_cassandra(self, cassandra_clusters, cassandra_keyspace, username, password):
        """
        Connects to a Cassandra cluster specified by a list of IPs.
        > connect_to_cassandra(['10.0.1.1', '10.0.1.2'], 'test', 'user', 'pass')
        :param cassandra_clusters: A list of IPs, with each IP represented as a string.
        :param cassandra_keyspace: A string naming an existing Cassandra keyspace.
        :param username: A string.
        :param password: A string.
        """
        if cassandra_keyspace == "":
            raise Exception("Keyspace can't be an empty string!")

        try:
            self.cassandra_con = Connector(cassandra_clusters, cassandra_keyspace, username, password)
        except Exception as e:
            raise Exception("It was not possible to set a connection with the Cassandra cluster: " + str(e))
        return

    def disconnect_from_cassandra(self):
        """
        Ends the established Cassandra connection.
        > disconnect_from_cassandra()
        """
        if self.cassandra_con is not None:
            self.cassandra_con.shutdown()
        return

    def load_cassandra_table(self, table_name, projections, and_predicates, partitions_to_load, force):
        """
        Loads a Cassandra table into a Dask dataframe.
        > load_cassandra_table('tab1',
                               ['id', 'year', 'month', 'day'],
                               [('month', 'less_than', [1]), ('day', 'in_', [1, 2, 3, 8, 12, 30])],
                               [('id', [1, 2, 3, 4, 5, 6]), ('year', [2019])],
                               False)
        :param table_name: A string.
        :param projections: A list of column names; each column name is a string.
        :param and_predicates: A list of triples. Each triple contains a column name as a string,
            an operator name as a string, and a list of values whose form depends on the operator.
            CassandraOperators.print_operators() prints all available operators.
            It should only contain columns which are not partition columns.
        :param partitions_to_load: A list of tuples. Each tuple has a column name as a string
            and a list of keys which should be selected.
            It should only contain columns which are partition columns.
        :param force: A boolean. It should be set to 'True' only if all partitions need to be
            loaded, which is not recommended.
        """
        table = Table(self.cassandra_con.keyspace, table_name)
        table.load_metadata(self.cassandra_con)
        if table.error:
            raise Exception("load_cassandra_table failed: " + table.error)

        loading_query = LoadingQuery()
        loading_query.set_projections(table, projections)
        if loading_query.error:
            raise Exception("load_cassandra_table failed: " + loading_query.error)

        loading_query.set_and_predicates(table, and_predicates)
        if loading_query.error:
            raise Exception("load_cassandra_table failed: " + loading_query.error)

        loading_query.partition_elimination(table, partitions_to_load, force)
        if loading_query.error:
            raise Exception("load_cassandra_table failed: " + loading_query.error)

        loading_query.build_query(table)
        if loading_query.error:
            raise Exception("load_cassandra_table failed: " + loading_query.error)
        loading_query.print_query()

        table.load_data(self.cassandra_con, loading_query)
        return table
def run_MC_case(n_mc, local=True, dask=True, batch_size=os.cpu_count()): """ Inputs: n_mc: the number of MC samples local: if using Dask, whether to use the local option (True) dask: whether to use dask (True) batch_size: for the non Dask option, number of cases to run in parallel (16) Outputs: results_df: Pandas dataFrame containing inputs to and output from the model my_campaign: EasyVVUQ MC campaign object my_sampler: EasyVVUQ RandomSampler object """ times = np.zeros(9) time_start = time.time() time_start_whole = time_start # Set up a fresh campaign called "fusion_pce." if dask: my_campaign = uq.CampaignDask(name='fusion_mc.') else: my_campaign = uq.Campaign(name='fusion_mc.') # Define parameter space params = define_params() # Create an encoder and decoder for PCE test app encoder = uq.encoders.GenericEncoder(template_fname='fusion.template', delimiter='$', target_filename='fusion_in.json') decoder = uq.decoders.SimpleCSV( target_filename="output.csv", output_columns=["te", "ne", "rho", "rho_norm"]) # Add the app (automatically set as current app) my_campaign.add_app(name="fusion", params=params, encoder=encoder, decoder=decoder) time_end = time.time() times[1] = time_end - time_start print('Time for phase 1 = %.3f' % (times[1])) time_start = time.time() # Create the sampler vary = define_vary() # Associate a sampler with the campaign my_sampler = uq.sampling.RandomSampler(vary=vary, max_num=n_mc) my_campaign.set_sampler(my_sampler) # Will draw all (of the finite set of samples) my_campaign.draw_samples() print('Number of samples = %s' % my_campaign.get_active_sampler().count) time_end = time.time() times[2] = time_end - time_start print('Time for phase 2 = %.3f' % (times[2])) time_start = time.time() # Create and populate the run directories my_campaign.populate_runs_dir() time_end = time.time() times[3] = time_end - time_start print('Time for phase 3 = %.3f' % (times[3])) time_start = time.time() # Run the cases cwd = os.getcwd().replace(' ', '\\ ') # deal with ' ' in the path cmd = f"{cwd}/fusion_model.py fusion_in.json" if dask: if local: print('Running locally') import multiprocessing.popen_spawn_posix # from distributed import Client from dask.distributed import Client, LocalCluster cluster = LocalCluster(threads_per_worker=1) # Client() client = Client(cluster) # processes=True, threads_per_worker=1) else: print('Running using SLURM') from dask.distributed import Client from dask_jobqueue import SLURMCluster cluster = SLURMCluster(job_extra=[ '--qos=p.tok.openmp.2h', '--mail-type=end', '[email protected]', '-t 2:00:00' ], queue='p.tok.openmp', cores=8, memory='8 GB', processes=8) cluster.scale(32) print(cluster) print(cluster.job_script()) client = Client(cluster) print(client) my_campaign.apply_for_each_run_dir( uq.actions.ExecuteLocal(cmd, interpret='python3'), client) client.close() if not local: client.shutdown() else: client.shutdown() else: # in case there is a problem with dask execution = my_campaign.apply_for_each_run_dir( uq.actions.ExecuteLocalV2(cmd, interpret='python3'), batch_size=batch_size) execution.start() while my_campaign.get_active_sampler().count != execution.progress( )['finished']: print(execution.progress()) time.sleep(1) print(execution.progress()) time_end = time.time() times[4] = time_end - time_start print('Time for phase 4 = %.3f' % (times[4])) time_start = time.time() # Collate the results my_campaign.collate() results_df = my_campaign.get_collation_result() return results_df, my_campaign, my_sampler
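# run_MC_case above switches between a LocalCluster and a SLURMCluster before
# handing the client to EasyVVUQ. A small helper capturing just that selection
# logic might look like the sketch below; the queue name and sizing values are
# placeholders, not the settings used in the original script.
def make_client(local=True):
    from dask.distributed import Client, LocalCluster

    if local:
        # One thread per worker keeps each model run in its own process slot.
        return Client(LocalCluster(threads_per_worker=1))

    from dask_jobqueue import SLURMCluster

    cluster = SLURMCluster(queue='some.queue', cores=8, memory='8 GB', processes=8)
    cluster.scale(32)
    return Client(cluster)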
def main(src_dir):
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(
        level="DEBUG", fmt="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S"
    )

    # assume we have tunneled the scheduler to local
    scheduler = "localhost:8786"
    logger.info(f'connecting to scheduler at "{scheduler}"')
    client = Client(scheduler, timeout="300s")  # 5 min
    print(client)

    src_dir = os.path.abspath(src_dir)

    # load dataset
    src_ds = open_dataset(src_dir)
    desc = tuple(
        f"{k}={v}" for k, v in zip(("x", "y", "z"), reversed(src_ds.tile_shape))
    )
    logger.info(f"tiling dimension ({', '.join(desc)})")

    # generate tile index list (TODO deal with multi-color/view here)
    def groupby_tiles(inventory, index: List[str]):
        """
        Aggregation function that generates the proper internal list layout for
        all the tiles in their natural N-D layout.

        Args:
            inventory (pd.DataFrame): the listing inventory
            index (list of str): the column header
        """
        tiles = []
        for _, tile in inventory.groupby(index[0]):
            if len(index) > 1:
                # we are not at the fastest dimension yet, decrease 1 level
                tiles.extend(groupby_tiles(tile, index[1:]))
            else:
                # fastest dimension, call retrieval function
                tiles.append(src_ds[tile])
        return tiles

    index = ["tile_y", "tile_x"]
    if "tile_z" in src_ds.index.names:
        index = ["tile_z"] + index
    logger.info(f"a {len(index)}-D tiled dataset")

    tiles = groupby_tiles(src_ds, index)
    logger.info(f"{len(tiles)} to process")

    tiles_bin4 = [downsample_naive(tile, 4) for tile in tiles]

    dname = os.path.basename(src_dir)
    dname = f"{dname}_bin4"
    dst_dir = os.path.join(os.path.dirname(src_dir), dname)
    create_dir(dst_dir)

    # write back
    write_back_tasks = []
    for i, tile in enumerate(tiles_bin4):
        fname = f"tile_{i:04d}.tif"
        path = os.path.join(dst_dir, fname)
        future = write_tiff(path, tile)
        write_back_tasks.append(future)

    # submit tasks
    futures = client.compute(write_back_tasks, scheduler="processes")
    with tqdm(total=len(futures)) as pbar:
        for future in as_completed(futures, with_results=False):
            try:
                uri = future.result()
                uri = os.path.basename(uri)
                pbar.set_description(uri)
                pbar.update(1)
            except Exception as error:
                logger.exception(error)
            future.release()

    logger.info("closing scheduler connection")
    client.close()
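# The helpers downsample_naive() and write_tiff() used in main() are not shown
# in this listing. A plausible sketch of the naive downsampling step (simple
# strided subsampling of a numpy/dask tile by a fixed ratio) is given below;
# the real helper may differ in detail.
def downsample_naive(tile, ratio):
    # Keep every `ratio`-th pixel along the last two (y, x) axes.
    return tile[..., ::ratio, ::ratio]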
def build_patches(min_lon, min_lat, max_lon, max_lat, begin_iso_dt, end_iso_dt, config_yaml): logging.info("Executing: %s(%s)" % ("build_patches", locals())) # Projection definition for Pyproj (PROJ4 keywords) grid_proj_params = { "proj": "lcc", "lon_0": (min_lon + max_lon) / 2, "lat_0": (min_lat + max_lat) / 2, "lat_1": min_lat, "lat_2": max_lat } print(grid_proj_params) with open(config_yaml, "r") as config_file: config = yaml.load(config_file) glm_config = config["glm"] glm_path = glm_config["glm_path"] grid_path = glm_config["grid_path"] file_freq = glm_config["file_freq"] glm_file_dates = pd.DatetimeIndex( pd.date_range(start=begin_iso_dt, end=end_iso_dt, freq=file_freq)) grid_freq = glm_config["grid_freq"] dx_km = glm_config["dx_km"] x_extent_km = int( math.ceil( haversine((min_lat, min_lon), (min_lat, max_lon), unit=Unit.KILOMETERS))) y_extent_km = int( math.ceil( haversine((min_lat, min_lon), (max_lat, min_lon), unit=Unit.KILOMETERS))) if not os.path.exists(grid_path): os.makedirs(grid_path) if PARALLEL: cluster = LocalCluster(n_workers=N_WORKERS, processes=True, threads_per_worker=THREADS_PER_WORKER, memory_limit=WORKER_MEM) client = Client(cluster) glm_jobs = [] for date in glm_file_dates: logging.info("Processing: %s", date) glm_jobs.append( client.submit(create_glm_grids, glm_path, grid_path, date, min(end_date, date + pd.Timedelta(file_freq)), grid_freq, grid_proj_params, dx_km, x_extent_km, y_extent_km)) for glm_job in as_completed(glm_jobs): res = glm_job.result() if glm_job.status == "error": traceback.format_tb(res[-1]) del glm_jobs[:] else: for date in glm_file_dates: logging.info("Processing: %s", date) create_glm_grids(glm_path, grid_path, date, min(end_iso_dt, date + pd.Timedelta(file_freq)), grid_freq, grid_proj_params, dx_km, x_extent_km, y_extent_km) abi_config = config["abi"] abi_path = abi_config["abi_path"] patch_path = abi_config["patch_path"] glm_grid_path = abi_config["glm_grid_path"] bands = np.array(abi_config["bands"]) file_freq = abi_config["file_freq"] lead_time = abi_config["lead_time"] patch_x_length_pixels = abi_config["patch_x_length_pixels"] patch_y_length_pixels = abi_config["patch_y_length_pixels"] time_range_minutes = abi_config["time_range_minutes"] bt = bool(abi_config["bt"]) if not os.path.exists(patch_path): makedirs(patch_path) abi_file_dates = pd.DatetimeIndex( pd.date_range(start=begin_iso_dt, end=end_iso_dt, freq=file_freq)) if PARALLEL: abi_jobs = [] for date in abi_file_dates: abi_jobs.append( client.submit(extract_all_abi_patches, abi_path, patch_path, glm_grid_path, date, min(end_iso_dt, date + pd.Timedelta(file_freq)), bands, lead_time, patch_x_length_pixels, patch_y_length_pixels, time_range_minutes=time_range_minutes, glm_file_freq=file_freq, bt=bt)) # for abi_job in as_completed(abi_jobs): # res = abi_job.result() # if abi_job.status == "error": # print(traceback.format_tb(res[-1]),flush=True) wait(abi_jobs) abi_results = client.gather(abi_jobs) del abi_jobs[:] client.close() else: for date in abi_file_dates: extract_all_abi_patches(abi_path, patch_path, glm_grid_path, date, min(end_iso_dt, date + pd.Timedelta(file_freq)), bands, lead_time, patch_x_length_pixels, patch_y_length_pixels, time_range_minutes=time_range_minutes, glm_file_freq=file_freq, bt=bt)
def test_pagerank(): gc.collect() input_data_path = r"../datasets/hibench_small/1/part-00000.csv" # Networkx Call pd_df = pd.read_csv(input_data_path, delimiter='\t', names=['src', 'dst']) G = nx.DiGraph() for i in range(0, len(pd_df)): G.add_edge(pd_df['src'][i], pd_df['dst'][i]) nx_pr = nx.pagerank(G, alpha=0.85) nx_pr = sorted(nx_pr.items(), key=lambda x: x[0]) # Cugraph snmg pagerank Call cluster = LocalCUDACluster(threads_per_worker=1) client = Client(cluster) t0 = time.time() chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, delimiter='\t', names=['src', 'dst'], dtype=['int32', 'int32']) y = ddf.to_delayed() x = client.compute(y) wait(x) t1 = time.time() print("Reading Csv time: ", t1 - t0) new_ddf = dcg.drop_duplicates(x) t2 = time.time() pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=50) wait(pr) t3 = time.time() print("Running PR algo time: ", t3 - t2) t4 = time.time() res_df = pr.compute() t5 = time.time() print("Compute time: ", t5 - t4) print(res_df) # Use tempfile.mkstemp() to get a temp file name. Close and delete the file # so to_csv() can create it using the unique temp name (tempfileHandle, tempfileName) = tempfile.mkstemp(suffix=".csv", prefix="pagerank_") os.close(tempfileHandle) os.remove(tempfileName) # For bigdatax4, chunksize=100000000 to avoid oom on write csv t6 = time.time() res_df.to_csv(tempfileName, header=False, index=False) t7 = time.time() print("Write csv time: ", t7 - t6) # Comparison err = 0 tol = 1.0e-05 for i in range(len(res_df)): if (abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1): err = err + 1 print("Mismatches:", err) assert err < (0.02 * len(res_df)) client.close() cluster.close() os.remove(tempfileName)
class MGContext:
    """Utility Context Manager to start a multi GPU context using dask_cuda

    Parameters
    ----------
    number_of_devices : int
        Number of devices to use, verification must be done prior to call to
        ensure that there are enough devices available. If not specified, the
        cluster will be initialized to use all visible devices.
    rmm_managed_memory : bool
        True to enable managed memory (UVM) in RMM as part of the cluster.
        Default is False.
    p2p : bool
        Initialize UCX endpoints if True. Default is False.
    """

    def __init__(self, number_of_devices=None, rmm_managed_memory=False, p2p=False):
        self._number_of_devices = number_of_devices
        self._rmm_managed_memory = rmm_managed_memory
        self._client = None
        self._p2p = p2p
        self._cluster = CUDACluster(
            n_workers=self._number_of_devices,
            rmm_managed_memory=self._rmm_managed_memory,
        )

    @property
    def client(self):
        return self._client

    @property
    def cluster(self):
        return self._cluster

    def __enter__(self):
        self._prepare_mg()
        return self

    def _prepare_mg(self):
        self._prepare_client()
        self._prepare_comms()

    def _prepare_client(self):
        self._client = Client(self._cluster)
        self._client.wait_for_workers(self._number_of_devices)

    def _prepare_comms(self):
        Comms.initialize(p2p=self._p2p)

    def _close(self):
        Comms.destroy()
        if self._client is not None:
            self._client.close()
        if self._cluster is not None:
            self._cluster.close()

    def __exit__(self, type, value, traceback):
        self._close()
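# A short usage sketch for MGContext; the work done inside the block is a
# placeholder, and `number_of_devices=2` simply assumes two GPUs are visible.
def example_mg_run():
    with MGContext(number_of_devices=2, rmm_managed_memory=False, p2p=True) as ctx:
        # Any dask/cugraph work submitted here uses the CUDA cluster created by
        # the context manager; it is torn down automatically on exit.
        print(ctx.client.scheduler_info()["workers"])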
def run_error_estimation_distributed(df1, df2, param): Ncores = envVars.Ncores NWorkers = envVars.NWorkers Ngroups = param.JK_NGROUPS #setup cluster cluster = SGECluster( walltime='172800', processes=1, cores=1, env_extra=[ '#$-pe sge_pe %i' % Ncores, '-l m_core=%i' % Ncores, 'mkdir -p /tmp/pag227/dask/dask-scratch', 'export NUMBA_NUM_THREADS=%i' % Ncores, 'export OMP_NUM_THREADS=%i' % Ncores # 'export OMP_NUM_THREADS=1', # noqa ]) cluster.scale(NWorkers) client = Client(cluster) time.sleep(10) #end setting up cluster #send full dataset to the cluster future_df1 = client.scatter(df1) future_df2 = client.scatter(df2) future_params = client.scatter(param) res_fullDataset_11 = client.submit(cpw.get_cross_pairwise_ksz, future_df1, future_df1, future_params) res_fullDataset_12 = client.submit(cpw.get_cross_pairwise_ksz, future_df1, future_df2, future_params) res_fullDataset_22 = client.submit(cpw.get_cross_pairwise_ksz, future_df2, future_df2, future_params) #done with the full dataset #iterate over partial dataset for the JK replicants1 = [] #data to be sent replicants2 = [] if 'jk' in param.JK_RESAMPLING_METHOD.lower(): all_indx = np.arange(len(df1)) np.random.shuffle(all_indx) indx_to_drop = np.array_split(all_indx, param.JK_NGROUPS) for j in range(Ngroups): # submit data to the cluster if 'jk' in param.JK_RESAMPLING_METHOD.lower(): # if method jk todrop = indx_to_drop[j] replicant1 = df1.drop(df1.index[todrop], inplace=False) replicant2 = df2.drop(df2.index[todrop], inplace=False) replicants1.append(client.scatter(replicant1)) replicants2.append(client.scatter(replicant2)) elif 'bootstrap' in param.JK_RESAMPLING_METHOD.lower(): indxs = np.random.randint(low=0, high=len(df1), size=len(df1)) replicant1 = df1.iloc[indxs] replicant2 = df2.iloc[indxs] replicants1.append(client.scatter(replicant1)) replicants2.append(client.scatter(replicant2)) #Now do the JK calculation realizations11 = [] realizations12 = [] realizations22 = [] for j in range(Ngroups): realizations11.append( client.submit(cpw.get_cross_pairwise_ksz, replicants1[j], replicants1[j], future_params)) realizations12.append( client.submit(cpw.get_cross_pairwise_ksz, replicants1[j], replicants2[j], future_params)) realizations22.append( client.submit(cpw.get_cross_pairwise_ksz, replicants2[j], replicants2[j], future_params)) #extract results fullDataset_result11 = res_fullDataset_11.result() fullDataset_result12 = res_fullDataset_12.result() fullDataset_result22 = res_fullDataset_22.result() resampling_result11 = client.gather(realizations11) resampling_result12 = client.gather(realizations12) resampling_result22 = client.gather(realizations22) client.close() # cluster.close() results = { 'full11': fullDataset_result11, 'full12': fullDataset_result12, 'full22': fullDataset_result22, 'resampled11': resampling_result11, 'resampled12': resampling_result12, 'resampled22': resampling_result22 } return results
class GraphData:
    def __init__(self, graph_num_bars, dask_address):
        self.dask_client = Client(address=dask_address)
        self.currentValue = {
            'Memory': {'total_memory': 0, 'used_memory': 0},
            'CPU': {'cpu_usage': 0},
            'Cluster': {'n_workers': 0, 'total_threads': 0},
            'Workers': [],
        }
        self.update_dask_values()

        # Constants data
        self.mem_max_value = self.currentValue['Memory']['total_memory']
        self.util_max_value = 100
        self.graph_num_bars = graph_num_bars
        # Data for graphs
        self.cpu_util = [0] * graph_num_bars
        self.mem_util = [0] * graph_num_bars
        # Data for statistics
        self.n_workers = self.num_workers()
        self.total_mem = self.currentValue['Memory']['total_memory']
        self.used_mem = self.currentValue['Memory']['used_memory']

    def close_con(self):
        self.dask_client.close()

    def update_all(self):
        self.update_dask_values()
        self.n_workers = self.num_workers()
        self.mem_max_value = self.currentValue['Memory']['total_memory']
        self.total_mem = self.currentValue['Memory']['total_memory']
        self.used_mem = self.currentValue['Memory']['used_memory']
        self.cpu_util = self.update_graph_val(self.cpu_util, self.cpu_usage())
        self.mem_util = self.update_graph_val(self.mem_util, self.used_mem)

    def reset(self):
        self.cpu_util = [0] * self.graph_num_bars
        self.mem_util = [0] * self.graph_num_bars
        self.mem_max_value = 0
        self.total_mem = 0
        self.used_mem = 0

    def update_graph_val(self, values, new_val):
        values_num = len(values)
        if values_num > self.graph_num_bars:
            values = values[values_num - self.graph_num_bars - 1:]
        elif values_num < self.graph_num_bars:
            zero_pad = [0] * (self.graph_num_bars - values_num)
            values = zero_pad + values
        values.append(new_val)
        return values[1:]

    def update_dask_values(self):
        self.worker_info = self.dask_client.scheduler_info()['workers']
        self.currentValue['Memory']['total_memory'] = round(self.available_memory() / (1024 ** 2), 2)
        self.currentValue['Memory']['used_memory'] = round(self.used_memory() / (1024 ** 2), 2)
        self.currentValue['Memory']['used_memory_percent'] = (
            self.currentValue['Memory']['used_memory'] / self.currentValue['Memory']['total_memory'])
        self.currentValue['CPU']['cpu_usage'] = self.cpu_usage()
        self.currentValue['Cluster']['n_workers'] = self.num_workers()
        # Use the thread count here; the original assigned num_workers() to both fields.
        self.currentValue['Cluster']['total_threads'] = self.num_threads()
        self.currentValue['Workers'] = self.get_worker_stats()

    def num_workers(self):
        return len(self.worker_info)

    def num_threads(self):
        threads = [worker['nthreads'] for _, worker in self.worker_info.items()]
        return sum(threads)

    def available_memory(self):
        tots = 0
        for w, info in self.worker_info.items():
            tots += info['memory_limit']
        return tots

    def used_memory(self):
        tots = 0
        for w, info in self.worker_info.items():
            tots += info['metrics']['memory']
        return tots

    def get_worker_stats(self):
        worker_stats = []
        for w, info in self.worker_info.items():
            stats = {
                'user': '******',
                'id': 'filler',
                'name': 'filler',
                'rawtime': 1,
                'time': 1,
                'command': '',
                'cpu': 1,
                'memory': 1,
                'local_ports': 'filler',
            }
            stats['address'] = w
            stats['nthreads'] = info['nthreads']
            stats['memory'] = round(info['metrics']['memory'] / (1024 ** 2), 2)
            stats['memory_limit'] = round(info['memory_limit'] / (1024 ** 2), 2)
            stats['cpu'] = info['metrics']['cpu']
            stats['read'] = round(info['metrics']['read_bytes'] / (1024 ** 2), 2)
            stats['write'] = round(info['metrics']['write_bytes'] / (1024 ** 2), 2)
            worker_stats.append(stats)
        return worker_stats

    def cpu_usage(self):
        """Average cpu utilization across all workers."""
        usages = []
        for w, info in self.worker_info.items():
            usages.append(info['metrics']['cpu'])
        if len(usages) > 0:
            return sum(usages) / len(usages)
        else:
            return 0
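# A small usage sketch for GraphData: poll the scheduler a few times and print
# the summary values. The scheduler address is a placeholder.
import time


def example_monitor(address="tcp://127.0.0.1:8786", n_polls=3):
    data = GraphData(graph_num_bars=50, dask_address=address)
    try:
        for _ in range(n_polls):
            data.update_all()
            print(data.n_workers, data.used_mem, data.total_mem)
            time.sleep(1)
    finally:
        data.close_con()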
def train_eval_denoisers(contrast='CORPD_FBK', n_epochs=200, n_samples=None, model_name=None, model_size=None, loss='mae'): job_name = 'denoising_fastmri' model_specs = list(get_model_specs(force_res=True)) if model_name is not None: model_specs = [ms for ms in model_specs if ms[0] == model_name] if model_size is not None: model_specs = [ms for ms in model_specs if ms[1] == model_size] n_models = len(model_specs) train_cluster = SLURMCluster( cores=1, job_cpu=20, memory='80GB', job_name=job_name, walltime='20:00:00', interface='ib0', job_extra=[ f'--gres=gpu:1', '--qos=qos_gpu-t3', '--distribution=block:block', '--hint=nomultithread', '--output=%x_%j.out', ], env_extra=[ 'cd $WORK/fastmri-reproducible-benchmark', '. ./submission_scripts_jean_zay/env_config.sh', ], ) train_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models) client = Client(train_cluster) futures = [ client.submit( # function to execute train_denoiser, model=(model_fun, kwargs, n_inputs), run_id=f'{model_name}_{model_size}_{int(time.time())}', contrast=contrast, n_epochs=n_epochs, n_samples=n_samples, loss=loss, ) for model_name, model_size, model_fun, kwargs, n_inputs, _, _ in model_specs ] run_ids = client.gather(futures) client.close() train_cluster.close() # eval eval_denoisers( run_ids, job_name=job_name, contrast=contrast, n_epochs=n_epochs, model_name=model_name, model_size=model_size, n_samples_train=n_samples, loss=loss, ) return run_ids
def test_status():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        assert c.scheduler_status() == 'OK'
        c.close()
class Ingest:
    """
    Ingest class
    Handles running the ingestion pipeline
    """

    def __init__(self, scheduler_address, use_semantic_detection=False, client=None,
                 tmp_dir=None, use_xgboost_postprocess=False, use_rules_postprocess=False):
        """
        :param scheduler_address: Address to existing Dask scheduler
        :param use_semantic_detection: Whether or not to run semantic detection
        :param client: A Dask client. Can be passed in. If None, one will be created to connect to the scheduler
        :param tmp_dir: Path to temporary directory which intermediate files and images will be written
        :param use_xgboost_postprocess: Whether to use the XGBoost postprocessing model
        :param use_rules_postprocess: Whether to utilize the rules postprocessing, which is specific to scientific docs
        """
        logger.info("Initializing Ingest object")
        self.client = client
        if self.client is None:
            logger.info("Setting up client")
            self.client = Client(scheduler_address,
                                 serializers=['msgpack', 'dask'],
                                 deserializers=['msgpack', 'dask', 'pickle'])
            logger.info(self.client)
        self.use_xgboost_postprocess = use_xgboost_postprocess
        self.use_rules_postprocess = use_rules_postprocess
        self.use_semantic_detection = use_semantic_detection
        self.tmp_dir = tmp_dir
        if tmp_dir is None:
            raise ValueError("tmp_dir must be passed in")
        # Create a subdirectory for tmp files
        self.images_tmp = os.path.join(self.tmp_dir, 'images')
        os.makedirs(self.images_tmp, exist_ok=True)

    def __del__(self):
        """Simple client cleanup"""
        if self.client is not None:
            self.client.close()

    def ingest(self, pdf_directory, dataset_id, result_path, images_pth,
               skip_ocr=True, visualize_proposals=False, aggregations=[],
               batch_size=2000, compute_word_vecs=False, ngram=1):
        """
        Handler for the ingestion pipeline.

        Given a directory of PDFs, run the cosmos ingestion pipeline. This identifies page
        objects and optionally performs aggregations over objects (e.g. associating tables with
        table captions in scientific document pipelines).

        By default, a single parquet file will be written, containing each identified page object
        and its text. If additional aggregations are defined, a parquet file will be written for
        each defined aggregation. For additional information on the aggregations and schemas for
        the output files, see the documentation.

        :param pdf_directory: path to a directory of PDFs to process
        :param dataset_id: The dataset id for this PDF set
        :param result_path: Path to output directory where parquets and additional images will be written
        :param images_pth: Path to where images can be written to (tmp, not output images directory)
        :param skip_ocr: If True, PDFs with no metadata associated will be skipped. If False, OCR will be performed
        :param visualize_proposals: Debugging option, will write images with bounding boxes from proposals to tmp
        :param aggregations: List of aggregations to run over resulting objects
        :param batch_size: Number of page images processed per Dask batch
        :param compute_word_vecs: Whether to compute word vectors over the corpus
        :param ngram: n in ngram for word vecs
        """
        os.makedirs(images_pth, exist_ok=True)
        pdfnames = get_pdf_names(pdf_directory)
        pdf_to_images = functools.partial(Ingest.pdf_to_images, dataset_id, self.images_tmp)
        logger.info('Starting ingestion. Converting PDFs to images.')
        images = [self.client.submit(pdf_to_images, pdf, resources={'process': 1})
                  for pdf in pdfnames]

        class TimeOutError(Exception):
            pass

        def raise_timeout(var1, var2):
            raise TimeOutError

        signal.signal(signal.SIGALRM, raise_timeout)
        try:
            for _ in as_completed(images):
                signal.alarm(0)
                signal.alarm(180)
        except TimeOutError:
            images = [i for i in images if i.status == 'finished']
        else:
            signal.alarm(0)

        logger.info('Done converting to images. Starting detection and text extraction')
        images = [i.result() for i in images]
        images = [i for i in images if i is not None]
        images_queue = [i for il in images for i in il]
        images = []
        iterator = iter(images_queue)
        while chunk := list(islice(iterator, batch_size)):
            partial_propose = functools.partial(propose_and_pad, visualize=visualize_proposals)
            chunk = self.client.map(partial_propose, chunk, resources={'process': 1}, priority=8)
            if self.use_semantic_detection:
                chunk = self.client.map(detect, chunk, resources={'GPU': 1}, priority=8)
                chunk = self.client.map(regroup, chunk, resources={'process': 1})
                pool_text_ocr_opt = functools.partial(pool_text, skip_ocr=skip_ocr)
                chunk = self.client.map(pool_text_ocr_opt, chunk, resources={'process': 1})
                if self.use_xgboost_postprocess:
                    chunk = self.client.map(xgboost_postprocess, chunk, resources={'process': 1})
                    chunk = [i for i in chunk if i.result() != '']
                    if self.use_rules_postprocess:
                        chunk = self.client.map(rules_postprocess, chunk, resources={'process': 1})
            progress(chunk)
            images.extend([i.result() for i in chunk])

        results = []
        for i in images:
            with open(i, 'rb') as rf:
                obj = pickle.load(rf)
            for ind, c in enumerate(obj['content']):
                bb, cls, text = c
                scores, classes = zip(*cls)
                scores = list(scores)
                classes = list(classes)
                postprocess_cls = postprocess_score = None
                if 'xgboost_content' in obj:
                    _, postprocess_cls, _, postprocess_score = obj['xgboost_content'][ind]
                final_obj = {
                    'pdf_name': obj['pdf_name'],
                    'dataset_id': obj['dataset_id'],
                    'page_num': obj['page_num'],
                    'img_pth': obj['pad_img'],
                    'pdf_dims': list(obj['pdf_limit']),
                    'bounding_box': list(bb),
                    'classes': classes,
                    'scores': scores,
                    'content': text,
                    'postprocess_cls': postprocess_cls,
                    'postprocess_score': postprocess_score,
                }
                results.append(final_obj)

        if len(results) == 0:
            logger.info('No objects found')
            return

        result_df = pd.DataFrame(results)
        result_df['detect_cls'] = result_df['classes'].apply(lambda x: x[0])
        result_df['detect_score'] = result_df['scores'].apply(lambda x: x[0])
        for aggregation in aggregations:
            aggregate_df = aggregate_router(result_df, aggregate_type=aggregation,
                                            write_images_pth=images_pth)
            name = f'{dataset_id}_{aggregation}.parquet'
            aggregate_df.to_parquet(os.path.join(result_path, name),
                                    engine='pyarrow', compression='gzip')
        if compute_word_vecs:
            make_vecs(result_df, ngram)
        result_df.to_parquet(os.path.join(result_path, f'{dataset_id}.parquet'),
                             engine='pyarrow', compression='gzip')
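# A minimal usage sketch for the Ingest pipeline above; the scheduler address,
# paths, dataset id, and aggregation names are placeholders.
def example_ingest():
    ingest = Ingest('tcp://127.0.0.1:8786',
                    use_semantic_detection=True,
                    tmp_dir='/tmp/cosmos',
                    use_xgboost_postprocess=True)
    ingest.ingest(pdf_directory='/data/pdfs',
                  dataset_id='example_dataset',
                  result_path='/data/out',
                  images_pth='/data/out/images',
                  aggregations=['example_aggregation'])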
def main(): # dask cluster and client n_processes = 1 n_workers = n_processes * n_jobs cluster = SGECluster( interface="ib0", walltime=walltime, memory=f"32 G", resource_spec=f"h_vmem=32G", scheduler_options={ "dashboard_address": ":5761", }, job_extra=[ "-cwd", "-V", f"-pe smp {n_processes}", f"-l disk=32G", ], local_directory=os.sep.join( [os.environ.get("PWD"), "dask-find-emis-pm-space"]), ) client = Client(cluster) cluster.scale(jobs=n_jobs) time_start = time.time() # dask bag over emission_configs print( f"predicting over {len(emission_configs)} emission configs for {station_id} ..." ) bag_emission_configs = db.from_sequence(emission_configs, npartitions=n_workers) results = bag_emission_configs.map(filter_emission_configs).compute() station_diffs_abs = [result[0] for result in results] station_diffs_per = [result[1] for result in results] key = [key for key in baselines.keys()][0] station_diffs_abs = [ station_diff_abs for station_diff_abs in station_diffs_abs if len(station_diff_abs[key]) > 0 ] station_diffs_per = [ station_diff_per for station_diff_per in station_diffs_per if len(station_diff_per[key]) > 0 ] merged_per = {} for station_diff_per in station_diffs_per: merged_per = {**merged_per, **station_diff_per[key]} merged_abs = {} for station_diff_abs in station_diffs_abs: merged_abs = {**merged_abs, **station_diff_abs[key]} station_diffs_per = {key: merged_per} station_diffs_abs = {key: merged_abs} joblib.dump( obs_change_abs, f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/obs_change_abs_{output}_{station_id}.joblib" ) joblib.dump( obs_change_per, f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/obs_change_per_{output}_{station_id}.joblib" ) joblib.dump( baselines, f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/baselines_{output}_{station_id}.joblib" ) joblib.dump( targets, f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/targets_{output}_{station_id}.joblib" ) joblib.dump( target_diffs, f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/target_diffs_{output}_{station_id}.joblib" ) joblib.dump( station_diffs_abs, f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/station_diffs_abs_{output}_{station_id}.joblib" ) joblib.dump( station_diffs_per, f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/station_diffs_per_{output}_{station_id}.joblib" ) time_end = time.time() - time_start print( f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours" ) client.close() cluster.close()
class BetflagScraper(SiteScraper):
    def __init__(self, sport='calcio', bet_type='1x2', max_additional_data=-1,
                 cluster=None, offline=True, live=True, headless=True):
        if max_additional_data == -1:
            max_additional_data = math.inf
        self.max_additional_data = max_additional_data
        self.n_additional_data_loaded = [max_additional_data, max_additional_data]
        self.sport = 'calcio'
        self.bet_type = '1x2'
        self.url = 'https://www.betflag.it/exchange'
        self.refresh_period = 600  # seconds
        self.n_driver = 1
        self.offline = offline
        if not self.offline:
            self.live = True
        else:
            self.live = live
        self.headless = headless

        options = Options()
        if self.headless:
            options.headless = True
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
        if self.offline:
            self.driver.execute_script("window.name = 'offline';")
            if self.live:
                self.driver.execute_script(f"window.open('{self.url}', 'live');")
        else:
            self.driver.execute_script("window.name = 'live';")

        # Setup the driver
        self.setup_drivers(sport=sport, bet_type=bet_type)
        self.last_refresh = datetime.datetime.now()

        # Login
        time.sleep(1)
        # self.driver.find_element_by_xpath('//*[@id="btnLoginModal"]').click()
        # time.sleep(0.75)
        # self.driver.find_element_by_xpath('//*[@id="LoginUsername"]').send_keys('username')
        # self.driver.find_element_by_xpath('//*[@id="LoginPassword"]').send_keys('password')
        # self.driver.find_element_by_xpath('//*[@id="BtnLoginNew2"]').click()

        if cluster is not None:
            self.client = Client(cluster)
        else:
            self.client = Client(processes=False)

    def setup_drivers(self, sport, bet_type):
        for window in self.driver.window_handles:
            self.driver.switch_to.window(window)
            time.sleep(0.05)
            self.driver.get(self.url)
            # Close +18 warning
            try:
                WebDriverWait(self.driver, timeout=15).until(
                    expected_conditions.element_to_be_clickable(
                        (By.XPATH, '//*[@id="Button1"]')))
                time.sleep(0.5)
                self.driver.find_element_by_xpath('//*[@id="Button1"]').click()
            except TimeoutException:
                pass
            except ElementClickInterceptedException:
                pass
            try:
                WebDriverWait(self.driver, timeout=15).until(
                    expected_conditions.element_to_be_clickable(
                        (By.XPATH, '//*[@id="Button3"]')))
                time.sleep(0.5)
                self.driver.find_element_by_xpath('//*[@id="Button3"]').click()
            except TimeoutException:
                pass
            except ElementClickInterceptedException:
                pass
            # Close promo-login
            try:
                time.sleep(2)
                WebDriverWait(self.driver, timeout=25).until(
                    expected_conditions.element_to_be_clickable(
                        (By.XPATH, '//*[@id="PromoPopup"]/div[2]/div[2]')))
                self.driver.find_element_by_xpath('//*[@id="PromoPopup"]/div[2]/div[2]').click()
            except TimeoutException:
                pass
            time.sleep(1)

        self.set_live()
        # Set sport
        self.set_sport(sport)
        # Load additional data
        self.load_additional_data()
        # Set bet_type
        self.set_bet_type(bet_type)

    def set_sport(self, sport):
        # Check if the actual sport is equal to the new sport
        if self.sport == sport:
            return
        flag = True
        for window in self.driver.window_handles:
            self.driver.switch_to.window(window)
            time.sleep(0.05)
            # Find the sport button then click it
            for el in self.driver.find_elements_by_xpath('//*[@id="MenuScroller"]/ul/li'):
                if el.text.lower() == sport:
                    el.click()
                    flag = False
                    break
            if flag:
                # If it isn't present close the driver and remove it from the drivers list
                print(f'Sport: {sport} is not available!')
                # self.driver.close()
                flag = True
        self.sport = sport

    def set_bet_type(self, bet_type):
        # Check if the actual bet_type is equal to the new bet_type
        if self.bet_type == bet_type or self.sport == 'calcio':
            self.bet_type = bet_type
            return
        if self.bet_type != '1x2':
            is_uo = self.bet_type[:2] == 'uo'
            if is_uo:
                n_goal = self.bet_type[-3:]
                for window in self.driver.window_handles:
                    self.driver.switch_to.window(window)
                    time.sleep(0.05)
                    self.driver.find_element_by_xpath(
                        f"//a[contains(text(), 'Under And Over {n_goal}')]"
                    ).find_element_by_xpath('..').click()
            else:
                if bet_type == '12':
                    self.bet_type = bet_type

    def set_live(self):
        # Click on last minute on the first driver
        if self.offline:
            self.driver.switch_to.window('offline')
            time.sleep(0.05)
            try:
                WebDriverWait(self.driver, timeout=15).until(
                    expected_conditions.element_to_be_clickable((
                        By.XPATH,
                        '/html/body/form/section/div[9]/div[3]/nav/div[2]/ul/li[1]/button'
                    ))).click()
            except TimeoutException:
                pass
        # Click on live on the second driver
        if self.live:
            self.driver.switch_to.window('live')
            time.sleep(0.05)
            try:
                WebDriverWait(self.driver, timeout=15).until(
                    expected_conditions.element_to_be_clickable(
                        (By.XPATH, '//*[@id="livenowbutton"]/button'))).click()
            except TimeoutException:
                pass

    def load_additional_data(self):
        # Add data
        self.n_additional_data_loaded = []
        time.sleep(2.5)
        for window in self.driver.window_handles:
            self.driver.switch_to.window(window)
            time.sleep(0.05)
            i = 0
            while i < self.max_additional_data:
                try:
                    self.driver.find_element_by_class_name('addEvents').click()
                    time.sleep(0.25)
                    i += 1
                except NoSuchElementException:
                    break
            self.n_additional_data_loaded.append(i)
            # Scroll to the top of the page
            self.driver.execute_script("window.scrollTo(0,0)")
        # Wait until they are loaded
        self.scroll_page()

    def scroll_page(self, wait_period=0.75, jump=500, jump_per_additional_content=4):
        for window, n_additional_data_loaded in zip(self.driver.window_handles,
                                                    self.n_additional_data_loaded):
            self.driver.switch_to.window(window)
            time.sleep(0.05)
            if n_additional_data_loaded > 10:
                _wait_period = wait_period / math.log(n_additional_data_loaded)
            else:
                _wait_period = wait_period
            for i in range((n_additional_data_loaded + 1) * jump_per_additional_content):
                self.driver.execute_script(f"window.scrollTo(0,{int(jump * i)})")
                time.sleep(_wait_period)

    def refresh_pages(self):
        self.setup_drivers(self.sport, self.bet_type)
        self.last_refresh = datetime.datetime.now()

    def get_data(self):
        data = {}
        # Refresh the page every self.refresh_period seconds
        if self.last_refresh + datetime.timedelta(seconds=self.refresh_period) < datetime.datetime.now():
            self.refresh_pages()
        else:
            # Otherwise scroll the page in order to be sure that all the bets are loaded
            # self.scroll_page(0.01, jump=1000, jump_per_additional_content=1)
            pass

        futures = []
        data = {}
        for window in self.driver.window_handles:
            # Extract HTML code
            self.driver.switch_to.window(window)
            time.sleep(0.05)
            content_html = self.driver.find_element_by_class_name(
                'containerEvents').get_attribute('innerHTML')
            soup = BeautifulSoup(content_html, 'html.parser')
            # Scrape the data
            divs = soup.find_all('div', 'row-e')
            futures = [self.client.submit(parse_row, str(div), self.bet_type) for div in divs]
            results = [future.result() for future in futures]
            data.update({key: value for key, value in results if value})
        return data

    def bet(self):
        pass

    def close(self, close_client=True):
        self.driver.quit()
        if close_client:
            self.client.close()
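# A brief usage sketch for BetflagScraper; it assumes a local Chrome install
# and simply polls the exchange odds once before shutting everything down.
def example_scrape():
    scraper = BetflagScraper(sport='calcio', bet_type='1x2',
                             max_additional_data=2, offline=True,
                             live=False, headless=True)
    try:
        odds = scraper.get_data()
        print(len(odds), 'events scraped')
    finally:
        scraper.close()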
def main(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    if args.sched_addr:
        client = Client(args.sched_addr)
    else:
        filterwarnings("ignore",
                       message=".*NVLink.*rmm_pool_size.*",
                       category=UserWarning)

        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        client = Client(scheduler_addr if args.multi_node else cluster)

    if args.type == "gpu":
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    # Allow the number of chunks to vary between
    # the "base" and "other" DataFrames
    args.base_chunks = args.base_chunks or n_workers
    args.other_chunks = args.other_chunks or n_workers

    if args.all_to_all:
        all_to_all(client)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    # Only profiling the last run
    took_list.append(run(client, args, n_workers, write_profile=args.profile))

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name): [
            "%s/s" % format_bytes(x) for x in numpy.quantile(v, [0.25, 0.50, 0.75])
        ]
        for (w1, w2), v in bandwidths.items()
    }
    total_nbytes = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name): format_bytes(sum(nb))
        for (w1, w2), nb in total_nbytes.items()
    }

    broadcast = (False if args.shuffle_join
                 else (True if args.broadcast_join else "default"))

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend | {args.backend}")
    print(f"merge type | {args.type}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"base-chunks | {args.base_chunks}")
    print(f"other-chunks | {args.other_chunks}")
    print(f"broadcast | {broadcast}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devs}")
    print(f"rmm-pool | {(not args.disable_rmm_pool)}")
    print(f"frac-match | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp | {args.enable_tcp_over_ucx}")
        print(f"ib | {args.enable_infiniband}")
        print(f"nvlink | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
        t_runs[idx] = float(format_bytes(throughput).split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print("<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```")
        print("(w1,w2) | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            fmt = ("(%s,%s) | %s %s %s (%s)"
                   if args.multi_node or args.sched_addr
                   else "(%02d,%02d) | %s %s %s (%s)")
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.multi_node:
        client.shutdown()
        client.close()
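# get_scheduler_workers() is not defined in this listing. In the dask-cuda
# benchmark utilities it is essentially a thin accessor run on the scheduler;
# a sketch along those lines is shown below (treat it as an assumption, not
# the exact upstream helper).
def get_scheduler_workers(dask_scheduler=None):
    # When passed to client.run_on_scheduler, dask injects the Scheduler
    # instance as `dask_scheduler`; its .workers dict maps address -> WorkerState.
    return dask_scheduler.workers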
def tsmask_one_iteration(ncpu, mem, block, crs, out_crs, start_of_epoch, end_of_epoch, dirc, loc_str): [y1, y2, x1, x2] = block #Datacube object dc = datacube.Datacube(app='load_clearsentinel') tg_ds = tsf.load_s2_nbart_dask(dc, y1, y2, x1, x2, start_of_epoch, end_of_epoch, { "time": 1, }, crs, out_crs) memstr = str(mem) + 'GB' client = Client(n_workers=ncpu, threads_per_worker=2, memory_limit=memstr) client.compute(tg_ds) client.close() irow = tg_ds['y'].size icol = tg_ds['x'].size tn = tg_ds['time'].size print(tn, irow, icol) # Create numpy array to store TSmask results tsmask = np.zeros((tn, irow, icol), dtype=np.uint8) print("Time series cloud and shadow detection for area (", y1, y2, x1, x2, ")") # Run time series cloud mask algorithm on the data tsmask = tsmask_filter_onearea(tg_ds, ncpu, tsmask) print("Begin applying spatial filter") results = [] # number of process for the pool object number_of_workers = ncpu # Create a Pool object with a number of processes p = Pool(number_of_workers) # create a list of scene paralist = [tsmask[i, :, :] for i in range(tn)] # Start runing the spatial filter function using a pool of indepedent processes results = p.map(cym.spatial_filter_v2, paralist) # Finish the parallel runs p.close() # Join the results and put them back in the correct order p.join() # Save the cloud/shadow masks to the 'tsmask' dataarray in the s2_ds dataset for i in range(tn): tsmask[i, :, :] = results[i] print("Begin calculting long term of the indice set") bgids = bg_indices_one_iteration(ncpu, tg_ds, dirc, loc_str, tsmask, start_of_epoch, end_of_epoch) print(bgids.shape) # print("Begin creating input features for Nmask ANN model") # create_ip_data(tg_ds, bgids, loc_str, dirc) tg_ds.close()
def main(sample_run=True, sens_run=False, cfg_file=None, year_s=1979, year_e=2018): """Run the STJ Metric given a configuration file.""" # Generate an STJProperties, allows easy access to these properties across methods. if sample_run: # ----------Sample test case------------- jf_run = JetFindRun('{}/stj_config_sample.yml'.format(CFG_DIR)) date_s = dt.datetime(2016, 1, 1) date_e = dt.datetime(2016, 1, 3) elif cfg_file is not None: print(f'Run with {cfg_file}') jf_run = JetFindRun(cfg_file) else: # ----------Other cases------------- # jf_run = JetFindRun('{}/stj_kp_erai_daily.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_merra_daily.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_ncep_monthly.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_jra55_theta_mon.yml'.format(CFG_DIR)) # Four main choices jf_run = JetFindRun('{}/stj_config_erai_theta.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_erai_theta_daily.yml'.format(CFG_DIR)) # jf_run = JetFindRun( # '{}/stj_config_erai_monthly_davisbirner_gv.yml'.format(CFG_DIR) # ) # jf_run = JetFindRun('{}/stj_config_cfsr_mon.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_cfsr_day.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_jra55_mon.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_jra55_day.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_merra_monthly.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_merra_daily.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_jra55_monthly_cades.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_cfsr_monthly.yml'.format(CFG_DIR)) # jf_run = JetFindRun('{}/stj_config_jra55_daily_titan.yml'.format(CFG_DIR)) # ---------U-Max---------- # jf_run = JetFindRun('{}/stj_umax_erai_pres.yml'.format(CFG_DIR)) # ---Davis-Birner (2016)-- # jf_run = JetFindRun('{}/stj_config_erai_monthly_davisbirner_gv.yml' # .format(CFG_DIR)) if not sample_run: date_s = dt.datetime(year_s, 1, 1) date_e = dt.datetime(year_e, 12, 31) cpus = multiprocessing.cpu_count() if cpus % 4 == 0: _threads = 4 elif cpus % 3 == 0: _threads = 3 else: _threads = 2 cluster = LocalCluster(n_workers=cpus // _threads, threads_per_worker=_threads) client = Client(cluster) jf_run.log.info(client) if sens_run: sens_param_vals = { 'pv_value': np.arange(1.0, 4.5, 0.5), 'fit_deg': np.arange(3, 9), 'min_lat': np.arange(2.5, 15, 2.5), 'max_lat': np.arange(60., 95., 5.) } for sens_param in sens_param_vals: jf_run.run_sensitivity(sens_param=sens_param, sens_range=sens_param_vals[sens_param], date_s=date_s, date_e=date_e) else: jf_run.run(date_s, date_e) client.close() jf_run.log.info('JET FINDING COMPLETE')
def test_dask_serial_gridder(show_plots=False): """ Unit test that compares the standard gridder in CNGI with that in CASA. For the test to run sis14_twhya_field5_mstrans_lsrk_old.zarr and sis14_twhya_field5_mstrans_lsrk_ximage.zarr are required (placed in cngi_prototype/cngi/data/). Parameters ---------- show_plots : bool If true plots are shown of CASA and CNGI gridded visibilities and dirty images. Returns ------- pass_test : bool Is True if CNGI values are close enough to CASA values. """ import xarray as xr import cngi import os import tests.gridding_convolutional_kernels as gck from cngi.gridding import serial_grid_dask_sparse import matplotlib.pylab as plt import numpy as np from dask.distributed import Client import dask.array as da from scipy.fftpack import fft2, ifft2, fftshift, ifftshift import time cngi_path = os.path.dirname(cngi.__file__) cngi_prototype_path = cngi_path[:cngi_path.rfind('/')] # Load measurement dataset outfile = cngi_prototype_path + '/data/sis14_twhya_field5_mstrans_lsrk_old.zarr/0' vis_dataset = xr.open_zarr(outfile) # Gridding Parameters dtr = np.pi / (3600 * 180) # Degrees to Radians n_xy = np.array( [200, 400] ) # 2048 # 2 element array containing number of pixels for x (u and l) and y (v and m) axes. padding = 1.2 # Padding factor n_xy_padded = (padding * n_xy).astype( int) # 2 element array containg number of pixels with padding delta_lm = np.array( [-0.08 * dtr, 0.08 * dtr] ) # Image domain cell size converted from arcseconds to radians (negative is due to axis direction being different from indx increments) delta_uv = np.array([ 1 / (n_xy_padded[0] * delta_lm[0]), 1 / (n_xy_padded[1] * delta_lm[1]) ]) # Visibility domain cell size # Creating gridding kernel (same as CASA) support = 7 # The support in CASA is defined as the half support, which is 3 oversampling = 100 cgk_1D = gck.create_prolate_spheroidal_kernel_1D(oversampling, support) cgk, cgk_image = gck.create_prolate_spheroidal_kernel( oversampling, support, n_xy_padded) # Data used for gridding grid_data = vis_dataset.data_vars['DATA'] uvw = vis_dataset.data_vars['UVW'] freq_chan = vis_dataset.coords['chan'].values weight = vis_dataset.data_vars['WEIGHT'] weight_avg = xr.DataArray.expand_dims( (weight[:, :, 0] + weight[:, :, 1]) / 2, dim=['n_pol'], axis=2 ) # Casa averages the weights for different polarizations. See code/msvis/MSVis/VisImagingWeight.cc VisImagingWeight::unPolChanWeight # The polarization dimention is again added. flag_row = vis_dataset.data_vars['FLAG_ROW'] flag = vis_dataset.data_vars['FLAG'] n_uv = n_xy_padded # n_imag_chan = 1 # Making only one continuum image. 
n_imag_pol = 1 # Imaging only one polarization sum_weight = np.zeros((n_imag_chan, n_imag_pol), dtype=np.double) n_chan = vis_dataset.dims['chan'] n_pol = 1 # vis_dataset.dims['pol'] #just using the first polarization vis_grid = np.zeros( (n_imag_chan, n_imag_pol, n_xy_padded[0], n_xy_padded[1]), dtype=np.complex128) # Creating an empty grid chan_map = (np.zeros(n_chan)).astype( np.int ) # np.arange(0,n_chan) for cube, n_chan array which maps to the number of channels in image (cube) pol_map = (np.zeros(n_pol)).astype(np.int) grid_parms = {} grid_parms['n_imag_chan'] = n_imag_chan grid_parms['n_imag_pol'] = n_imag_pol grid_parms['n_uv'] = n_uv grid_parms['delta_lm'] = delta_lm grid_parms['oversampling'] = oversampling grid_parms['support'] = support n_workers = 2 threads_per_worker = 1 memory_limit = '4GB' client = Client(n_workers=n_workers, threads_per_worker=threads_per_worker, memory_limit=memory_limit) print(client) start = time.time() grids_and_sum_weights = da.blockwise( serial_grid_dask_sparse, ("n_time", "n_switch", "n_imag_chan", "n_imag_pol", "n_u", "n_v"), grid_data, ("n_time", "n_baseline", "n_chan", "n_pol"), uvw, ("n_time", "n_baseline", "uvw"), weight_avg, ("n_time", "n_baseline", "n_pol"), flag_row, ("n_time", "n_baseline"), flag, ("n_time", "n_baseline", "n_vis_chan", "n_vis_pol"), new_axes={ "n_switch": 2, "n_imag_chan": n_imag_chan, "n_imag_pol": n_imag_pol, "n_u": n_xy_padded[0], "n_v": n_xy_padded[1] }, adjust_chunks={"n_time": 1}, freq_chan=freq_chan, chan_map=chan_map, pol_map=pol_map, cgk_1D=cgk_1D, grid_parms=grid_parms, dtype=complex) grid_and_sum_weight = (grids_and_sum_weights.sum(axis=0)) # dask_grid.visualize(filename='dask_grid.svg') grid_and_sum_weight = grid_and_sum_weight.compute() vis_grid = grid_and_sum_weight[0, :, :, :, :].todense() sum_weight = grid_and_sum_weight[1, 0, 0, 0, 0] print('Gridding Time Dask (s): ', time.time() - start) client.close() # Create Dirty Image and correct for gridding convolutional kernel uncorrected_dirty_image = fftshift(ifft2(ifftshift( vis_grid[0, 0, :, :]))) * ( (n_xy_padded[0] * n_xy_padded[1]) / np.sum(sum_weight)) corrected_dirty_image = np.real(uncorrected_dirty_image / cgk_image) # Remove Padding start_xy = (n_xy_padded // 2 - n_xy // 2) end_xy = start_xy + n_xy corrected_dirty_image = corrected_dirty_image[start_xy[0]:end_xy[0], start_xy[1]:end_xy[1]] # Normalize results # corrected_dirty_image = corrected_dirty_image/(np.max(np.abs(corrected_dirty_image))) # Load CASA data outfile = cngi_prototype_path + '/data/sis14_twhya_field5_mstrans_lsrk_ximage.zarr' ximage_dataset = xr.open_zarr(outfile) casa_corrected_dirty_image = ximage_dataset['residual'].values[:, :, 0, 0] if show_plots == True: fig0, ax0 = plt.subplots(1, 2, sharey=True) im0 = ax0[0].imshow(casa_corrected_dirty_image) im1 = ax0[1].imshow(corrected_dirty_image) ax0[0].title.set_text('CASA Dirty Image') ax0[1].title.set_text('CNGI Dirty Image') fig0.colorbar(im0, ax=ax0[0], fraction=0.046, pad=0.04) fig0.colorbar(im1, ax=ax0[1], fraction=0.046, pad=0.04) plt.show() plt.figure() plt.imshow(casa_corrected_dirty_image - corrected_dirty_image) plt.title('Difference Dirty Image') plt.colorbar() plt.show() corrected_dirty_image = corrected_dirty_image / np.max( np.abs(corrected_dirty_image)) casa_corrected_dirty_image = casa_corrected_dirty_image / np.max( np.abs(casa_corrected_dirty_image)) # Calculate max error max_error_corrected_dirty_image = np.max( np.abs(corrected_dirty_image - casa_corrected_dirty_image)) # Calculate root mean square error 
rms_error_corrected_dirty_image = np.linalg.norm( corrected_dirty_image - casa_corrected_dirty_image, 'fro') pass_test = False print( '*******************************************************************************' ) print( 'Gridded and image values have been normalized before calculating error values' ) print('Max error between CASA and CNGI dirty images ', max_error_corrected_dirty_image) print('RMS error between CASA and CNGI dirty images ', rms_error_corrected_dirty_image) if (max_error_corrected_dirty_image < 1.2886e-07) and (rms_error_corrected_dirty_image < 1.616e-06): print('Test Pass') pass_test = True else: print('Test Fail') pass_test = False print( '*******************************************************************************' ) return pass_test
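# A minimal, self-contained sketch (not part of the test above) of the
# blockwise-then-reduce pattern that test_dask_serial_gridder relies on: a
# per-chunk function emits one partial grid per block of input rows, and the
# partial grids are summed across blocks. The names partial_grid, n_u and n_v
# are illustrative only.
import numpy as np
import dask.array as da

def partial_grid(block, n_u=8, n_v=8):
    # Return a (1, n_u, n_v) grid accumulated from this chunk only.
    grid = np.zeros((1, n_u, n_v), dtype=np.complex128)
    idx = np.arange(block.shape[0])
    np.add.at(grid[0], (idx % n_u, idx % n_v), block)
    return grid

values = da.arange(100, chunks=25).astype(np.complex128)
grids = da.blockwise(
    partial_grid, ("n_row", "n_u", "n_v"),
    values, ("n_row",),
    new_axes={"n_u": 8, "n_v": 8},
    adjust_chunks={"n_row": 1},
    dtype=np.complex128,
)
full_grid = grids.sum(axis=0).compute()  # (8, 8) grid combined over all chunks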
def launch_python_post(): curDir = os.path.dirname(os.path.abspath(__file__)) logger = PyPostTools.pyPostLogger() logger.write("Initializing WRF Python Post-Processing Program") #Step 1: Load program settings logger.write(" 1. Application Initalization") logger.write(" - Loading control file, python_post_control.txt") _pySet = PyPostSettings.PyPostSettings() logger.write(" - Success!") logger.write(" - Testing Environmental Variables") try: dask_nodes = os.environ["PYTHON_POST_NODES"] dask_threads = os.environ["PYTHON_POST_THREADS"] postDir = os.environ["PYTHON_POST_DIR"] targetDir = os.environ["PYTHON_POST_TARG_DIR"] except KeyError: logger.write("***FAIL*** KeyError encountered while trying to access important environmental variables, abort.") sys.exit("") logger.write(" - Success!") logger.write(" - Initializing Dask (" + str(dask_nodes) + " Nodes Requested), Collecting routines needed") _routines = Routines.Routines() logger.write(" - Async IO Loop initialized...") def f(scheduler_port): async def g(port): s = Scheduler(port=scheduler_port) await s await s.finished() asyncio.get_event_loop().run_until_complete(g(scheduler_port)) # Starts the scheduler in its own process - needed as otherwise it will # occupy the program and make it do an infinite loop process = Process(target=f, args=(scheduler_port,)) process.start() logger.write(" - Dask Scheduler initialized (Port " + str(scheduler_port) + ")...") try: dask_client = Client("tcp://" + socket.gethostname() + ":" + str(scheduler_port), timeout=30) except OSError: logger.write(" <-> Dask Client could not be created, timeout error.") process.terminate() sys.exit() logger.write(" - Dask Client initialized...") logger.write(" - Writing Dask Worker Job Files...") with PyPostTools.cd(targetDir): writeFile = PyPostTools.write_job_file(socket.gethostname(), scheduler_port, project_name="climate_severe", queue="debug-cache-quad", nodes=dask_nodes, wall_time=60, nProcs=1) if(writeFile == False): dask_client.close() logger.write(" - Failed to write job file, are you missing an important parameter?") sys.exit("") return else: logger.write(" - Dask Worker Job File Written, Submitting to Queue.") PyPostTools.popen("chmod +x dask-worker.job") PyPostTools.popen("qsub dask-worker.job") # Wait here for workers. logger.write(" -> Worker Job submitted to queue, waiting for workers...") while len(dask_client.scheduler_info()['workers']) < int(dask_nodes): time.sleep(2) logger.write(" -> Workers are now connected.") logger.write(" - Success!") logger.write(" 1. Done.") logger.write(" 2. Start Post-Processing Calculations") start_calculations(dask_client, _routines, dask_threads, process) logger.write(" 2. Done.") logger.write(" 3. Generating Figures") logger.write(" - Collecting files from target directory (" + targetDir + ").") fList3 = sorted(glob.glob(targetDir + "WRFPRS_F*")) logger.write(" - " + str(len(fList3)) + " files have been found.") logger.write(" -> Pushing run_plotting_routines() to dask.") fullDict = _pySet.get_full_dict() plotting_future = start_plotting(dask_client, fullDict, dask_threads, process) wait(plotting_future) result_plot = dask_client.gather(plotting_future)[0] if(result_plot != 0): logger.write("***FAIL*** An error occured in plotting method, check worker logs for more info.") logger.close() sys.exit("") logger.write(" 3. Done.") logger.write(" 4. Final Steps") logger.write(" 4. 
Done, Closing Dask Client.") # Close the client object dask_client.retire_workers(workers=dask_client.scheduler_info()['workers'], close=True) dask_client.close() logger.write("All Steps Completed.") logger.write("***SUCCESS*** Program execution complete.") logger.close() del dask_client process.terminate()
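# Condensed, hedged sketch of the scheduler-in-a-subprocess pattern used by
# launch_python_post above; here the port is made an explicit constant
# (SCHEDULER_PORT is illustrative) since the function relies on a
# scheduler_port value defined elsewhere.
import asyncio
from multiprocessing import Process
from dask.distributed import Client, Scheduler

SCHEDULER_PORT = 8786

def run_scheduler(port):
    async def main():
        s = Scheduler(port=port)
        await s             # start the scheduler
        await s.finished()  # block until it is shut down
    asyncio.run(main())

if __name__ == '__main__':
    proc = Process(target=run_scheduler, args=(SCHEDULER_PORT,))
    proc.start()
    client = Client("tcp://127.0.0.1:%d" % SCHEDULER_PORT, timeout=30)
    print(client)
    client.close()
    proc.terminate()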
memory = '15GB'

from dask_jobqueue import PBSCluster
from dask.distributed import Client
import dask.dataframe as dd
import xarray as xr

cluster = PBSCluster(cores=1, memory=memory, project='PerfTestPangeo',
                     walltime='04:00:00')
cluster.scale(ask_workers)  # ask_workers: requested worker count, assumed defined earlier
c = Client(cluster)
c

from dask.utils import ensure_dict, format_bytes

wk = c.scheduler_info()["workers"]
text = "Workers= " + str(len(wk))
memory = [w["memory_limit"] for w in wk.values()]
if all(memory):
    text += ", Memory=" + format_bytes(sum(memory))
print(text)  # Workers= 2, 2 cores, Memory=30.00 GB

%time ds = xr.open_zarr('/work/ALT/odatis/eNATL60/zarr/eNATL60-BLBT02-SSH-1h')  # 87.9 ms
%time mean = ds.sossheig.mean(dim='time_counter')  # 310 ms
%time mean.load()  # 48min13

c.close()
cluster.close()
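# Hedged helper sketch: with a batch cluster like the PBSCluster above, jobs can
# take a while to start, so block until the expected number of workers has
# connected before timing anything. `wait_for_workers` here is a hand-rolled
# stand-in; recent dask.distributed versions also ship Client.wait_for_workers.
import time

def wait_for_workers(client, n_expected, timeout=600, poll=5):
    deadline = time.time() + timeout
    while len(client.scheduler_info()["workers"]) < n_expected:
        if time.time() > deadline:
            raise TimeoutError("timed out waiting for %d workers" % n_expected)
        time.sleep(poll)

# wait_for_workers(c, ask_workers)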
def combine_probes(primer3_dir, util_dir, probes_summary_dir):
    cluster = LocalCluster(n_workers=100, threads_per_worker=1)
    client = Client(cluster)
    # primer3_dir = '/workdir/hs673/Runs/V1/Samples/HIPRFISH_4/11_27_2018/primer3'
    # util_dir = '/workdir/hs673/Runs/V1/Samples/HIPRFISH_4/11_27_2018/utilities'
    probes_filenames = '{}/*_probes.csv'.format(primer3_dir)
    blast_lineage = pd.read_csv('{}/blast_lineage.tab'.format(util_dir), sep='\t')
    taxonomic_levels = ['phylum', 'class', 'order', 'family', 'genus', 'species']
    blast_lineage_slim = blast_lineage.loc[:, ['molecule_id'] + taxonomic_levels]
    probes = dd.read_csv(probes_filenames)
    probes['molecule_id'] = probes.source.apply(get_molecule_id, meta=('str'))
    probes = probes.merge(blast_lineage_slim, on='molecule_id', how='left')
    probes['superkingdom'] = 2
    extended_taxonomic_levels = ['superkingdom', 'phylum', 'class', 'order',
                                 'family', 'genus', 'species', 'molecule_id']
    probes_taxa = probes.loc[:, ['seq'] + extended_taxonomic_levels]
    probes_summary = probes_taxa.groupby('seq').apply(
        calculate_source,
        meta=[('superkingdom', 'int'), ('phylum', 'int'), ('class', 'int'),
              ('order', 'int'), ('family', 'int'), ('genus', 'int'),
              ('species', 'int'), ('molecule_id', 'str'),
              ('max_design_level_numeric', 'int'), ('max_design_level', 'str'),
              ('max_design_target', 'int')])
    probes_properties = probes.loc[:, ['seq', 'length', 'Tm', 'GC', 'N',
                                       'self_any_th', 'self_end_th',
                                       'hair-pin', 'quality']]
    probes_summary = probes_summary.compute()
    probes_summary = probes_summary.reset_index()
    probes_properties = probes_properties.drop_duplicates().compute()
    probes_summary = probes_summary.merge(probes_properties, on='seq', how='left')
    client.close()
    probe_summary_filename = '{}/probes_summary.h5'.format(probes_summary_dir)
    probes_summary.loc[:, 'max_design_target'] = probes_summary.max_design_target.astype(str)
    taxonomic_levels = ['superkingdom', 'phylum', 'class', 'order', 'family',
                        'genus', 'species', 'molecule_id']
    for i in range(8):
        probes_summary_working_design_level = probes_summary.loc[
            probes_summary.max_design_level_numeric >= i, :]
        probes_summary_working_design_level.loc[:, 'design_level'] = taxonomic_levels[i]
        probes_summary_working_design_level.loc[:, 'design_target'] = \
            probes_summary_working_design_level.loc[:, taxonomic_levels[i]]
        probes_summary_working_design_level.groupby(
            ['design_level', 'design_target']).apply(write_to_hdf,
                                                     probe_summary_filename)
    return
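# Toy sketch of the groupby().apply(..., meta=...) idiom combine_probes uses:
# dask.dataframe cannot infer the output schema of a custom per-group function,
# so `meta` declares it up front. All column names below are illustrative.
import pandas as pd
import dask.dataframe as dd

def summarize(group):
    return pd.Series({'n_rows': float(len(group)),
                      'mean_value': group['value'].mean()})

pdf = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'b'],
                    'value': [1.0, 2.0, 3.0, 4.0, 5.0]})
ddf = dd.from_pandas(pdf, npartitions=2)
summary = ddf.groupby('key').apply(
    summarize, meta={'n_rows': 'f8', 'mean_value': 'f8'})
print(summary.compute())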
def test_dask():
    _cluster = LocalCluster(
        local_dir=os.path.join(ROOT_DIR, "../tests-output/dask-worker-space"))
    _client = Client(_cluster)

    def load1(shard):
        print("load1 {}".format(shard))
        ts = []
        for i in range(1, 5):
            ts.append(["load1_1_{}".format(shard), "load1_1_{}".format(shard), 1])
        return ts

    def load2(shard):
        print("load2 {}".format(shard))
        ts = []
        for i in range(1, 5):
            ts.append(["load2_1_{}".format(shard), "load2_1_{}".format(shard), 1])
        return ts

    def project(ts):
        print("project {}".format(ts))
        # Materialise the map so join() below can index the projected tuples
        # (map objects are not subscriptable on Python 3).
        return list(map(lambda t: [t[0], t[1], t[2]], ts))

    def join(ts1, ts2):
        print("join {}, {}".format(ts1, ts2))
        return map(lambda t: ts1[0] + ts2[0], ts1)

    def agg(ts):
        print("agg {}".format(ts))
        pr = map(lambda t: t[2], ts)
        return sum(pr)

    def reduce_(xs):
        print("reduce {}".format(xs))
        return sum(xs)

    data = [1, 2, 3, 4, 5]
    tuples = []
    for s in data:
        tuples1 = dask.delayed(load1)(s)
        tuples2 = dask.delayed(load2)(s)
        projected_tuples1 = dask.delayed(project)(tuples1)
        projected_tuples2 = dask.delayed(project)(tuples2)
        joined_tuples = dask.delayed(join)(projected_tuples1, projected_tuples2)
        aggs = dask.delayed(agg)(joined_tuples)
        tuples.append(aggs)
    total = dask.delayed(reduce_)(tuples)

    # output = []
    # for x in data:
    #     a = dask.delayed(inc)(x)
    #     for y in a:
    #         b = dask.delayed(double)(y)
    #         c = dask.delayed(add)(a, b)
    #         output.append(c)
    #
    # total = dask.delayed(sum)(output)

    total.visualize(filename=os.path.join(ROOT_DIR, "../tests-output/mydask.png"))
    res = total.compute()
    print(res)

    _client.close()
    _cluster.close()
def test_uniform_prior(): lb = np.asarray([1, 1]) ub = np.asarray([5, 5]) num_samples = 5 prior_func = uniform_prior.UniformPrior(lb, ub) # multiprocessing mode samples = prior_func.draw(num_samples, chunk_size=1) assert len( samples ) == 5, "UniformPrior functional test error, expected chunk count mismatch" samples, = dask.compute(samples) samples = np.asarray(samples) assert samples.shape[ 0] == num_samples, "UniformPrior functional test error, expected sample count mismatch" assert samples.shape[ 1] == 1, "UniformPrior functional test error, expected chunk size mismatch" assert samples.shape[2] == len( lb), "UniformPrior functional test error, dimension mismatch" samples = samples.reshape(-1, len(lb)) axis_mins = np.min(samples, 0) axis_maxs = np.max(samples, 0) assert axis_mins[0] > lb[0] and axis_maxs[0] < ub[0] and axis_mins[1] > lb[1] and axis_maxs[1] < ub[1], \ "UniformPrior functional test error, drawn samples out of bounds" # Cluster mode c = Client() samples = prior_func.draw(num_samples, chunk_size=1) assert len( samples ) == 5, "UniformPrior functional test error, expected chunk count mismatch" samples, = dask.compute(samples) samples = np.asarray(samples) assert samples.shape[ 0] == num_samples, "UniformPrior functional test error, expected sample count mismatch" assert samples.shape[ 1] == 1, "UniformPrior functional test error, expected chunk size mismatch" assert samples.shape[2] == len( lb), "UniformPrior functional test error, dimension mismatch" samples = samples.reshape(-1, len(lb)) axis_mins = np.min(samples, 0) axis_maxs = np.max(samples, 0) assert axis_mins[0] > lb[0] and axis_maxs[0] < ub[0] and axis_mins[1] > lb[1] and axis_maxs[1] < ub[1], \ "UniformPrior functional test error, drawn samples out of bounds" # chunk_size = 2 samples = prior_func.draw(num_samples, chunk_size=2) assert len( samples ) == 3, "UniformPrior functional test error, expected chunk count mismatch" samples, = dask.compute(samples) samples = np.asarray(samples) assert samples.shape[ 0] == 3, "UniformPrior functional test error, expected sample count mismatch" assert samples[-1].shape[ 0] == 2, "UniformPrior functional test error, expected chunk size mismatch" assert samples[-1].shape[1] == len( lb), "UniformPrior functional test error, dimension mismatch" samples = core._reshape_chunks(samples) axis_mins = np.min(samples, 0) axis_maxs = np.max(samples, 0) assert axis_mins[0] > lb[0] and axis_maxs[0] < ub[0] and axis_mins[1] > lb[1] and axis_maxs[1] < ub[1], \ "UniformPrior functional test error, drawn samples out of bounds" c.close()
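# Illustrative sketch (not the prior implementation exercised above) of the
# delayed-chunk pattern the test relies on: draws come back as a list of
# dask.delayed chunks, dask.compute materialises them, and the chunks are
# stacked into a single (num_samples, dim) array.
import dask
import numpy as np

lb = np.array([1.0, 1.0])
ub = np.array([5.0, 5.0])

@dask.delayed
def draw_chunk(chunk_size):
    return np.random.uniform(lb, ub, size=(chunk_size, len(lb)))

chunks = [draw_chunk(2) for _ in range(3)]   # three chunks of two samples each
chunks, = dask.compute(chunks)               # -> list of numpy arrays
samples = np.concatenate(chunks, axis=0)     # shape (6, 2)
assert np.all(samples >= lb) and np.all(samples <= ub)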
def run_tasks(pl_conf, task_type, task_fn, logging_init_fn):
    # Initialize local dask cluster
    logger.debug('Pipeline configuration: %s', pl_conf)
    cluster = LocalCluster(n_workers=pl_conf.n_workers, threads_per_worker=1,
                           processes=True, memory_limit=pl_conf.memory_limit,
                           ip='0.0.0.0')
    client = Client(cluster)

    # Split total region + tile indexes to process into separate lists for each worker
    # (by indexes of those combinations)
    tiles = pl_conf.region_tiles
    idx_batches = np.array_split(np.arange(len(tiles)), pl_conf.n_workers)

    # Assign gpus to tasks in round-robin fashion
    def get_gpu(i):
        if pl_conf.gpus is None:
            return None
        return pl_conf.gpus[i % len(pl_conf.gpus)]

    # Generate a single task configuration for each worker
    tasks = [
        pl_conf.get_task_config(region_indexes=tiles[idx_batch, 0],
                                tile_indexes=tiles[idx_batch, 1],
                                gpu=get_gpu(i))
        for i, idx_batch in enumerate(idx_batches)
    ]

    logger.info('Starting %s pipeline for %s tasks (%s workers)',
                task_type, len(tasks), pl_conf.n_workers)
    logger.debug('Task definitions:\n\t%s', '\n\t'.join([str(t) for t in tasks]))
    try:
        # Passing logging initialization operation, if given, to workers now
        # running in separate processes
        if logging_init_fn:
            client.run(logging_init_fn)

        # Disable the "auto_restart" feature of dask workers which is of no use in this context
        for worker in cluster.workers:
            worker.auto_restart = False

        # Pass tasks to each worker to execute in parallel
        res = client.map(task_fn, tasks)
        res = [r.result() for r in res]
        if len(res) != len(tasks):
            raise ValueError(
                'Parallel execution returned {} results but {} were expected'
                .format(len(res), len(tasks)))
    finally:
        # Note that this often produces a non-critical error due to: https://github.com/dask/distributed/issues/1969
        # but that closing these resources is necessary to avoid GPU oom in post-processing
        client.close()
        cluster.close()

    # Save measurement data to disk
    measure_data = concat(res)
    if measure_data:
        path = exec.record_processor_data(measure_data, pl_conf.output_dir)
        logging.info('%s complete; Measurement data saved to "%s"', task_type, path)
    else:
        logging.info('%s complete', task_type)
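# Hedged, self-contained sketch of the client.run() call used in run_tasks
# above: client.run executes a plain function once on every connected worker,
# which is how per-process logging gets initialised before tasks are mapped.
import logging
from dask.distributed import Client, LocalCluster

def init_worker_logging():
    logging.basicConfig(level=logging.INFO)
    logging.getLogger(__name__).info("worker logging configured")

if __name__ == '__main__':
    cluster = LocalCluster(n_workers=2, threads_per_worker=1, processes=True)
    client = Client(cluster)
    client.run(init_worker_logging)  # runs on every worker process
    client.close()
    cluster.close()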
def client():
    client = Client(LocalCluster(n_workers=2))
    yield client
    client.close()
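# Hedged sketch of how a generator like the one above is normally wired up: as
# a pytest fixture, so each test receives a live Client and the client is
# closed during teardown. The decorator and the test below are illustrative.
import pytest
from dask.distributed import Client, LocalCluster

@pytest.fixture
def dask_client():
    client = Client(LocalCluster(n_workers=2))
    yield client
    client.close()

def test_submit_with_fixture(dask_client):
    future = dask_client.submit(sum, [1, 2, 3])
    assert future.result() == 6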
def eval_denoisers( run_ids, job_name='eval_denoisers', contrast='CORPD_FBK', n_epochs=200, model_name=None, model_size=None, n_samples_train=None, loss='mae', ): model_specs = list(get_model_specs(force_res=True)) if model_name is not None: model_specs = [ms for ms in model_specs if ms[0] == model_name] if model_size is not None: model_specs = [ms for ms in model_specs if ms[1] == model_size] n_models = len(model_specs) eval_cluster = SLURMCluster( cores=1, job_cpu=40, memory='80GB', job_name=job_name, walltime='2:00:00', interface='ib0', job_extra=[ f'--gres=gpu:1', '--qos=qos_gpu-t3', '--distribution=block:block', '--hint=nomultithread', '--output=%x_%j.out', ], env_extra=[ 'cd $WORK/fastmri-reproducible-benchmark', '. ./submission_scripts_jean_zay/env_config.sh', ], ) eval_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models) client = Client(eval_cluster) futures = [ client.submit( # function to execute evaluate_xpdnet_denoising, model=(model_fun, kwargs, n_inputs), run_id=run_id, n_samples=50, contrast=contrast, n_epochs=n_epochs, ) for run_id, (_, _, model_fun, kwargs, n_inputs, _, _) in zip(run_ids, model_specs) ] df_results = pd.DataFrame( columns='model_name model_size psnr ssim'.split()) for (name, model_size, _, _, _, _, _), future in zip(model_specs, futures): _, eval_res = client.gather(future) df_results = df_results.append(dict( model_name=name, model_size=model_size, psnr=eval_res[0], ssim=eval_res[1], ), ignore_index=True) print(df_results) outputs_file = f'denoising_results_{n_samples_train}_{loss}.csv' if model_name is not None: outputs_file = f'denoising_results_{n_samples_train}_{loss}_{model_name}.csv' df_results.to_csv(outputs_file) print('Shutting down dask workers') client.close() eval_cluster.close() return run_ids
    hdf_file=savefiles['linemin'],
    client=client,
    verbose=True)

# First excited state
wfs = [sys['wf'], deepcopy(sys['wf'])]
optimize_orthogonal(wfs,
                    coords,
                    sys['pgrad'],
                    hdf_file=savefiles['excited1'],
                    forcing=[5.0],
                    Starget=[0.0],
                    client=client)

# Second excited state
wfs.append(deepcopy(sys['wf']))
optimize_orthogonal(wfs,
                    coords,
                    sys['pgrad'],
                    hdf_file=savefiles['excited2'],
                    forcing=[5.0, 5.0],
                    Starget=[0.0, 0.0],
                    client=client)

for nm in ['linemin', 'excited1', 'excited2']:
    print("evaluating", nm)
    find_basis_evaluate(savefiles['mf'], savefiles[nm],
                        savefiles[nm + '_vmc'], savefiles[nm + '_final'])

client.close()
def balance_trainingdata(label_file, reference_file, genome_file, output_file,
                         precision, label_num, bgzip, log_file, verbose, seed,
                         cores=1, memory_per_core='2GB'):
    """
    Function that calculates the GC content for positive and negative labeled
    genomic regions and balances their number based on GC content per
    chromosome.

    Arguments:
        label_file {str} -- [Path to a .bed file containing genomic regions
                             labeled as positive(1) or negative(0)]
        reference_file {str} -- [Path to a reference genome in FASTA format]
        genome_file {str} -- [Path to the genome file of the reference]
        output_file {str} -- [Name of the output file. File will be in .bed format]
        precision {int} -- [Precision of decimals when computing the attributes
                            like GC content]
        label_num {int} -- [Number of provided label columns]
        bgzip {boolean} -- [output is compressed or not]
        log_file {str} -- [Log file to write out logging. If not None.]
        verbose {flag} -- [Enables verbose mode.]
        seed {int} -- [Sets the seed for sampling.]
        cores {int} -- [Number of cores, default is 1.]
        memory_per_core {str} -- [Amount of memory per core. Accepted format
                                  [number]GB. Default is 2GB]
    """
    loglevel = logging.INFO
    logformat = '%(message)s'
    if verbose:
        loglevel = logging.DEBUG
        logformat = "%(asctime)s: %(levelname)s - %(message)s"

    if log_file is not None:
        logging.basicConfig(filename=log_file, level=loglevel, format=logformat)
    elif output_file is not None:
        logging.basicConfig(stream=sys.stdout, level=loglevel, format=logformat)
    else:
        logging.basicConfig(stream=sys.stderr, level=loglevel, format=logformat)

    logging.info("---------------------\nstarting workers...\n---------------------")
    client = Client(n_workers=cores, threads_per_worker=1,
                    memory_limit=memory_per_core, dashboard_address=None)
    client  # pylint: disable=pointless-statement

    logging.info("---------------------\ncalculating GC content...\n---------------------")
    cl_gc = get_gc(label_file, reference_file, label_num, precision)

    logging.info("---------------------\nextracting positive samples...\n---------------------")
    positive_sample = get_positive(cl_gc)

    logging.info("---------------------\nbalancing negative sample set...\n---------------------")
    dts = dict(cl_gc.dtypes)
    negative_sample = (cl_gc.groupby(["chrom"], group_keys=False)
                       .apply(get_negative, seed, meta=dts)).compute()

    logging.info("---------------------\nloading contigs...\n---------------------")
    contigs = load_contigs(genome_file)
    # print(contigs)

    logging.info("---------------------\ncleaning samples\n---------------------")
    positive_sample_cleaned = clean_sample(positive_sample, contigs)
    negative_sample_cleaned = clean_sample(negative_sample, contigs)

    logging.info("---------------------\nsaving results\n---------------------")
    sample_df = combine_samples(positive_sample_cleaned, negative_sample_cleaned)
    # print(sample_df.head())

    if output_file:
        write_to_file(sample_df, output_file, bgzip)
    else:
        write_to_stdout(sample_df, precision)

    logging.info("---------------------\nshutting down workers...\n---------------------")
    client.close()
class DaskParallelRunner(BaseRunner): """Interface to submit and collect a job in a distributed fashion. DaskParallelRunner is intended to comply with the bridge design pattern. Nevertheless, to reduce the amount of code within single-vs-parallel implementations, DaskParallelRunner wraps a BaseRunner object which is then executed in parallel on n_workers. This class then is constructed by passing a BaseRunner that implements a run() method, and is capable of doing so in a serial fashion. Then, this wrapper class called DaskParallelRunner uses dask to initialize N number of BaseRunner that actively wait of a RunInfo to produce a RunValue object. To be more precise, the work model is then: 1. The smbo.intensifier dictates "what" to run (a configuration/instance/seed) via a RunInfo object. 2. a tae_runner takes this RunInfo object and launches the task via tae_runner.submit_run(). In the case of DaskParallelRunner, n_workers receive a pickle-object of DaskParallelRunner.single_worker, each with a run() method coming from DaskParallelRunner.single_worker.run() 3. RunInfo objects are run in a distributed fashion, an their results are available locally to each worker. Such result is collected by DaskParallelRunner.get_finished_runs() and then passed to the SMBO. 4. Exceptions are also locally available to each worker and need to be collected. Dask works with Future object which are managed via the DaskParallelRunner.client. Parameters --------- single_worker: BaseRunner A runner to run in a distributed fashion n_workers: int Number of workers to use for distributed run. Will be ignored if ``dask_client`` is not ``None``. patience: int How much to wait for workers to be available if one fails output_directory: str, optional If given, this will be used for the dask worker directory and for storing server information. If a dask client is passed, it will only be used for storing server information as the worker directory must be set by the program/user starting the workers. dask_client: dask.distributed.Client User-created dask client, can be used to start a dask cluster and then attach SMAC to it. 
Attributes ---------- results ta stats run_obj par_factor cost_for_crash abort_i_first_run_crash n_workers futures client """ def __init__( self, single_worker: BaseRunner, n_workers: int, patience: int = 5, output_directory: typing.Optional[str] = None, dask_client: typing.Optional[dask.distributed.Client] = None, ): super(DaskParallelRunner, self).__init__( ta=single_worker.ta, stats=single_worker.stats, multi_objectives=single_worker.multi_objectives, run_obj=single_worker.run_obj, par_factor=single_worker.par_factor, cost_for_crash=single_worker.cost_for_crash, abort_on_first_run_crash=single_worker.abort_on_first_run_crash, ) # The single worker, which is replicated on a need # basis to every compute node self.single_worker = single_worker self.n_workers = n_workers # How much time to wait for workers to be available self.patience = patience self.output_directory = output_directory # Because a run() method can have pynisher, we need to prevent the multiprocessing # workers to be instantiated as demonic - this cannot be passed via worker_kwargs dask.config.set({'distributed.worker.daemon': False}) if dask_client is None: self.close_client_at_del = True self.client = Client(n_workers=self.n_workers, processes=True, threads_per_worker=1, local_directory=output_directory) if self.output_directory: self.scheduler_file = os.path.join(self.output_directory, '.dask_scheduler_file') self.client.write_scheduler_file( scheduler_file=self.scheduler_file) else: self.close_client_at_del = False self.client = dask_client self.futures = [] # type: typing.List[Future] self.scheduler_info = self.client._get_scheduler_info() def submit_run(self, run_info: RunInfo) -> None: """This function submits a configuration embedded in a run_info object, and uses one of the workers to produce a result locally to each worker. The execution of a configuration follows this procedure: 1. SMBO/intensifier generates a run_info 2. SMBO calls submit_run so that a worker launches the run_info 3. submit_run internally calls self.run(). it does so via a call to self.run_wrapper() which contains common code that any run() method will otherwise have to implement, like capping check. Child classes must implement a run() method. All results will be only available locally to each worker, so the main node needs to collect them. Parameters ---------- run_info: RunInfo An object containing the configuration and the necessary data to run it """ # Check for resources or block till one is available if not self._workers_available(): wait(self.futures, return_when='FIRST_COMPLETED').done self._extract_completed_runs_from_futures() # In code check to make sure that there are resources if not self._workers_available(): warnings.warn( "No workers are available. This could mean workers crashed" "Waiting for new workers...") time.sleep(self.patience) if not self._workers_available(): raise ValueError( "Tried to execute a job, but no worker was " "available. This likely means that a worker crashed " "or no workers were properly configured.") # At this point we can submit the job # For `pure=False`, see # http://distributed.dask.org/en/stable/client.html#pure-functions-by-default self.futures.append( self.client.submit(self.single_worker.run_wrapper, run_info, pure=False)) def get_finished_runs( self) -> typing.List[typing.Tuple[RunInfo, RunValue]]: """This method returns any finished configuration, and returns a list with the results of exercising the configurations. 
This class keeps populating results to self.results until a call to get_finished runs is done. In this case, the self.results list is emptied and all RunValues produced by running run() are returned. Returns ------- List[RunInfo, RunValue]: A list of RunValues (and respective RunInfo), that is, the results of executing a run_info a submitted configuration """ # Proactively see if more configs have finished self._extract_completed_runs_from_futures() results_list = [] while self.results: results_list.append(self.results.pop()) return results_list def _extract_completed_runs_from_futures(self) -> None: """ A run is over, when a future has done() equal true. This function collects the completed futures and move them from self.futures to self.results. We make sure futures never exceed the capacity of the scheduler """ # In code check to make sure we don;t exceed resource allocation if len(self.futures) > sum(self.client.nthreads().values()): warnings.warn( "More running jobs than resources available " "Should not have more futures/runs in remote workers " "than the number of workers. This could mean a worker " "crashed and was not able to be recovered by dask. ") # A future is removed to the list of futures as an indication # that a worker is available to take in an extra job done_futures = [f for f in self.futures if f.done()] for future in done_futures: self.results.append(future.result()) self.futures.remove(future) def wait(self) -> None: """SMBO/intensifier might need to wait for runs to finish before making a decision. This class waits until 1 run completes """ if self.futures: wait(self.futures, return_when='FIRST_COMPLETED').done def pending_runs(self) -> bool: """ Whether or not there are configs still running. Generally if the runner is serial, launching a run instantly returns it's result. On parallel runners, there might be pending configurations to complete. """ # If there are futures available, it translates # to runs still not finished/processed return len(self.futures) > 0 def run( self, config: Configuration, instance: str, cutoff: typing.Optional[float] = None, seed: int = 12345, budget: typing.Optional[float] = None, instance_specific: str = "0", ) -> typing.Tuple[StatusType, float, float, typing.Dict]: """ This method only complies with the abstract parent class. In the parallel case, we call the single worker run() method Parameters ---------- config : Configuration dictionary param -> value instance : string problem instance cutoff : float, optional Wallclock time limit of the target algorithm. If no value is provided no limit will be enforced. seed : int random seed budget : float, optional A positive, real-valued number representing an arbitrary limit to the target algorithm. Handled by the target algorithm internally instance_specific: str instance specific information (e.g., domain file or solution) Returns ------- status: enum of StatusType (int) {SUCCESS, TIMEOUT, CRASHED, ABORT} cost: float cost/regret/quality (float) (None, if not returned by TA) runtime: float runtime (None if not returned by TA) additional_info: dict all further additional run information """ return self.single_worker.run( config=config, instance=instance, cutoff=cutoff, seed=seed, budget=budget, instance_specific=instance_specific, ) def num_workers(self) -> int: """Total number of workers available. 
This number is dynamic as more resources can be allocated""" return sum(self.client.nthreads().values()) def _workers_available(self) -> bool: """"Query if there are workers available, which means that there are resources to launch a dask job""" total_compute_power = sum(self.client.nthreads().values()) if len(self.futures) < total_compute_power: return True return False def __del__(self) -> None: """Make sure that when this object gets deleted, the client is terminated. This is only done if the client was created by the dask runner.""" if self.close_client_at_del: self.client.close()
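# Self-contained sketch of the futures bookkeeping DaskParallelRunner performs:
# submit with pure=False so identical configurations are not deduplicated, keep
# the futures in a list, and harvest whichever ones report done(). All names
# here are illustrative, not part of the class above.
import random
import time
from dask.distributed import Client, LocalCluster

def evaluate(config):
    time.sleep(random.random())
    return config, random.random()

if __name__ == '__main__':
    client = Client(LocalCluster(n_workers=2, threads_per_worker=1, processes=True))
    futures, results = [], []
    for config in range(4):
        futures.append(client.submit(evaluate, config, pure=False))
    while futures:
        for future in [f for f in futures if f.done()]:
            results.append(future.result())
            futures.remove(future)
        time.sleep(0.1)
    print(results)
    client.close()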
        args.bin_width, args.filter_bandwidth, args.theta, args.shift):
    start_time = datetime.now()
    ds = xr.open_mfdataset(
        ['/scratch/pkittiwi/fg1p/binned_noise_map/bin{:.2f}MHz/'
         'fbw{:.2f}MHz/theta{:.1f}/shift{:d}/binned_noise_map_bin{:.2f}MHz_'
         'fbw{:.2f}MHz_theta{:.1f}_shift{:d}_{:03d}.nc'
         .format(bw, fbw, t, s, bw, fbw, t, s, i) for i in range(500)],
        concat_dim=noise_dim, chunks=chunks
    )
    ds_masked = ds.where(mask == 1)
    noise_var = ds_masked.var(dim=['x', 'y']).compute()
    da = noise_var.to_array().squeeze()
    del da['variable']
    da.attrs['bin_width'] = bw
    da.attrs['filter_bandwidth'] = fbw
    da.attrs['theta'] = t
    da.attrs['shift'] = s
    da.name = 'binned_noise_variance'
    output_file = '/scratch/pkittiwi/fg1p/binned_noise_variance/' \
                  'binned_noise_variance_bin{:.2f}MHz_fbw{:.2f}MHz_' \
                  'theta{:.1f}_shift{:d}.nc'.format(bw, fbw, t, s)
    da.to_netcdf(output_file)
    print(
        'Finish {:s}. Spent {:.5f} sec on this file'
        .format(output_file, (datetime.now() - start_time).total_seconds())
    )

client.close()
cluster.close()
def run_simulation(cfg, client_ip=None):
    """
    main function
    :param cfg: an instance of TransmissionConfig
    """
    tic = time.time()
    logger = logging.getLogger(__name__)

    if cfg.options['run_parallel']:
        logger.info('parallel MC run on.......')
        client = Client(client_ip)
        lines = []
        for event in cfg.events:
            scenario = Scenario(event=event, cfg=cfg)
            for _, line in scenario.lines.items():
                line = client.submit(line.compute_damage_per_line, cfg=cfg)
                lines.append(line)
        client.gather(lines)
        client.close()
    else:
        logger.info('serial MC run on.......')
        # create transmission network with wind event
        lines = []
        for event in cfg.events:
            scenario = Scenario(event=event, cfg=cfg)
            damage_prob_max = pd.DataFrame(None, columns=cfg.damage_states)
            for _, line in scenario.lines.items():
                _ = line.compute_damage_per_line(cfg=cfg)
                df = pd.DataFrame(None, columns=cfg.damage_states)
                for ds in cfg.damage_states:
                    try:
                        tmp = line.damage_prob[ds].max(axis=0)
                    except KeyError:
                        pass
                    else:
                        df[ds] = tmp
                damage_prob_max = damage_prob_max.append(df)
            if not damage_prob_max.empty:
                damage_prob_max.index.name = 'name'
                damage_prob_max.to_csv(scenario.file_output)
                logger.info(f'{scenario.file_output} is saved')
            if cfg.line_interaction:
                _ = scenario.compute_damage_probability_line_interaction()

    logger.info(f'MC simulation took {time.time() - tic} seconds')
    return lines