from dask.distributed import Client


class MyDaskClient:
    """Thin wrapper around dask.distributed.Client for key/worker lookups."""

    def __init__(self, address=None):
        self._client = Client(address)

    def _who_has(self, key):
        # who_has() maps each key to the list of workers holding its data
        who_has_dict = self._client.who_has()
        if key in who_has_dict:
            return {"key": key, "worker": who_has_dict[key]}
        return None

    def get_status(self, key):
        # First check whether a worker is currently processing the task
        processing_dict = self._client.processing()
        for worker, keys in processing_dict.items():
            if key in keys:
                return {"status": "running", "worker": worker}
        # Then look for the task in the completed task stream
        for task in reversed(self._client.get_task_stream()):
            if task["key"] == key:
                return {"status": "done", "dask_status": task["status"]}
        return None
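# A minimal usage sketch for the wrapper above (not part of the original code):
# it assumes a scheduler is reachable at the given address; the address and the
# submitted task are placeholders.
dask_client = MyDaskClient("tcp://127.0.0.1:8786")    # hypothetical scheduler address
future = dask_client._client.submit(sum, [1, 2, 3])   # hypothetical task
print(dask_client.get_status(future.key))   # "running" while a worker processes it, "done" once finished
print(dask_client._who_has(future.key))     # worker(s) holding the finished result, if any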
def main():
    # print('XGBOOST_BUILD_DOC is ' + os.environ['XGBOOST_BUILD_DOC'])
    parser = argparse.ArgumentParser("rapidssample")
    parser.add_argument("--data_dir", type=str, help="location of data")
    parser.add_argument("--num_gpu", type=int, help="Number of GPUs to use", default=1)
    parser.add_argument("--part_count", type=int, help="Number of data files to train against", default=2)
    parser.add_argument("--end_year", type=int, help="Year to end the data load", default=2000)
    parser.add_argument("--cpu_predictor", type=str, help="Flag to use CPU for prediction", default='False')
    parser.add_argument('-f', type=str, default='')  # added for notebook execution scenarios
    args = parser.parse_args()

    data_dir = args.data_dir
    num_gpu = args.num_gpu
    part_count = args.part_count
    end_year = args.end_year
    cpu_predictor = args.cpu_predictor.lower() in ('yes', 'true', 't', 'y', '1')

    if cpu_predictor:
        print('Training with CPUs requires num_gpu = 1')
        num_gpu = 1

    print('data_dir = {0}'.format(data_dir))
    print('num_gpu = {0}'.format(num_gpu))
    print('part_count = {0}'.format(part_count))
    # part_count = part_count + 1  # adding one because the usage below is not inclusive
    print('end_year = {0}'.format(end_year))
    print('cpu_predictor = {0}'.format(cpu_predictor))

    # Determine this node's IP address for the local CUDA cluster
    import subprocess
    cmd = "hostname --all-ip-addresses"
    process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    IPADDR = str(output.decode()).split()[0]

    cluster = LocalCUDACluster(ip=IPADDR, n_workers=num_gpu)
    client = Client(cluster)
    print(client.ncores())

    # To download data for this notebook, visit
    # https://rapidsai.github.io/demos/datasets/mortgage-data and update the following paths accordingly.
    acq_data_path = "{0}/acq".format(data_dir)          # "/rapids/data/mortgage/acq"
    perf_data_path = "{0}/perf".format(data_dir)        # "/rapids/data/mortgage/perf"
    col_names_path = "{0}/names.csv".format(data_dir)   # "/rapids/data/mortgage/names.csv"
    start_year = 2000
    # end_year = 2000  # end_year is inclusive -- converted to parameter
    # part_count = 2   # the number of data files to train against -- converted to parameter

    # Initialize the RMM memory pool on every worker before the ETL
    client.run(initialize_rmm_pool)
    print(client.ncores())

    # NOTE: The ETL calculates additional features which are then dropped before creating the
    # XGBoost DMatrix. This can be optimized to avoid calculating the dropped features.
print("Reading ...") t1 = datetime.datetime.now() gpu_dfs = [] gpu_time = 0 quarter = 1 year = start_year count = 0 while year <= end_year: for file in glob(os.path.join(perf_data_path + "/Performance_" + str(year) + "Q" + str(quarter) + "*")): if count < part_count: gpu_dfs.append(process_quarter_gpu(client, col_names_path, acq_data_path, year=year, quarter=quarter, perf_file=file)) count += 1 print('file: {0}'.format(file)) print('count: {0}'.format(count)) quarter += 1 if quarter == 5: year += 1 quarter = 1 wait(gpu_dfs) t2 = datetime.datetime.now() print("Reading time ...") print(t2-t1) print('len(gpu_dfs) is {0}'.format(len(gpu_dfs))) client.run(cudf._gdf.rmm_finalize) client.run(initialize_rmm_no_pool) client print(client.ncores()) dxgb_gpu_params = { 'nround': 100, 'max_depth': 8, 'max_leaves': 2**8, 'alpha': 0.9, 'eta': 0.1, 'gamma': 0.1, 'learning_rate': 0.1, 'subsample': 1, 'reg_lambda': 1, 'scale_pos_weight': 2, 'min_child_weight': 30, 'tree_method': 'gpu_hist', 'n_gpus': 1, 'distributed_dask': True, 'loss': 'ls', 'objective': 'gpu:reg:linear', 'max_features': 'auto', 'criterion': 'friedman_mse', 'grow_policy': 'lossguide', 'verbose': True } if cpu_predictor: print('Training using CPUs') dxgb_gpu_params['predictor'] = 'cpu_predictor' dxgb_gpu_params['tree_method'] = 'hist' dxgb_gpu_params['objective'] = 'reg:linear' else: print('Training using GPUs') print('Training parameters are {0}'.format(dxgb_gpu_params)) gpu_dfs = [delayed(DataFrame.from_arrow)(gpu_df) for gpu_df in gpu_dfs[:part_count]] gpu_dfs = [gpu_df for gpu_df in gpu_dfs] wait(gpu_dfs) tmp_map = [(gpu_df, list(client.who_has(gpu_df).values())[0]) for gpu_df in gpu_dfs] new_map = {} for key, value in tmp_map: if value not in new_map: new_map[value] = [key] else: new_map[value].append(key) del(tmp_map) gpu_dfs = [] for list_delayed in new_map.values(): gpu_dfs.append(delayed(cudf.concat)(list_delayed)) del(new_map) gpu_dfs = [(gpu_df[['delinquency_12']], gpu_df[delayed(list)(gpu_df.columns.difference(['delinquency_12']))]) for gpu_df in gpu_dfs] gpu_dfs = [(gpu_df[0].persist(), gpu_df[1].persist()) for gpu_df in gpu_dfs] gpu_dfs = [dask.delayed(xgb.DMatrix)(gpu_df[1], gpu_df[0]) for gpu_df in gpu_dfs] gpu_dfs = [gpu_df.persist() for gpu_df in gpu_dfs] gc.collect() wait(gpu_dfs) labels = None t1 = datetime.datetime.now() bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels, num_boost_round=dxgb_gpu_params['nround']) t2 = datetime.datetime.now() print("Training time ...") print(t2-t1) print('str(bst) is {0}'.format(str(bst))) print('Exiting script')
# Clear any per-worker cache so the first pass is a genuinely "cold" run
c.gather(c.map(lambda x: get_worker().array_cache.clear(), workers, workers=workers))

# start
c.get_task_stream()
# print(get_mll_hist(chunks[0]))

# Cold run: let the scheduler place the chunks wherever it likes
t0 = time.time()
futures = c.map(get_mll_hist, chunks)
results = c.gather(futures)
t1 = time.time()
print(len(results), "results")
print(t1 - t0)

task_stream = c.get_task_stream(start=t0, stop=t1)
print("task_stream length", len(task_stream))
pd.DataFrame(task_stream).drop("type", axis=1).to_json("data/dask_cold_{}.json".format(trial))

# Record which worker ended up holding each future's result
d = c.who_has(futures)
# chunk_workers = list(zip(chunks, [d[f.key] for f in futures]))
workers = [d[f.key][0] for f in futures]
print(workers)

c.get_task_stream()

# Warm run: pin each chunk to the worker that already processed it
t0 = time.time()
# pure=False to avoid caching of the *results*
futures = [c.submit(get_mll_hist, chunk, pure=False, workers=worker, allow_other_workers=True)
           for chunk, worker in zip(chunks, workers)]
# futures = c.map(get_mll_hist, chunks, workers=workers, pure=False)
results = c.gather(futures)
t1 = time.time()
print(len(results), "results")
print(t1 - t0)

task_stream = c.get_task_stream(start=t0, stop=t1)
print("task_stream length", len(task_stream))
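# Hedged follow-up sketch: the fragment above captures the warm-run task stream but stops
# before persisting it. Mirroring the cold-run line, one might write it out as below; the
# "dask_warm" filename is an assumption, not part of the original code.
pd.DataFrame(task_stream).drop("type", axis=1).to_json("data/dask_warm_{}.json".format(trial))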