def mortgage_gquant_run(run_params_dict):
    '''Using dataframe-flow, runs the tasks/workflow specified in the
    run_params_dict. Expected run_params_dict ex:

        run_params_dict = {
            'replace_spec': replace_spec,
            'task_spec_list': gquant_task_spec_list,
            'out_list': out_list
        }

    gquant_task_spec_list - Mortgage ETL workflow list of task-specs. Refer
        to module mortgage_common function mortgage_etl_workflow_def.

    out_list - Expected to specify one output which should be the final
        dataframe produced by the mortgage ETL workflow.

    :param run_params_dict: Dictionary with parameters and gquant task list
        to run the mortgage workflow.
    '''
    from gquant.dataframe_flow import TaskGraph

    task_spec_list = run_params_dict['task_spec_list']
    out_list = run_params_dict['out_list']
    replace_spec = run_params_dict['replace_spec']

    task_graph = TaskGraph(task_spec_list)
    (final_perf_acq_df, ) = task_graph.run(out_list, replace_spec)

    return final_perf_acq_df
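# Illustrative usage sketch for mortgage_gquant_run (hypothetical values;
# mortgage_etl_workflow_def is in module mortgage_common per the docstring
# above, and the replace_spec/out_list contents here are assumptions):
#
#     task_spec_list = mortgage_etl_workflow_def()
#     run_params_dict = {
#         'replace_spec': replace_spec,        # per-task conf overrides
#         'task_spec_list': task_spec_list,    # mortgage ETL task-specs
#         'out_list': out_list                 # single final-dataframe output
#     }
#     final_perf_acq_df = mortgage_gquant_run(run_params_dict)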
def test_columns_type_mismatch(self):
    numgen_spec = copy.deepcopy(self.numgen_spec)
    numproc_spec = copy.deepcopy(self.numproc_spec)
    numgen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnotnums'}
    tspec_list = [numgen_spec, numproc_spec]
    tgraph_invalid = TaskGraph(tspec_list)

    with self.assertRaises(LookupError) as cm:
        tgraph_invalid.run(['numproc.sum'])
    outerr_msg = '{}'.format(cm.exception)

    errmsg = 'Task "numproc" column "list" expected type "numbers" got '\
        'type "notnumbers" instead.'
    self.assertIn(errmsg, outerr_msg)
def test_columns_name_mismatch(self):
    numgen_spec = copy.deepcopy(self.numgen_spec)
    numproc_spec = copy.deepcopy(self.numproc_spec)
    numgen_spec[TaskSpecSchema.conf] = {'columns_option': 'rangenums'}
    tspec_list = [numgen_spec, numproc_spec]
    tgraph_invalid = TaskGraph(tspec_list)

    with self.assertRaises(LookupError) as cm:
        tgraph_invalid.run(['numproc.sum'])
    outerr_msg = '{}'.format(cm.exception)

    errmsg = 'Task "numproc" missing required column "list" from '\
        '"numgen.numlist".'
    self.assertIn(errmsg, outerr_msg)
def test_ports_connection_subclass_type_mismatch(self):
    numgen_spec = copy.deepcopy(self.numgen_spec)
    numproc_spec = copy.deepcopy(self.numproc_spec)
    numgen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}
    numproc_spec[TaskSpecSchema.conf] = {'port_type': MyList}
    tspec_list = [numgen_spec, numproc_spec]
    tgraph_invalid = TaskGraph(tspec_list)

    with self.assertRaises(TypeError) as cm:
        tgraph_invalid.run(['numproc.sum'])
    outerr_msg = '{}'.format(cm.exception)

    errmsg = 'Connected nodes do not have matching port types. '\
        'Fix port types.'
    self.assertIn(errmsg, outerr_msg)
def test_ports_output_type_mismatch(self):
    numgen_spec = copy.deepcopy(self.numgen_spec)
    numproc_spec = copy.deepcopy(self.numproc_spec)
    numgen_spec[TaskSpecSchema.conf] = {
        'columns_option': 'listnums',
        'out_type': 'rangenums'
    }
    tspec_list = [numgen_spec, numproc_spec]
    tgraph_invalid = TaskGraph(tspec_list)

    with self.assertRaises(TypeError) as cm:
        tgraph_invalid.run(['numproc.sum'])
    outerr_msg = '{}'.format(cm.exception)

    errmsg = 'Node "numgen" output port "numlist" produced wrong type '\
        '"<class \'range\'>". Expected type "[<class \'list\'>]"'
    self.assertEqual(errmsg, outerr_msg)
def test_columns_and_ports_types_match(self):
    numgen_spec = copy.deepcopy(self.numgen_spec)
    numproc_spec = copy.deepcopy(self.numproc_spec)
    numgen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}
    tspec_list = [numgen_spec, numproc_spec]
    tgraph_valid = TaskGraph(tspec_list)
    sumout, = tgraph_valid.run(['numproc.sum'])
    self.assertEqual(sumout, 45)
def test_ports_connection_subclass_type_match(self):
    numgen_spec = copy.deepcopy(self.numgen_spec)
    numproc_spec = copy.deepcopy(self.numproc_spec)
    numgen_spec[TaskSpecSchema.conf] = {
        'port_type': MyList,
        'columns_option': 'mylistnums'
    }
    numproc_spec[TaskSpecSchema.conf] = {'port_type': list}
    tspec_list = [numgen_spec, numproc_spec]
    tgraph_valid = TaskGraph(tspec_list)
    sumout, = tgraph_valid.run(['numproc.sum'])
    self.assertEqual(sumout, 45)
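# Note: the tests above rely on numgen/numproc fixture specs created in this
# suite's setUp (not part of this excerpt). A minimal sketch of what such
# specs plausibly look like, with node-type names assumed for illustration:
#
#     self.numgen_spec = {
#         TaskSpecSchema.task_id: 'numgen',
#         TaskSpecSchema.node_type: 'NumberGenerator',  # assumed name
#         TaskSpecSchema.conf: {},
#         TaskSpecSchema.inputs: []
#     }
#     self.numproc_spec = {
#         TaskSpecSchema.task_id: 'numproc',
#         TaskSpecSchema.node_type: 'NumberProcessor',  # assumed name
#         TaskSpecSchema.conf: {},
#         TaskSpecSchema.inputs: {'inlist': 'numgen.numlist'}
#     }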
class TestTaskGraphAPI(unittest.TestCase):

    def setUp(self):
        import gc  # python garbage collector
        import cudf

        # warmup
        s = cudf.Series([1, 2, 3, None, 4], nan_as_null=False)
        del s
        gc.collect()

        os.environ['GQUANT_PLUGIN_MODULE'] = 'tests.unit.custom_port_nodes'

        points_task_spec = {
            TaskSpecSchema.task_id: 'points_task',
            TaskSpecSchema.node_type: 'PointNode',
            TaskSpecSchema.conf: {'npts': 1000},
            TaskSpecSchema.inputs: []
        }

        distance_task_spec = {
            TaskSpecSchema.task_id: 'distance_by_cudf',
            TaskSpecSchema.node_type: 'DistanceNode',
            TaskSpecSchema.conf: {},
            TaskSpecSchema.inputs: {
                'points_df_in': 'points_task.points_df_out'
            }
        }

        tspec_list = [points_task_spec, distance_task_spec]

        self.tgraph = TaskGraph(tspec_list)

        # Create a temporary directory
        self._test_dir = tempfile.mkdtemp()
        os.environ['GQUANT_CACHE_DIR'] = os.path.join(self._test_dir,
                                                      '.cache')

    def tearDown(self):
        global DEFAULT_MODULE
        os.environ['GQUANT_PLUGIN_MODULE'] = DEFAULT_MODULE
        os.environ['GQUANT_CACHE_DIR'] = Node.cache_dir
        shutil.rmtree(self._test_dir)

    @ordered
    def test_viz_graph(self):
        '''Test taskgraph to networkx graph conversion for graph
        visualization.
        '''
        nx_graph = self.tgraph.viz_graph(show_ports=True)
        nx_nodes = ['points_task', 'points_task.points_df_out',
                    'points_task.points_ddf_out',
                    'distance_by_cudf',
                    'distance_by_cudf.distance_df',
                    'distance_by_cudf.distance_abs_df']
        nx_edges = [('points_task', 'points_task.points_df_out'),
                    ('points_task', 'points_task.points_ddf_out'),
                    ('points_task.points_df_out', 'distance_by_cudf'),
                    ('distance_by_cudf', 'distance_by_cudf.distance_df'),
                    ('distance_by_cudf', 'distance_by_cudf.distance_abs_df')]
        self.assertEqual(list(nx_graph.nodes), nx_nodes)
        self.assertEqual(list(nx_graph.edges), nx_edges)

    @ordered
    def test_build(self):
        '''Test build of a taskgraph and that all inputs and outputs are set
        for the tasks within a taskgraph.
        '''
        self.tgraph.build()

        points_node = self.tgraph['points_task']
        distance_node = self.tgraph['distance_by_cudf']

        onode_info = {
            'to_node': distance_node,
            'to_port': 'points_df_in',
            'from_port': 'points_df_out'
        }
        self.assertIn(onode_info, points_node.outputs)

        onode_cols = {'points_df_out': {'x': 'float64', 'y': 'float64'},
                      'points_ddf_out': {'x': 'float64', 'y': 'float64'}}
        self.assertEqual(onode_cols, points_node.meta_setup().outports)

        inode_info = {
            'from_node': points_node,
            'from_port': 'points_df_out',
            'to_port': 'points_df_in'
        }
        self.assertIn(inode_info, distance_node.inputs)

        inode_in_cols = {
            'points_df_in': {
                'x': 'float64',
                'y': 'float64'
            }
        }
        self.assertEqual(inode_in_cols, distance_node.get_input_meta())

        inode_out_cols = {'distance_df': {'distance_cudf': 'float64',
                                          'x': 'float64',
                                          'y': 'float64'},
                          'distance_abs_df': {'distance_abs_cudf': 'float64',
                                              'x': 'float64',
                                              'y': 'float64'}}
        self.assertEqual(inode_out_cols, distance_node.meta_setup().outports)

    @ordered
    def test_run(self):
        '''Test that a taskgraph can run successfully.
        '''
        outlist = ['distance_by_cudf.distance_df']
        # Using numpy random seed to get repeatable and deterministic results.
        # For seed 2335 should get something around 761.062831178.
        replace_spec = {
            'points_task': {
                TaskSpecSchema.conf: {
                    'npts': 1000,
                    'nseed': 2335
                }
            }
        }
        (dist_df_w_cudf, ) = self.tgraph.run(
            outputs=outlist, replace=replace_spec)

        dist_sum = dist_df_w_cudf['distance_cudf'].sum()
        # self.assertAlmostEqual(dist_sum, 0.0, places, msg, delta)
        self.assertAlmostEqual(dist_sum, 761.062831178)  # match to 7 places

    @ordered
    def test_save(self):
        '''Test that a taskgraph can be saved to a yaml file.
        '''
        workflow_file = os.path.join(self._test_dir,
                                     'test_save_taskgraph.yaml')
        self.tgraph.save_taskgraph(workflow_file)

        with open(workflow_file) as wf:
            workflow_str = wf.read()

        # verify the workflow contents are the same as expected; the diff is
        # an empty list if they match.
        global TASKGRAPH_YAML
        cdiff = list(context_diff(TASKGRAPH_YAML, workflow_str))
        cdiff_empty = cdiff == []

        err_msg = 'Taskgraph yaml contents do not match expected results.\n'\
            'SHOULD HAVE SAVED:\n\n'\
            '{wyaml}\n\n'\
            'INSTEAD FILE CONTAINS:\n\n'\
            '{fcont}\n\n'\
            'DIFF:\n\n'\
            '{diff}'.format(wyaml=TASKGRAPH_YAML, fcont=workflow_str,
                            diff=''.join(cdiff))

        self.assertTrue(cdiff_empty, err_msg)

    @ordered
    def test_load(self):
        '''Test that a taskgraph can be loaded from a yaml file.
        '''
        workflow_file = os.path.join(self._test_dir,
                                     'test_load_taskgraph.yaml')

        global TASKGRAPH_YAML
        with open(workflow_file, 'w') as wf:
            wf.write(TASKGRAPH_YAML)

        tspec_list = [task._task_spec for task in self.tgraph]

        tgraph = TaskGraph.load_taskgraph(workflow_file)
        all_tasks_exist = True
        for task in tgraph:
            if task._task_spec not in tspec_list:
                all_tasks_exist = False
                break

        with StringIO() as yf:
            yaml.dump(tspec_list, yf,
                      default_flow_style=False, sort_keys=False)
            yf.seek(0)

            err_msg = 'Load taskgraph failed. Missing expected task items.\n'\
                'EXPECTED TASKGRAPH YAML:\n\n'\
                '{wyaml}\n\n'\
                'GOT TASKS FORMATTED AS YAML:\n\n'\
                '{tlist}\n\n'.format(wyaml=TASKGRAPH_YAML, tlist=yf.read())

            self.assertTrue(all_tasks_exist, err_msg)

    @ordered
    def test_save_load_cache(self):
        '''Test caching of task outputs within a taskgraph.

        1. Save points_task output to cache when running the taskgraph.
        2. Load points_task df from cache when running the taskgraph.
        '''
        replace_spec = {'points_task': {TaskSpecSchema.save: True}}
        outlist = ['distance_by_cudf.distance_df']

        with warnings.catch_warnings():
            # ignore UserWarning: Using CPU via Pandas to write HDF dataset
            warnings.filterwarnings(
                'ignore',
                message='Using CPU via Pandas to write HDF dataset',
                category=UserWarning)
            # ignore RuntimeWarning: numpy.ufunc size changed
            warnings.filterwarnings('ignore', category=RuntimeWarning,
                                    message='numpy.ufunc size changed')
            (_, ) = self.tgraph.run(outputs=outlist, replace=replace_spec)

        cache_dir = os.path.join(self._test_dir, '.cache',
                                 'points_task.hdf5')
        self.assertTrue(os.path.exists(cache_dir))

        replace_spec = {'points_task': {TaskSpecSchema.load: True}}

        with warnings.catch_warnings():
            # ignore UserWarning: Using CPU via Pandas to read HDF dataset
            warnings.filterwarnings(
                'ignore',
                message='Using CPU via Pandas to read HDF dataset',
                category=UserWarning)
            (_, ) = self.tgraph.run(outputs=outlist, replace=replace_spec)
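# Conventional unittest entry point; assumed here since the original test
# module's runner is not part of this excerpt:
if __name__ == '__main__':
    unittest.main()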
def main():
    _basedir = os.path.dirname(__file__)

    # mortgage_data_path = '/datasets/rapids_data/mortgage'
    mortgage_data_path = os.path.join(_basedir, 'mortgage_data')

    # Using some default csv files for testing.
    # csvfile_names = os.path.join(mortgage_data_path, 'names.csv')
    # acq_data_path = os.path.join(mortgage_data_path, 'acq')
    # perf_data_path = os.path.join(mortgage_data_path, 'perf')
    # csvfile_acqdata = os.path.join(acq_data_path, 'Acquisition_2000Q1.txt')
    # csvfile_perfdata = \
    #     os.path.join(perf_data_path, 'Performance_2000Q1.txt_0')
    # mortgage_etl_workflow_def(
    #     csvfile_names, csvfile_acqdata, csvfile_perfdata)

    gquant_task_spec_list = mortgage_etl_workflow_def()

    start_year = 2000
    end_year = 2001  # end_year is inclusive
    # end_year = 2016  # end_year is inclusive
    # part_count = 16  # the number of data files to train against
    part_count = 12  # the number of data files to train against
    # part_count = 4  # the number of data files to train against

    mortgage_run_params_dict_list = generate_mortgage_gquant_run_params_list(
        mortgage_data_path, start_year, end_year, part_count,
        gquant_task_spec_list)

    _basedir = os.path.dirname(__file__)
    mortgage_lib_module = os.path.join(_basedir, 'mortgage_gquant_plugins.py')

    mortgage_workflow_runner_task = {
        TaskSpecSchema.task_id:
            MortgageTaskNames.mortgage_workflow_runner_task_name,
        TaskSpecSchema.node_type: 'MortgageWorkflowRunner',
        TaskSpecSchema.conf: {
            'mortgage_run_params_dict_list': mortgage_run_params_dict_list
        },
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    # Can be multi-gpu. Set ngpus > 1. This is different from dask-xgboost,
    # which is distributed multi-gpu, i.e. dask-xgboost could distribute on
    # one node or multiple nodes. In distributed mode the dmatrix is
    # distributed.
    ngpus = 1
    xgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': ngpus,
        # 'distributed_dask': True,
        'loss': 'ls',
        # 'objective': 'gpu:reg:linear',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    xgb_trainer_task = {
        TaskSpecSchema.task_id: MortgageTaskNames.xgb_trainer_task_name,
        TaskSpecSchema.node_type: 'XgbMortgageTrainer',
        TaskSpecSchema.conf: {
            'delete_dataframes': False,
            'xgb_gpu_params': xgb_gpu_params
        },
        TaskSpecSchema.inputs:
            [MortgageTaskNames.mortgage_workflow_runner_task_name],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    task_spec_list = [mortgage_workflow_runner_task, xgb_trainer_task]

    task_graph = TaskGraph(task_spec_list)
    # out_list = [MortgageTaskNames.mortgage_workflow_runner_task_name]
    # ((mortgage_feat_df_pandas, delinq_df_pandas),) = \
    #     task_graph.run(out_list)

    out_list = [MortgageTaskNames.xgb_trainer_task_name]
    (bst, ) = task_graph.run(out_list)

    print('XGBOOST BOOSTER:\n', bst)
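# Assumed script entry point for the single-GPU example above (not shown in
# the original excerpt):
if __name__ == '__main__':
    main()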
def main():
    memory_limit = 128e9
    threads_per_worker = 4
    cluster = LocalCUDACluster(memory_limit=memory_limit,
                               threads_per_worker=threads_per_worker)
    client = Client(cluster)
    sched_info = client.scheduler_info()

    print('CLIENT: {}'.format(client))
    print('SCHEDULER INFO:\n{}'.format(json.dumps(sched_info, indent=2)))

    # Importing here in case RMM is used later on. Must start client prior
    # to importing cudf stuff if using RMM.
    from gquant.dataframe_flow import (TaskSpecSchema, TaskGraph)

    # workers_names = \
    #     [iw['name'] for iw in client.scheduler_info()['workers'].values()]
    # nworkers = len(workers_names)

    _basedir = os.path.dirname(__file__)
    # mortgage_data_path = '/datasets/rapids_data/mortgage'
    mortgage_data_path = os.path.join(_basedir, 'mortgage_data')

    # Using some default csv files for testing.
    # csvfile_names = os.path.join(mortgage_data_path, 'names.csv')
    # acq_data_path = os.path.join(mortgage_data_path, 'acq')
    # perf_data_path = os.path.join(mortgage_data_path, 'perf')
    # csvfile_acqdata = os.path.join(acq_data_path, 'Acquisition_2000Q1.txt')
    # csvfile_perfdata = \
    #     os.path.join(perf_data_path, 'Performance_2000Q1.txt_0')
    # mortgage_etl_workflow_def(
    #     csvfile_names, csvfile_acqdata, csvfile_perfdata)

    gquant_task_spec_list = mortgage_etl_workflow_def()

    start_year = 2000
    end_year = 2001  # end_year is inclusive
    # end_year = 2016  # end_year is inclusive
    # part_count = 16  # the number of data files to train against

    # create_dmatrix_serially - When False, and there is not enough host RAM
    # on the same node, creating the dmatrix becomes a race condition. Make
    # sure there is enough host RAM, otherwise set this to True.
    # create_dmatrix_serially = False
    # able to do 18 with create_dmatrix_serially set to True
    part_count = 18  # the number of data files to train against
    create_dmatrix_serially = True
    # part_count = 4  # the number of data files to train against

    # Use RAPIDS Memory Manager. Seems to work fine without it.
    use_rmm = False

    # Clean up intermediate dataframes in the xgboost training task.
    delete_dataframes = True

    mortgage_run_params_dict_list = generate_mortgage_gquant_run_params_list(
        mortgage_data_path, start_year, end_year, part_count,
        gquant_task_spec_list)

    _basedir = os.path.dirname(__file__)
    mortgage_lib_module = os.path.join(_basedir, 'mortgage_gquant_plugins.py')

    filter_dask_logger = False

    mortgage_workflow_runner_task = {
        TaskSpecSchema.task_id:
            MortgageTaskNames.dask_mortgage_workflow_runner_task_name,
        TaskSpecSchema.node_type: 'DaskMortgageWorkflowRunner',
        TaskSpecSchema.conf: {
            'mortgage_run_params_dict_list': mortgage_run_params_dict_list,
            'client': client,
            'use_rmm': use_rmm,
            'filter_dask_logger': filter_dask_logger,
        },
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    dxgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': 1,
        'distributed_dask': True,
        'loss': 'ls',
        # 'objective': 'gpu:reg:linear',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    dxgb_trainer_task = {
        TaskSpecSchema.task_id: MortgageTaskNames.dask_xgb_trainer_task_name,
        TaskSpecSchema.node_type: 'DaskXgbMortgageTrainer',
        TaskSpecSchema.conf: {
            'create_dmatrix_serially': create_dmatrix_serially,
            'delete_dataframes': delete_dataframes,
            'dxgb_gpu_params': dxgb_gpu_params,
            'client': client,
            'filter_dask_logger': filter_dask_logger
        },
        TaskSpecSchema.inputs:
            [MortgageTaskNames.dask_mortgage_workflow_runner_task_name],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    task_spec_list = [mortgage_workflow_runner_task, dxgb_trainer_task]

    out_list = [MortgageTaskNames.dask_xgb_trainer_task_name]

    task_graph = TaskGraph(task_spec_list)
    (bst, ) = task_graph.run(out_list)

    print('XGBOOST BOOSTER:\n', bst)
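# Assumed script entry point for the dask multi-GPU example above (not shown
# in the original excerpt):
if __name__ == '__main__':
    main()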