def mortgage_gquant_run(run_params_dict):
    '''Run the mortgage ETL workflow via dataframe-flow and return its result.

    Expected keys in ``run_params_dict``:
        replace_spec   - spec substitutions applied at run time.
        task_spec_list - mortgage ETL workflow list of task-specs. Refer to
            module mortgage_common function mortgage_etl_workflow_def.
        out_list       - expected to specify one output which should be the
            final dataframe produced by the mortgage ETL workflow.

    :param run_params_dict: Dictionary with parameters and gquant task list
        to run mortgage workflow.
    '''
    from gquant.dataframe_flow import TaskGraph

    specs = run_params_dict['task_spec_list']
    outputs = run_params_dict['out_list']
    replacements = run_params_dict['replace_spec']

    graph = TaskGraph(specs)
    # Exactly one output is expected: the final perf/acq dataframe.
    final_perf_acq_df, = graph.run(outputs, replacements)
    return final_perf_acq_df
def setUp(self):
    import gc
    import cudf

    # Warm up cudf (one-time GPU/driver init) with a throwaway series,
    # then reclaim it right away.
    warmup = cudf.Series([1, 2, 3, None, 4], nan_as_null=False)
    del warmup
    gc.collect()

    os.environ['GQUANT_PLUGIN_MODULE'] = 'tests.unit.custom_port_nodes'

    points_spec = {
        TaskSpecSchema.task_id: 'points_task',
        TaskSpecSchema.node_type: 'PointNode',
        TaskSpecSchema.conf: {'npts': 1000},
        TaskSpecSchema.inputs: []
    }

    distance_spec = {
        TaskSpecSchema.task_id: 'distance_by_cudf',
        TaskSpecSchema.node_type: 'DistanceNode',
        TaskSpecSchema.conf: {},
        TaskSpecSchema.inputs: {
            'points_df_in': 'points_task.points_df_out'
        }
    }

    # Two-node graph: generate points, then compute distances from them.
    self.tgraph = TaskGraph([points_spec, distance_spec])

    # Scratch directory for per-test cache artifacts.
    self._test_dir = tempfile.mkdtemp()
    os.environ['GQUANT_CACHE_DIR'] = os.path.join(self._test_dir, '.cache')
def post(self):
    # The request body is the JSON-serialized task-graph specification.
    spec = self.get_json_body()
    graph = TaskGraph(spec)
    payload = get_nodes(graph)
    self.finish(json.dumps(payload))
def test_columns_and_ports_types_match(self):
    '''Graph runs cleanly when column types satisfy the port spec.'''
    gen_spec = copy.deepcopy(self.numgen_spec)
    proc_spec = copy.deepcopy(self.numproc_spec)
    gen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}

    graph = TaskGraph([gen_spec, proc_spec])
    (total,) = graph.run(['numproc.sum'])
    self.assertEqual(total, 45)
def test_ports_connection_subclass_type_match(self):
    '''A subclass output port type is accepted by a base-class input port.'''
    gen_spec = copy.deepcopy(self.numgen_spec)
    proc_spec = copy.deepcopy(self.numproc_spec)
    # Producer emits MyList (a list subclass); consumer accepts plain list.
    gen_spec[TaskSpecSchema.conf] = {
        'port_type': MyList,
        'columns_option': 'mylistnums'
    }
    proc_spec[TaskSpecSchema.conf] = {'port_type': list}

    graph = TaskGraph([gen_spec, proc_spec])
    (total,) = graph.run(['numproc.sum'])
    self.assertEqual(total, 45)
def test_columns_type_mismatch(self):
    '''Running with a mismatched column type raises LookupError.'''
    gen_spec = copy.deepcopy(self.numgen_spec)
    proc_spec = copy.deepcopy(self.numproc_spec)
    gen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnotnums'}

    graph = TaskGraph([gen_spec, proc_spec])
    with self.assertRaises(LookupError) as cm:
        graph.run(['numproc.sum'])

    expected = 'Task "numproc" column "list" expected type "numbers" got '\
        'type "notnumbers" instead.'
    self.assertIn(expected, '{}'.format(cm.exception))
def test_columns_name_mismatch(self):
    '''Running with a missing required column raises LookupError.'''
    gen_spec = copy.deepcopy(self.numgen_spec)
    proc_spec = copy.deepcopy(self.numproc_spec)
    gen_spec[TaskSpecSchema.conf] = {'columns_option': 'rangenums'}

    graph = TaskGraph([gen_spec, proc_spec])
    with self.assertRaises(LookupError) as cm:
        graph.run(['numproc.sum'])

    expected = 'Task "numproc" missing required column "list" from '\
        '"numgen.numlist".'
    self.assertIn(expected, '{}'.format(cm.exception))
def test_ports_connection_subclass_type_mismatch(self):
    '''A base-class output into a subclass input port raises TypeError.'''
    gen_spec = copy.deepcopy(self.numgen_spec)
    proc_spec = copy.deepcopy(self.numproc_spec)
    # Producer emits plain list; consumer demands the MyList subclass.
    gen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}
    proc_spec[TaskSpecSchema.conf] = {'port_type': MyList}

    graph = TaskGraph([gen_spec, proc_spec])
    with self.assertRaises(TypeError) as cm:
        graph.run(['numproc.sum'])

    expected = 'Connected nodes do not have matching port types. '\
        'Fix port types.'
    self.assertIn(expected, '{}'.format(cm.exception))
def test_ports_output_type_mismatch(self):
    '''A node producing a type other than its declared port type raises
    TypeError with a precise message.'''
    gen_spec = copy.deepcopy(self.numgen_spec)
    proc_spec = copy.deepcopy(self.numproc_spec)
    # Declared output is a list but 'out_type' makes the node emit a range.
    gen_spec[TaskSpecSchema.conf] = {
        'columns_option': 'listnums',
        'out_type': 'rangenums'
    }

    graph = TaskGraph([gen_spec, proc_spec])
    with self.assertRaises(TypeError) as cm:
        graph.run(['numproc.sum'])

    expected = 'Node "numgen" output port "numlist" produced wrong type '\
        '"<class \'range\'>". Expected type "[<class \'list\'>]"'
    # Exact equality: the full exception text is part of the contract here.
    self.assertEqual(expected, '{}'.format(cm.exception))
def test_save_workflow(self):
    '''Saving a task-graph to yaml reproduces the expected workflow text.'''
    from gquant.dataframe_flow import TaskGraph

    graph = TaskGraph(self._task_list)
    yaml_path = os.path.join(self._test_dir, 'test_save_workflow.yaml')
    graph.save_taskgraph(yaml_path)

    with open(yaml_path) as fh:
        saved_text = fh.read()

    # context_diff yields nothing when the two texts are identical.
    diff_lines = list(context_diff(WORKFLOW_YAML, saved_text))
    failure_msg = 'Workflow yaml contents do not match expected results.\n'\
        'SHOULD HAVE SAVED:\n\n'\
        '{wyaml}\n\n'\
        'INSTEAD FILE CONTAINS:\n\n'\
        '{fcont}\n\n'\
        'DIFF:\n\n'\
        '{diff}'.format(wyaml=WORKFLOW_YAML, fcont=saved_text,
                        diff=''.join(diff_lines))

    self.assertTrue(diff_lines == [], failure_msg)
def main():
    '''Run the single-node mortgage ETL workflow and train an XGBoost model
    on its output, printing the resulting booster.

    Uses data under <this-dir>/mortgage_data and the plugin nodes defined in
    mortgage_gquant_plugins.py.
    '''
    _basedir = os.path.dirname(__file__)

    # mortgage_data_path = '/datasets/rapids_data/mortgage'
    mortgage_data_path = os.path.join(_basedir, 'mortgage_data')

    # ETL task-spec list; see mortgage_etl_workflow_def for optional csv
    # file arguments (names/acquisition/performance) used when testing.
    gquant_task_spec_list = mortgage_etl_workflow_def()

    start_year = 2000
    end_year = 2001  # end_year is inclusive
    # end_year = 2016  # end_year is inclusive
    # part_count = 16  # the number of data files to train against
    part_count = 12  # the number of data files to train against
    # part_count = 4  # the number of data files to train against

    mortgage_run_params_dict_list = generate_mortgage_gquant_run_params_list(
        mortgage_data_path, start_year, end_year, part_count,
        gquant_task_spec_list)

    # NOTE: _basedir already computed above; the original recomputed it here.
    mortgage_lib_module = os.path.join(_basedir, 'mortgage_gquant_plugins.py')

    mortgage_workflow_runner_task = {
        TaskSpecSchema.task_id:
            MortgageTaskNames.mortgage_workflow_runner_task_name,
        TaskSpecSchema.node_type: 'MortgageWorkflowRunner',
        TaskSpecSchema.conf: {
            'mortgage_run_params_dict_list': mortgage_run_params_dict_list
        },
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    # Can be multi-gpu. Set ngpus > 1. This is different than dask xgboost
    # which is distributed multi-gpu i.e. dask-xgboost could distribute on
    # one node or multiple nodes. In distributed mode the dmatrix is
    # distributed.
    ngpus = 1
    xgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': ngpus,
        # 'distributed_dask': True,
        'loss': 'ls',
        # 'objective': 'gpu:reg:linear',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    xgb_trainer_task = {
        TaskSpecSchema.task_id: MortgageTaskNames.xgb_trainer_task_name,
        TaskSpecSchema.node_type: 'XgbMortgageTrainer',
        TaskSpecSchema.conf: {
            'delete_dataframes': False,
            'xgb_gpu_params': xgb_gpu_params
        },
        TaskSpecSchema.inputs:
            [MortgageTaskNames.mortgage_workflow_runner_task_name],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    task_spec_list = [mortgage_workflow_runner_task, xgb_trainer_task]
    task_graph = TaskGraph(task_spec_list)

    # To inspect the ETL output instead of training:
    # out_list = [MortgageTaskNames.mortgage_workflow_runner_task_name]
    # ((mortgage_feat_df_pandas, delinq_df_pandas),) = task_graph.run(out_list)
    out_list = [MortgageTaskNames.xgb_trainer_task_name]
    (bst, ) = task_graph.run(out_list)

    print('XGBOOST BOOSTER:\n', bst)
def main():
    '''Run the mortgage ETL workflow distributed over a local dask-CUDA
    cluster and train a distributed XGBoost model, printing the booster.
    '''
    memory_limit = 128e9
    threads_per_worker = 4
    cluster = LocalCUDACluster(memory_limit=memory_limit,
                               threads_per_worker=threads_per_worker)
    client = Client(cluster)
    sched_info = client.scheduler_info()

    print('CLIENT: {}'.format(client))
    print('SCHEDULER INFO:\n{}'.format(json.dumps(sched_info, indent=2)))

    # Importing here in case RMM is used later on. Must start client prior
    # to importing cudf stuff if using RMM.
    from gquant.dataframe_flow import (TaskSpecSchema, TaskGraph)

    # workers_names = \
    #     [iw['name'] for iw in client.scheduler_info()['workers'].values()]
    # nworkers = len(workers_names)

    _basedir = os.path.dirname(__file__)

    # mortgage_data_path = '/datasets/rapids_data/mortgage'
    mortgage_data_path = os.path.join(_basedir, 'mortgage_data')

    # ETL task-spec list; see mortgage_etl_workflow_def for optional csv
    # file arguments (names/acquisition/performance) used when testing.
    gquant_task_spec_list = mortgage_etl_workflow_def()

    start_year = 2000
    end_year = 2001  # end_year is inclusive
    # end_year = 2016  # end_year is inclusive
    # part_count = 16  # the number of data files to train against

    # create_dmatrix_serially - When False on same node if not enough host
    # RAM then it's a race condition when creating the dmatrix. Make sure
    # enough host RAM otherwise set to True.
    # create_dmatrix_serially = False
    # able to do 18 with create_dmatrix_serially set to True
    part_count = 18  # the number of data files to train against
    create_dmatrix_serially = True
    # part_count = 4  # the number of data files to train against

    # Use RAPIDS Memory Manager. Seems to work fine without it.
    use_rmm = False

    # Clean up intermediate dataframes in the xgboost training task.
    delete_dataframes = True

    mortgage_run_params_dict_list = generate_mortgage_gquant_run_params_list(
        mortgage_data_path, start_year, end_year, part_count,
        gquant_task_spec_list)

    # NOTE: _basedir already computed above; the original recomputed it here.
    mortgage_lib_module = os.path.join(_basedir, 'mortgage_gquant_plugins.py')

    filter_dask_logger = False

    mortgage_workflow_runner_task = {
        TaskSpecSchema.task_id:
            MortgageTaskNames.dask_mortgage_workflow_runner_task_name,
        TaskSpecSchema.node_type: 'DaskMortgageWorkflowRunner',
        TaskSpecSchema.conf: {
            'mortgage_run_params_dict_list': mortgage_run_params_dict_list,
            'client': client,
            'use_rmm': use_rmm,
            'filter_dask_logger': filter_dask_logger,
        },
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    dxgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': 1,
        'distributed_dask': True,
        'loss': 'ls',
        # 'objective': 'gpu:reg:linear',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    dxgb_trainer_task = {
        TaskSpecSchema.task_id: MortgageTaskNames.dask_xgb_trainer_task_name,
        TaskSpecSchema.node_type: 'DaskXgbMortgageTrainer',
        TaskSpecSchema.conf: {
            'create_dmatrix_serially': create_dmatrix_serially,
            'delete_dataframes': delete_dataframes,
            'dxgb_gpu_params': dxgb_gpu_params,
            'client': client,
            'filter_dask_logger': filter_dask_logger
        },
        TaskSpecSchema.inputs:
            [MortgageTaskNames.dask_mortgage_workflow_runner_task_name],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    task_spec_list = [mortgage_workflow_runner_task, dxgb_trainer_task]
    out_list = [MortgageTaskNames.dask_xgb_trainer_task_name]

    task_graph = TaskGraph(task_spec_list)
    (bst, ) = task_graph.run(out_list)

    print('XGBOOST BOOSTER:\n', bst)