예제 #1
0
def mortgage_gquant_run(run_params_dict):
    '''Build a TaskGraph from the supplied task-specs and run it.

    The parameters dictionary must provide three entries, for example:
        run_params_dict = {
            'replace_spec': replace_spec,
            'task_spec_list': gquant_task_spec_list,
            'out_list': out_list
        }

    task_spec_list - Mortgage ETL workflow list of task-specs used to
        construct the TaskGraph. Refer to module mortgage_common function
        mortgage_etl_workflow_def.

    out_list - Outputs requested from the graph. It has to resolve to
        exactly one result because the run result is unpacked as a
        one-element sequence.

    replace_spec - Forwarded unchanged as the second argument of
        TaskGraph.run.

    :param run_params_dict: Dictionary with parameters and gquant task list
        to run mortgage workflow.
    :return: The single output produced by the task graph run (expected to
        be the final dataframe of the mortgage ETL workflow).

    '''
    # Imported lazily, inside the function, exactly as the caller expects.
    from gquant.dataframe_flow import TaskGraph

    specs = run_params_dict['task_spec_list']
    requested_outputs = run_params_dict['out_list']
    replacements = run_params_dict['replace_spec']

    graph = TaskGraph(specs)
    run_result = graph.run(requested_outputs, replacements)

    # Exactly one output is expected; anything else raises on unpacking.
    (etl_output, ) = run_result
    return etl_output
예제 #2
0
    def setUp(self):
        '''Create the two-task graph and the scratch directory for a test.

        Sets the GQUANT_PLUGIN_MODULE and GQUANT_CACHE_DIR environment
        variables, stores the graph on ``self.tgraph`` and the temporary
        directory path on ``self._test_dir``.
        '''
        import gc  # python garbage collector
        import cudf

        # Warm-up: build a small cudf Series, drop it, and force a
        # garbage collection pass.
        warmup_series = cudf.Series([1, 2, 3, None, 4], nan_as_null=False)
        del warmup_series
        gc.collect()

        os.environ['GQUANT_PLUGIN_MODULE'] = 'tests.unit.custom_port_nodes'

        # Source task: no inputs, configured with 1000 points.
        source_spec = {
            TaskSpecSchema.task_id: 'points_task',
            TaskSpecSchema.node_type: 'PointNode',
            TaskSpecSchema.conf: {'npts': 1000},
            TaskSpecSchema.inputs: [],
        }

        # Consumer task: its 'points_df_in' port is wired to the source
        # task's 'points_df_out' port.
        consumer_spec = {
            TaskSpecSchema.task_id: 'distance_by_cudf',
            TaskSpecSchema.node_type: 'DistanceNode',
            TaskSpecSchema.conf: {},
            TaskSpecSchema.inputs: {
                'points_df_in': 'points_task.points_df_out',
            },
        }

        self.tgraph = TaskGraph([source_spec, consumer_spec])

        # Fresh temporary directory; the cache dir is placed inside it.
        scratch_dir = tempfile.mkdtemp()
        self._test_dir = scratch_dir
        os.environ['GQUANT_CACHE_DIR'] = os.path.join(scratch_dir, '.cache')
예제 #3
0
파일: handlers.py 프로젝트: idanre1/gQuant
 def post(self):
     '''Reply with the JSON-encoded result of get_nodes for the posted
     task graph.
     '''
     # The request body is a dictionary with a key "name".
     payload = self.get_json_body()
     graph = TaskGraph(payload)
     response_body = json.dumps(get_nodes(graph))
     self.finish(response_body)
    def test_columns_and_ports_types_match(self):
        '''Graph runs and 'numproc.sum' is 45 with the 'listnums' option.'''
        generator_spec = copy.deepcopy(self.numgen_spec)
        processor_spec = copy.deepcopy(self.numproc_spec)

        # Only the generator's configuration is overridden here.
        generator_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}

        graph = TaskGraph([generator_spec, processor_spec])
        (total, ) = graph.run(['numproc.sum'])

        self.assertEqual(total, 45)
    def test_ports_connection_subclass_type_match(self):
        '''Graph runs when the generator port type is MyList and the
        processor port type is list; 'numproc.sum' must equal 45.
        '''
        generator_spec = copy.deepcopy(self.numgen_spec)
        processor_spec = copy.deepcopy(self.numproc_spec)

        generator_conf = {
            'port_type': MyList,
            'columns_option': 'mylistnums',
        }
        generator_spec[TaskSpecSchema.conf] = generator_conf
        processor_spec[TaskSpecSchema.conf] = {'port_type': list}

        graph = TaskGraph([generator_spec, processor_spec])
        (total, ) = graph.run(['numproc.sum'])

        self.assertEqual(total, 45)
    def test_columns_type_mismatch(self):
        '''Running with the 'listnotnums' option raises LookupError that
        reports the column type mismatch on task "numproc".
        '''
        generator_spec = copy.deepcopy(self.numgen_spec)
        processor_spec = copy.deepcopy(self.numproc_spec)

        generator_spec[TaskSpecSchema.conf] = {
            'columns_option': 'listnotnums',
        }

        graph = TaskGraph([generator_spec, processor_spec])

        with self.assertRaises(LookupError) as raised:
            graph.run(['numproc.sum'])

        expected_fragment = (
            'Task "numproc" column "list" expected type "numbers" got '
            'type "notnumbers" instead.')
        self.assertIn(expected_fragment, str(raised.exception))
    def test_columns_name_mismatch(self):
        '''Running with the 'rangenums' option raises LookupError that
        reports the missing required column on task "numproc".
        '''
        generator_spec = copy.deepcopy(self.numgen_spec)
        processor_spec = copy.deepcopy(self.numproc_spec)

        generator_spec[TaskSpecSchema.conf] = {'columns_option': 'rangenums'}

        graph = TaskGraph([generator_spec, processor_spec])

        with self.assertRaises(LookupError) as raised:
            graph.run(['numproc.sum'])

        expected_fragment = (
            'Task "numproc" missing required column "list" from '
            '"numgen.numlist".')
        self.assertIn(expected_fragment, str(raised.exception))
    def test_ports_connection_subclass_type_mismatch(self):
        '''Running with the processor port type set to MyList while the
        generator uses the 'listnums' option raises TypeError about
        non-matching port types.
        '''
        generator_spec = copy.deepcopy(self.numgen_spec)
        processor_spec = copy.deepcopy(self.numproc_spec)

        generator_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}
        processor_spec[TaskSpecSchema.conf] = {'port_type': MyList}

        graph = TaskGraph([generator_spec, processor_spec])

        with self.assertRaises(TypeError) as raised:
            graph.run(['numproc.sum'])

        expected_fragment = (
            'Connected nodes do not have matching port types. '
            'Fix port types.')
        self.assertIn(expected_fragment, str(raised.exception))
    def test_ports_output_type_mismatch(self):
        '''Running with out_type 'rangenums' raises TypeError whose message
        is exactly the "produced wrong type" text for node "numgen".
        '''
        generator_spec = copy.deepcopy(self.numgen_spec)
        processor_spec = copy.deepcopy(self.numproc_spec)

        generator_conf = {
            'columns_option': 'listnums',
            'out_type': 'rangenums',
        }
        generator_spec[TaskSpecSchema.conf] = generator_conf

        graph = TaskGraph([generator_spec, processor_spec])

        with self.assertRaises(TypeError) as raised:
            graph.run(['numproc.sum'])

        # The full message is compared here, not just a fragment.
        expected_message = (
            'Node "numgen" output port "numlist" produced wrong type '
            '"<class \'range\'>". Expected type "[<class \'list\'>]"')
        self.assertEqual(expected_message, str(raised.exception))
    def test_save_workflow(self):
        '''Test saving a workflow to yaml.

        Builds a TaskGraph from ``self._task_list``, saves it into
        ``self._test_dir`` and checks that the written file content equals
        WORKFLOW_YAML. On failure the message shows the expected text, the
        actual file content and a line-based context diff.
        '''
        from gquant.dataframe_flow import TaskGraph
        task_graph = TaskGraph(self._task_list)
        workflow_file = os.path.join(self._test_dir, 'test_save_workflow.yaml')
        task_graph.save_taskgraph(workflow_file)

        with open(workflow_file) as wf:
            workflow_str = wf.read()

        # Verify the workflow contents are the same as expected. The diff is
        # an empty list if they are. context_diff compares sequences of
        # lines, so both texts are split into lines first; passing the raw
        # strings would diff character by character and produce an
        # unreadable report. keepends=True keeps the comparison exact.
        # NOTE(review): assumes WORKFLOW_YAML is a str - confirm.
        cdiff = list(context_diff(
            WORKFLOW_YAML.splitlines(keepends=True),
            workflow_str.splitlines(keepends=True),
            fromfile='expected', tofile='saved'))
        cdiff_empty = cdiff == []

        err_msg = 'Workflow yaml contents do not match expected results.\n'\
            'SHOULD HAVE SAVED:\n\n'\
            '{wyaml}\n\n'\
            'INSTEAD FILE CONTAINS:\n\n'\
            '{fcont}\n\n'\
            'DIFF:\n\n'\
            '{diff}'.format(wyaml=WORKFLOW_YAML, fcont=workflow_str,
                            diff=''.join(cdiff))

        self.assertTrue(cdiff_empty, err_msg)
예제 #11
0
def main():
    '''Run the mortgage workflow runner task followed by the xgboost
    trainer task in one TaskGraph and print the trainer's output.
    '''
    script_dir = os.path.dirname(__file__)

    # Data folder next to this script. Alternative location left by the
    # author: mortgage_data_path = '/datasets/rapids_data/mortgage'
    data_dir = os.path.join(script_dir, 'mortgage_data')

    # The workflow definition can also be given explicit default csv files
    # for testing:
    #   csvfile_names = os.path.join(mortgage_data_path, 'names.csv')
    #   acq_data_path = os.path.join(mortgage_data_path, 'acq')
    #   perf_data_path = os.path.join(mortgage_data_path, 'perf')
    #   csvfile_acqdata = \
    #       os.path.join(acq_data_path, 'Acquisition_2000Q1.txt')
    #   csvfile_perfdata = \
    #       os.path.join(perf_data_path, 'Performance_2000Q1.txt_0')
    #   mortgage_etl_workflow_def(
    #       csvfile_names, csvfile_acqdata, csvfile_perfdata)
    etl_task_specs = mortgage_etl_workflow_def()

    first_year = 2000
    last_year = 2001  # inclusive; alternative: 2016
    n_parts = 12  # number of data files to train against; alt.: 16 or 4

    run_params_list = generate_mortgage_gquant_run_params_list(
        data_dir, first_year, last_year, n_parts, etl_task_specs)

    # Both tasks load their node implementation from this plugin file.
    plugin_file = os.path.join(script_dir, 'mortgage_gquant_plugins.py')

    runner_name = MortgageTaskNames.mortgage_workflow_runner_task_name
    runner_conf = {'mortgage_run_params_dict_list': run_params_list}
    runner_task = {
        TaskSpecSchema.task_id: runner_name,
        TaskSpecSchema.node_type: 'MortgageWorkflowRunner',
        TaskSpecSchema.conf: runner_conf,
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: plugin_file,
    }

    # Can be multi-gpu: set ngpus > 1. This differs from dask xgboost,
    # which is distributed multi-gpu, i.e. dask-xgboost could distribute on
    # one node or multiple nodes. In distributed mode the dmatrix is
    # distributed.
    ngpus = 1
    xgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2 ** 8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': ngpus,
        # 'distributed_dask': True,
        'loss': 'ls',
        # 'objective': 'gpu:reg:linear',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True,
    }

    trainer_name = MortgageTaskNames.xgb_trainer_task_name
    trainer_conf = {
        'delete_dataframes': False,
        'xgb_gpu_params': xgb_gpu_params,
    }
    trainer_task = {
        TaskSpecSchema.task_id: trainer_name,
        TaskSpecSchema.node_type: 'XgbMortgageTrainer',
        TaskSpecSchema.conf: trainer_conf,
        TaskSpecSchema.inputs: [runner_name],
        TaskSpecSchema.filepath: plugin_file,
    }

    graph = TaskGraph([runner_task, trainer_task])

    # To inspect the runner output instead of training:
    #   out_list = [MortgageTaskNames.mortgage_workflow_runner_task_name]
    #   ((mortgage_feat_df_pandas, delinq_df_pandas),) = \
    #       task_graph.run(out_list)
    (bst, ) = graph.run([trainer_name])

    print('XGBOOST BOOSTER:\n', bst)
예제 #12
0
def main():
    '''Start a LocalCUDACluster with a dask Client, then run the dask
    mortgage workflow runner task followed by the dask xgboost trainer
    task in one TaskGraph and print the trainer's output.
    '''
    cluster = LocalCUDACluster(memory_limit=128e9, threads_per_worker=4)
    client = Client(cluster)
    scheduler_details = client.scheduler_info()

    print('CLIENT: {}'.format(client))
    print('SCHEDULER INFO:\n{}'.format(
        json.dumps(scheduler_details, indent=2)))

    # Importing here in case RMM is used later on. Must start client prior
    # to importing cudf stuff if using RMM.
    from gquant.dataframe_flow import (TaskSpecSchema, TaskGraph)

    # Worker names could be collected like this if needed:
    #   workers_names = \
    #       [iw['name']
    #        for iw in client.scheduler_info()['workers'].values()]
    #   nworkers = len(workers_names)

    script_dir = os.path.dirname(__file__)

    # Data folder next to this script. Alternative location left by the
    # author: mortgage_data_path = '/datasets/rapids_data/mortgage'
    data_dir = os.path.join(script_dir, 'mortgage_data')

    # The workflow definition can also be given explicit default csv files
    # for testing:
    #   csvfile_names = os.path.join(mortgage_data_path, 'names.csv')
    #   acq_data_path = os.path.join(mortgage_data_path, 'acq')
    #   perf_data_path = os.path.join(mortgage_data_path, 'perf')
    #   csvfile_acqdata = \
    #       os.path.join(acq_data_path, 'Acquisition_2000Q1.txt')
    #   csvfile_perfdata = \
    #       os.path.join(perf_data_path, 'Performance_2000Q1.txt_0')
    #   mortgage_etl_workflow_def(
    #       csvfile_names, csvfile_acqdata, csvfile_perfdata)
    etl_task_specs = mortgage_etl_workflow_def()

    first_year = 2000
    last_year = 2001  # inclusive; alternative: 2016

    # create_dmatrix_serially - When False on the same node, if there is
    # not enough host RAM then creating the dmatrix is a race condition.
    # Make sure there is enough host RAM, otherwise set it to True.
    # Able to do 18 parts with create_dmatrix_serially set to True.
    n_parts = 18  # number of data files to train against; alt.: 16 or 4
    create_dmatrix_serially = True

    # Use RAPIDS Memory Manager. Seems to work fine without it.
    use_rmm = False

    # Clean up intermediate dataframes in the xgboost training task.
    delete_dataframes = True

    run_params_list = generate_mortgage_gquant_run_params_list(
        data_dir, first_year, last_year, n_parts, etl_task_specs)

    # Both tasks load their node implementation from this plugin file.
    plugin_file = os.path.join(script_dir, 'mortgage_gquant_plugins.py')

    filter_dask_logger = False

    runner_name = MortgageTaskNames.dask_mortgage_workflow_runner_task_name
    runner_conf = {
        'mortgage_run_params_dict_list': run_params_list,
        'client': client,
        'use_rmm': use_rmm,
        'filter_dask_logger': filter_dask_logger,
    }
    runner_task = {
        TaskSpecSchema.task_id: runner_name,
        TaskSpecSchema.node_type: 'DaskMortgageWorkflowRunner',
        TaskSpecSchema.conf: runner_conf,
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: plugin_file,
    }

    dxgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2 ** 8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': 1,
        'distributed_dask': True,
        'loss': 'ls',
        # 'objective': 'gpu:reg:linear',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True,
    }

    trainer_name = MortgageTaskNames.dask_xgb_trainer_task_name
    trainer_conf = {
        'create_dmatrix_serially': create_dmatrix_serially,
        'delete_dataframes': delete_dataframes,
        'dxgb_gpu_params': dxgb_gpu_params,
        'client': client,
        'filter_dask_logger': filter_dask_logger,
    }
    trainer_task = {
        TaskSpecSchema.task_id: trainer_name,
        TaskSpecSchema.node_type: 'DaskXgbMortgageTrainer',
        TaskSpecSchema.conf: trainer_conf,
        TaskSpecSchema.inputs: [runner_name],
        TaskSpecSchema.filepath: plugin_file,
    }

    graph = TaskGraph([runner_task, trainer_task])
    (bst, ) = graph.run([trainer_name])

    print('XGBOOST BOOSTER:\n', bst)