def mortgage_greenflow_run(run_params_dict):
    '''Build a greenflow TaskGraph from the given parameters and run it.

    The dictionary is expected to carry three keys, e.g.:
        run_params_dict = {
            'replace_spec': replace_spec,
            'task_spec_list': greenflow_task_spec_list,
            'out_list': out_list
        }

    task_spec_list - Mortgage ETL workflow list of task-specs used to
        construct the TaskGraph. Refer to module mortgage_common function
        mortgage_etl_workflow_def.

    out_list - Outputs to request from the run. The run result is unpacked
        as a one-element sequence, so exactly one output should be listed
        (the final dataframe produced by the mortgage ETL workflow).

    replace_spec - Passed as the second positional argument of
        TaskGraph.run.

    :param run_params_dict: Dictionary with parameters and greenflow task
        list to run mortgage workflow.
    :return: The single output returned by the task graph run.

    '''
    from greenflow.dataframe_flow import TaskGraph

    # Look the keys up first (in this order) so that a missing key fails
    # before any graph is constructed.
    spec_list, requested_outputs, replacements = (
        run_params_dict[key]
        for key in ('task_spec_list', 'out_list', 'replace_spec'))

    graph = TaskGraph(spec_list)
    (only_output, ) = graph.run(requested_outputs, replacements)
    return only_output
    def test_columns_and_ports_types_match(self):
        '''Run the numgen/numproc graph with the 'listnums' columns option
        and check that the 'numproc.sum' output equals 45.
        '''
        gen_spec = copy.deepcopy(self.numgen_spec)
        proc_spec = copy.deepcopy(self.numproc_spec)
        gen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}

        graph = TaskGraph([gen_spec, proc_spec])
        (total, ) = graph.run(['numproc.sum'])

        self.assertEqual(total, 45)
    def test_columns_name_mismatch(self):
        '''Run the numgen/numproc graph with the 'rangenums' columns option
        and check that a LookupError naming the missing "list" column is
        raised.
        '''
        gen_spec = copy.deepcopy(self.numgen_spec)
        proc_spec = copy.deepcopy(self.numproc_spec)
        gen_spec[TaskSpecSchema.conf] = {'columns_option': 'rangenums'}

        graph = TaskGraph([gen_spec, proc_spec])

        with self.assertRaises(LookupError) as raised:
            graph.run(['numproc.sum'])
        actual_msg = '{}'.format(raised.exception)

        expected_msg = ('Task "numproc" missing required column "list" from '
                        '"numgen.numlist".')
        self.assertIn(expected_msg, actual_msg)
    def test_load_workflow(self):
        '''Write WORKFLOW_YAML to disk, load it back as a task graph and
        check that every loaded task-spec is one of the expected specs.
        '''
        from greenflow.dataframe_flow import TaskGraph

        yaml_path = os.path.join(self._test_dir, 'test_save_workflow.yaml')
        with open(yaml_path, 'w') as yaml_file:
            yaml_file.write(WORKFLOW_YAML)

        loaded_tasks = TaskGraph.load_taskgraph(yaml_path)
        # all() stops at the first loaded task whose spec is not expected.
        specs_all_known = all(
            task._task_spec in self._task_list for task in loaded_tasks)

        # NOTE(review): the text labelled "GOT TASKS" below is a dump of the
        # expected self._task_list, not of the loaded tasks - confirm intent.
        with StringIO() as dump_buffer:
            yaml.dump(self._task_list,
                      dump_buffer,
                      default_flow_style=False,
                      sort_keys=False)
            dumped_specs = dump_buffer.getvalue()

        failure_text = (
            'Load workflow failed. Missing expected task items.\n'
            'EXPECTED WORKFLOW YAML:\n\n'
            '{wyaml}\n\n'
            'GOT TASKS FORMATTED AS YAML:\n\n'
            '{tlist}\n\n').format(wyaml=WORKFLOW_YAML, tlist=dumped_specs)

        self.assertTrue(specs_all_known, failure_text)
    def test_ports_connection_subclass_type_match(self):
        '''Configure numgen with port type MyList and numproc with port type
        list, run the graph and check that 'numproc.sum' equals 45.
        '''
        gen_spec = copy.deepcopy(self.numgen_spec)
        proc_spec = copy.deepcopy(self.numproc_spec)

        gen_conf = {
            'port_type': MyList,
            'columns_option': 'mylistnums'
        }
        gen_spec[TaskSpecSchema.conf] = gen_conf
        proc_spec[TaskSpecSchema.conf] = {'port_type': list}

        graph = TaskGraph([gen_spec, proc_spec])
        (total, ) = graph.run(['numproc.sum'])

        self.assertEqual(total, 45)
示例#6
0
    def test_load(self):
        '''Write TASKGRAPH_YAML to a file, load it with
        TaskGraph.load_taskgraph and check that each loaded task-spec is
        among the task-specs of self.tgraph.
        '''
        yaml_path = os.path.join(self._test_dir,
                                 'test_load_taskgraph.yaml')
        with open(yaml_path, 'w') as yaml_file:
            yaml_file.write(TASKGRAPH_YAML)

        expected_specs = [node._task_spec for node in self.tgraph]

        loaded_graph = TaskGraph.load_taskgraph(yaml_path)
        # all() stops at the first loaded task whose spec is not expected.
        specs_all_known = all(
            node._task_spec in expected_specs for node in loaded_graph)

        # NOTE(review): the text labelled "GOT TASKS" below is a dump of the
        # specs taken from self.tgraph, not of the loaded graph - confirm.
        with StringIO() as dump_buffer:
            yaml.dump(expected_specs,
                      dump_buffer,
                      default_flow_style=False,
                      sort_keys=False)
            dumped_specs = dump_buffer.getvalue()

        failure_text = (
            'Load taskgraph failed. Missing expected task items.\n'
            'EXPECTED TASKGRAPH YAML:\n\n'
            '{wyaml}\n\n'
            'GOT TASKS FORMATTED AS YAML:\n\n'
            '{tlist}\n\n').format(wyaml=TASKGRAPH_YAML, tlist=dumped_specs)

        self.assertTrue(specs_all_known, failure_text)
    def test_columns_type_mismatch(self):
        '''Run the numgen/numproc graph with the 'listnotnums' columns option
        and check that a LookupError reporting the wrong column type is
        raised.
        '''
        gen_spec = copy.deepcopy(self.numgen_spec)
        proc_spec = copy.deepcopy(self.numproc_spec)
        gen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnotnums'}

        graph = TaskGraph([gen_spec, proc_spec])

        with self.assertRaises(LookupError) as raised:
            graph.run(['numproc.sum'])
        actual_msg = '{}'.format(raised.exception)

        expected_msg = (
            'Task "numproc" column "list" expected type "numbers" got '
            'type "notnumbers" instead.')
        self.assertIn(expected_msg, actual_msg)
示例#8
0
def get_nodes_from_file(file):
    """
    Load a task graph from a yaml file and return whatever get_nodes
    produces for it.

    Per the description of the result, it is a dict with two keys:
        nodes:
            - list of node objects for the UI client, carrying the node
            information (size of the node, input ports, output ports,
            output meta names/types, conf schema and conf data).
        edges:
            - list of edge objects for the UI client, enumerating all the
            edges in the graph.

    Arguments
    -------
    file: string
        name of the yaml file, passed to TaskGraph.load_taskgraph

    Returns
    -------
    dict
        nodes and edges of the graph data

    """
    return get_nodes(TaskGraph.load_taskgraph(file))
    def test_ports_connection_subclass_type_mismatch(self):
        '''Configure numproc with port type MyList while numgen uses the
        'listnums' option, and check that running the graph raises a
        TypeError about mismatched port types.
        '''
        gen_spec = copy.deepcopy(self.numgen_spec)
        proc_spec = copy.deepcopy(self.numproc_spec)
        gen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}
        proc_spec[TaskSpecSchema.conf] = {'port_type': MyList}

        graph = TaskGraph([gen_spec, proc_spec])

        with self.assertRaises(TypeError) as raised:
            graph.run(['numproc.sum'])
        actual_msg = '{}'.format(raised.exception)

        expected_msg = ('Connected nodes do not have matching port types. '
                        'Fix port types.')
        self.assertIn(expected_msg, actual_msg)
示例#10
0
 def post(self):
     """Build a TaskGraph from the JSON request body and reply with the
     JSON-encoded result of get_nodes for that graph."""
     # The decoded request body is handed directly to the TaskGraph
     # constructor.
     graph = TaskGraph(self.get_json_body())
     payload = json.dumps(get_nodes(graph))
     self.finish(payload)
    def test_ports_output_type_mismatch(self):
        '''Configure numgen with out_type 'rangenums' and check that running
        the graph raises a TypeError whose message is exactly the expected
        wrong-output-type text.
        '''
        gen_spec = copy.deepcopy(self.numgen_spec)
        proc_spec = copy.deepcopy(self.numproc_spec)

        gen_conf = {
            'columns_option': 'listnums',
            'out_type': 'rangenums'
        }
        gen_spec[TaskSpecSchema.conf] = gen_conf

        graph = TaskGraph([gen_spec, proc_spec])

        with self.assertRaises(TypeError) as raised:
            graph.run(['numproc.sum'])
        actual_msg = '{}'.format(raised.exception)

        expected_msg = (
            'Node "numgen" output port "numlist" produced wrong type '
            '"<class \'range\'>". Expected type "[<class \'list\'>]"')
        # Exact match here, unlike the assertIn used by the sibling tests.
        self.assertEqual(expected_msg, actual_msg)
示例#12
0
def main():
    """Command-line entry point: load a task graph from a yaml file and run
    the requested output nodes.

    Command-line arguments:
        -t / --task   the yaml task file, passed to TaskGraph.load_workflow
        output        one or more output nodes, passed to TaskGraph.run
    """
    parser = argparse.ArgumentParser(
        description='Evaluate the dataframe flow graph')
    parser.add_argument('-t', '--task', help="the yaml task file")
    parser.add_argument('output', help="the output nodes", nargs='+')
    args = parser.parse_args()

    # A leftover `pudb.set_trace()` used to sit here; it stopped every run in
    # the debugger and required pudb to be installed, so it was removed.

    # NOTE(review): other visible call sites use TaskGraph.load_taskgraph;
    # confirm that load_workflow is still the intended loader here.
    task_graph = TaskGraph.load_workflow(args.task)
    print('output nodes:', args.output)
    task_graph.run(args.output)
    def test_save_workflow(self):
        '''Save a task graph built from self._task_list to yaml and check
        that the file contents equal WORKFLOW_YAML.
        '''
        from greenflow.dataframe_flow import TaskGraph
        task_graph = TaskGraph(self._task_list)
        workflow_file = os.path.join(self._test_dir, 'test_save_workflow.yaml')
        task_graph.save_taskgraph(workflow_file)

        with open(workflow_file) as wf:
            workflow_str = wf.read()

        # Verify the workflow contents are the same as expected. The diff is
        # an empty list when they match. context_diff compares sequences of
        # lines, so both texts are split into lines first; passing the raw
        # strings would diff them character by character and make the
        # failure message unreadable.
        cdiff = list(context_diff(
            WORKFLOW_YAML.splitlines(keepends=True),
            workflow_str.splitlines(keepends=True),
            fromfile='expected', tofile='saved'))
        cdiff_empty = cdiff == []

        err_msg = 'Workflow yaml contents do not match expected results.\n'\
            'SHOULD HAVE SAVED:\n\n'\
            '{wyaml}\n\n'\
            'INSTEAD FILE CONTAINS:\n\n'\
            '{fcont}\n\n'\
            'DIFF:\n\n'\
            '{diff}'.format(wyaml=WORKFLOW_YAML, fcont=workflow_str,
                            diff=''.join(cdiff))

        self.assertTrue(cdiff_empty, err_msg)
示例#14
0
    def setUp(self):
        '''Prepare the fixture: point greenflow at the custom port nodes
        plugin module, build a two-task graph (points_task feeding
        distance_by_df) and redirect the greenflow cache into a fresh
        temporary directory.
        '''
        import gc  # python garbage collector

        # warmup: create and drop a small series, then force a collection
        warmup_series = pd.Series([1, 2, 3, None, 4])
        del warmup_series
        gc.collect()

        os.environ['GREENFLOW_PLUGIN_MODULE'] = 'tests.unit.custom_port_nodes'

        points_spec = {
            TaskSpecSchema.task_id: 'points_task',
            TaskSpecSchema.node_type: 'PointNode',
            TaskSpecSchema.conf: {'npts': 1000},
            TaskSpecSchema.inputs: []
        }
        distance_spec = {
            TaskSpecSchema.task_id: 'distance_by_df',
            TaskSpecSchema.node_type: 'DistanceNode',
            TaskSpecSchema.conf: {},
            TaskSpecSchema.inputs: {
                'points_df_in': 'points_task.points_df_out'
            }
        }
        self.tgraph = TaskGraph([points_spec, distance_spec])

        # Temporary directory for files written by the tests; the cache
        # directory is placed inside it.
        self._test_dir = tempfile.mkdtemp()
        cache_path = os.path.join(self._test_dir, '.cache')
        os.environ['GREENFLOW_CACHE_DIR'] = cache_path
示例#15
0
 def _compute_hash_key(self):
     """
     Compute a hash key over this node's uid, task graph file, input
     wiring, conf and replacement conf.

     When the hash changes, the port_setup, meta_setup and conf_json are
     expected to differ. In the very rare case of a hash collision the
     column, port and conf calculation is affected, but the computation
     result is not.

     Returns a 3-tuple: the hash code, the loaded task graph object (None
     when no task graph file was loaded), and the replacement conf object.
     """
     graph_token = ""
     loaded_graph = None
     if 'taskgraph' in self.conf:
         try:
             graph_token = get_file_path(self.conf['taskgraph'])
         except FileNotFoundError:
             graph_token = None
         if graph_token is not None and os.path.exists(graph_token):
             # Replace the path by the md5 digest of the file contents.
             with open(graph_token) as graph_file:
                 graph_text = graph_file.read()
             graph_token = hashlib.md5(graph_text.encode()).hexdigest()
             loaded_graph = TaskGraph.load_taskgraph(
                 get_file_path(self.conf['taskgraph']))

     replacements = {}
     self.update_replace(replacements, loaded_graph)

     edge_tokens = ()
     input_names = ""
     if 'input' in self.conf:
         for name in self.conf['input']:
             input_names += name + ","
             if not hasattr(self, 'inputs'):
                 continue
             # The connected inputs are appended once per configured input
             # name.
             for edge in self.inputs:
                 edge_tokens += (hash(edge['from_node']),
                                 edge['to_port'], edge['from_port'])

     key_parts = (self.uid, graph_token, edge_tokens,
                  json.dumps(self.conf), input_names,
                  json.dumps(replacements))
     return (hash(key_parts), loaded_graph, replacements)
示例#16
0
def main():
    """Start a local CUDA dask cluster, run the mortgage workflow runner
    followed by the dask XGBoost trainer as a two-task greenflow graph, and
    print the resulting booster.
    """
    cluster = LocalCUDACluster(memory_limit=128e9, threads_per_worker=4)
    client = Client(cluster)
    sched_info = client.scheduler_info()

    print('CLIENT: {}'.format(client))
    print('SCHEDULER INFO:\n{}'.format(json.dumps(sched_info, indent=2)))

    # Importing here in case RMM is used later on. Must start client prior
    # to importing cudf stuff if using RMM.
    from greenflow.dataframe_flow import (TaskSpecSchema, TaskGraph)

    # Data and plugin module both live next to this script. An alternative
    # data location is '/datasets/rapids_data/mortgage'.
    script_dir = os.path.dirname(__file__)
    data_dir = os.path.join(script_dir, 'mortgage_data')

    # Called without arguments; it can also be given explicit csv files
    # (names, acquisition data, performance data).
    etl_task_specs = mortgage_etl_workflow_def()

    first_year = 2000
    last_year = 2001  # inclusive; 2016 is another value that has been used

    # Number of data files to train against. 18 works with
    # create_dmatrix_serially set to True; 16 and 4 have also been used.
    n_parts = 18

    # create_dmatrix_serially - When False on same node if not enough host
    # RAM then it's a race condition when creating the dmatrix. Make sure
    # enough host RAM otherwise set to True.
    serial_dmatrix = True

    # Use RAPIDS Memory Manager. Seems to work fine without it.
    rmm_enabled = False

    # Clean up intermediate dataframes in the xgboost training task.
    drop_dataframes = True

    run_params_list = generate_mortgage_greenflow_run_params_list(
        data_dir, first_year, last_year, n_parts, etl_task_specs)

    plugin_path = os.path.join(script_dir, 'mortgage_greenflow_plugins.py')

    quiet_dask_logger = False

    runner_task_id = \
        MortgageTaskNames.dask_mortgage_workflow_runner_task_name
    runner_task = {
        TaskSpecSchema.task_id: runner_task_id,
        TaskSpecSchema.node_type: 'DaskMortgageWorkflowRunner',
        TaskSpecSchema.conf: {
            'mortgage_run_params_dict_list': run_params_list,
            'client': client,
            'use_rmm': rmm_enabled,
            'filter_dask_logger': quiet_dask_logger,
        },
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: plugin_path
    }

    # 'gpu:reg:linear' is an alternative value for 'objective'.
    xgb_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': 1,
        'distributed_dask': True,
        'loss': 'ls',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    trainer_task_id = MortgageTaskNames.dask_xgb_trainer_task_name
    trainer_task = {
        TaskSpecSchema.task_id: trainer_task_id,
        TaskSpecSchema.node_type: 'DaskXgbMortgageTrainer',
        TaskSpecSchema.conf: {
            'create_dmatrix_serially': serial_dmatrix,
            'delete_dataframes': drop_dataframes,
            'dxgb_gpu_params': xgb_params,
            'client': client,
            'filter_dask_logger': quiet_dask_logger
        },
        # The trainer takes the workflow runner task as its input.
        TaskSpecSchema.inputs: [runner_task_id],
        TaskSpecSchema.filepath: plugin_path
    }

    graph = TaskGraph([runner_task, trainer_task])
    (booster, ) = graph.run([trainer_task_id])

    print('XGBOOST BOOSTER:\n', booster)
示例#17
0
 def post(self):
     """Load the task graph file named by the 'path' entry of the JSON
     request body and reply with the JSON-encoded result of get_nodes."""
     request_body = self.get_json_body()
     graph = TaskGraph.load_taskgraph(request_body['path'])
     payload = json.dumps(get_nodes(graph))
     self.finish(payload)
示例#18
0
class TestTaskGraphAPI(unittest.TestCase):
    '''Tests of the TaskGraph API (visualization, build, run, save, load and
    caching) on a two-task graph: points_task feeding distance_by_df.
    '''

    def setUp(self):
        '''Point greenflow at the custom port nodes plugin module, build the
        two-task graph and redirect the cache into a temporary directory.
        '''
        import gc  # python garbage collector

        # warmup
        s = pd.Series([1, 2, 3, None, 4])
        del (s)
        gc.collect()

        os.environ['GREENFLOW_PLUGIN_MODULE'] = 'tests.unit.custom_port_nodes'

        points_task_spec = {
            TaskSpecSchema.task_id: 'points_task',
            TaskSpecSchema.node_type: 'PointNode',
            TaskSpecSchema.conf: {
                'npts': 1000
            },
            TaskSpecSchema.inputs: []
        }

        distance_task_spec = {
            TaskSpecSchema.task_id: 'distance_by_df',
            TaskSpecSchema.node_type: 'DistanceNode',
            TaskSpecSchema.conf: {},
            TaskSpecSchema.inputs: {
                'points_df_in': 'points_task.points_df_out'
            }
        }

        tspec_list = [points_task_spec, distance_task_spec]

        self.tgraph = TaskGraph(tspec_list)

        # Create a temporary directory
        self._test_dir = tempfile.mkdtemp()
        os.environ['GREENFLOW_CACHE_DIR'] = os.path.join(
            self._test_dir, '.cache')

    def tearDown(self):
        '''Restore the plugin module and cache dir environment variables and
        remove the temporary directory created in setUp.
        '''
        os.environ['GREENFLOW_PLUGIN_MODULE'] = DEFAULT_MODULE
        os.environ['GREENFLOW_CACHE_DIR'] = Node.cache_dir
        shutil.rmtree(self._test_dir)

    @ordered
    def test_viz_graph(self):
        '''Test taskgraph to networkx graph conversion for graph visualization.
        '''
        nx_graph = self.tgraph.viz_graph(show_ports=True)
        nx_nodes = [
            'points_task', 'points_task.points_df_out', 'distance_by_df',
            'distance_by_df.distance_df', 'distance_by_df.distance_abs_df'
        ]
        nx_edges = [('points_task', 'points_task.points_df_out'),
                    ('points_task.points_df_out', 'distance_by_df'),
                    ('distance_by_df', 'distance_by_df.distance_df'),
                    ('distance_by_df', 'distance_by_df.distance_abs_df')]
        self.assertEqual(list(nx_graph.nodes), nx_nodes)
        self.assertEqual(list(nx_graph.edges), nx_edges)

    @ordered
    def test_build(self):
        '''Test build of a taskgraph and that all inputs and outputs are set
        for the tasks within a taskgraph.
        '''
        self.tgraph.build()

        points_node = self.tgraph['points_task']
        distance_node = self.tgraph['distance_by_df']

        onode_info = {
            'to_node': distance_node,
            'to_port': 'points_df_in',
            'from_port': 'points_df_out'
        }
        self.assertIn(onode_info, points_node.outputs)

        onode_cols = {
            'points_df_out': {
                'x': 'float64',
                'y': 'float64'
            },
            'points_ddf_out': {
                'x': 'float64',
                'y': 'float64'
            }
        }
        self.assertEqual(onode_cols, points_node.meta_setup().outports)

        inode_info = {
            'from_node': points_node,
            'from_port': 'points_df_out',
            'to_port': 'points_df_in'
        }
        self.assertIn(inode_info, distance_node.inputs)

        inode_in_cols = {'points_df_in': {'x': 'float64', 'y': 'float64'}}
        self.assertEqual(inode_in_cols, distance_node.get_input_meta())

        inode_out_cols = {
            'distance_df': {
                'distance_df': 'float64',
                'x': 'float64',
                'y': 'float64'
            },
            'distance_abs_df': {
                'distance_abs_df': 'float64',
                'x': 'float64',
                'y': 'float64'
            }
        }
        self.assertEqual(inode_out_cols, distance_node.meta_setup().outports)

    @ordered
    def test_run(self):
        '''Test that a taskgraph can run successfully.
        '''
        outlist = ['distance_by_df.distance_df']
        # Using numpy random seed to get repeatable and deterministic results.
        # For seed 2335 should get something around 761.062831178.
        replace_spec = {
            'points_task': {
                TaskSpecSchema.conf: {
                    'npts': 1000,
                    'nseed': 2335
                }
            }
        }
        (dist_df_w_df, ) = self.tgraph.run(outputs=outlist,
                                           replace=replace_spec)
        dist_sum = dist_df_w_df['distance_df'].sum()
        # assertAlmostEqual compares to 7 decimal places by default.
        self.assertAlmostEqual(dist_sum, 761.062831178)  # match to 7 places

    @ordered
    def test_save(self):
        '''Test that a taskgraph can be saved to a yaml file.
        '''
        workflow_file = os.path.join(self._test_dir,
                                     'test_save_taskgraph.yaml')
        self.tgraph.save_taskgraph(workflow_file)

        with open(workflow_file) as wf:
            workflow_str = wf.read()

        # Verify the workflow contents are the same as expected. The diff is
        # an empty list when they match. context_diff compares sequences of
        # lines, so both texts are split into lines first; passing the raw
        # strings would diff them character by character and make the
        # failure message unreadable.
        cdiff = list(context_diff(
            TASKGRAPH_YAML.splitlines(keepends=True),
            workflow_str.splitlines(keepends=True),
            fromfile='expected', tofile='saved'))
        cdiff_empty = cdiff == []

        err_msg = 'Taskgraph yaml contents do not match expected results.\n'\
            'SHOULD HAVE SAVED:\n\n'\
            '{wyaml}\n\n'\
            'INSTEAD FILE CONTAINS:\n\n'\
            '{fcont}\n\n'\
            'DIFF:\n\n'\
            '{diff}'.format(wyaml=TASKGRAPH_YAML, fcont=workflow_str,
                            diff=''.join(cdiff))

        self.assertTrue(cdiff_empty, err_msg)

    @ordered
    def test_load(self):
        '''Test that a taskgraph can be loaded from a yaml file.
        '''
        workflow_file = os.path.join(self._test_dir,
                                     'test_load_taskgraph.yaml')

        with open(workflow_file, 'w') as wf:
            wf.write(TASKGRAPH_YAML)

        tspec_list = [task._task_spec for task in self.tgraph]

        tgraph = TaskGraph.load_taskgraph(workflow_file)
        all_tasks_exist = True
        for task in tgraph:
            if task._task_spec not in tspec_list:
                all_tasks_exist = False
                break

        # NOTE(review): the text labelled "GOT TASKS" below is a dump of the
        # specs taken from self.tgraph, not of the loaded graph - confirm.
        with StringIO() as yf:
            yaml.dump(tspec_list,
                      yf,
                      default_flow_style=False,
                      sort_keys=False)
            yf.seek(0)

            err_msg = 'Load taskgraph failed. Missing expected task items.\n'\
                'EXPECTED TASKGRAPH YAML:\n\n'\
                '{wyaml}\n\n'\
                'GOT TASKS FORMATTED AS YAML:\n\n'\
                '{tlist}\n\n'.format(wyaml=TASKGRAPH_YAML, tlist=yf.read())

            self.assertTrue(all_tasks_exist, err_msg)

    @ordered
    def test_save_load_cache(self):
        '''Test caching of tasks outputs within a taskgraph.

            1. Save points_task output to cache when running the taskgraph.
            2. Load points_task df from cache when running the taskgraph.
        '''
        replace_spec = {'points_task': {TaskSpecSchema.save: True}}
        outlist = ['distance_by_df.distance_df']

        with warnings.catch_warnings():
            # ignore UserWarning: Using CPU via Pandas to write HDF dataset
            warnings.filterwarnings(
                'ignore',
                message='Using CPU via Pandas to write HDF dataset',
                category=UserWarning,
            )
            # ignore RuntimeWarning: numpy.ufunc size changed
            warnings.filterwarnings('ignore',
                                    category=RuntimeWarning,
                                    message='numpy.ufunc size changed')
            (_, ) = self.tgraph.run(outputs=outlist, replace=replace_spec)

        # The first run is expected to have written the cache file.
        cache_dir = os.path.join(self._test_dir, '.cache', 'points_task.hdf5')
        self.assertTrue(os.path.exists(cache_dir))

        replace_spec = {'points_task': {TaskSpecSchema.load: True}}
        with warnings.catch_warnings():
            # ignore UserWarning: Using CPU via Pandas to read HDF dataset
            warnings.filterwarnings(
                'ignore',
                message='Using CPU via Pandas to read HDF dataset',
                category=UserWarning)
            (_, ) = self.tgraph.run(outputs=outlist, replace=replace_spec)
def main():
    '''Run the mortgage workflow runner feeding an XGBoost trainer.

    Builds a two-task graph (a 'MortgageWorkflowRunner' task whose output is
    the input of an 'XgbMortgageTrainer' task), runs it requesting the
    trainer task's output, and prints what comes back.
    '''
    here = os.path.dirname(__file__)

    # Data is looked up next to this file. An alternative location used
    # before was '/datasets/rapids_data/mortgage'.
    data_root = os.path.join(here, 'mortgage_data')
    # Module path attached to both task specs through TaskSpecSchema.filepath.
    plugin_module = os.path.join(here, 'mortgage_greenflow_plugins.py')

    # Called without arguments here; an earlier variant passed explicit
    # names / acquisition / performance csv paths for testing.
    etl_task_specs = mortgage_etl_workflow_def()

    first_year = 2000
    last_year = 2001  # inclusive (2016 is another value that was used)
    # Number of data files to train against (4 and 16 were also used).
    n_parts = 12

    run_params_list = generate_mortgage_greenflow_run_params_list(
        data_root, first_year, last_year, n_parts, etl_task_specs)

    runner_name = MortgageTaskNames.mortgage_workflow_runner_task_name
    trainer_name = MortgageTaskNames.xgb_trainer_task_name

    runner_spec = {
        TaskSpecSchema.task_id: runner_name,
        TaskSpecSchema.node_type: 'MortgageWorkflowRunner',
        TaskSpecSchema.conf: {
            'mortgage_run_params_dict_list': run_params_list
        },
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: plugin_module
    }

    # Author's note: training can be multi-gpu by setting the gpu count
    # above 1. That is not the same as dask-xgboost, which is distributed
    # multi-gpu (one node or several) with a distributed dmatrix.
    gpu_count = 1
    booster_params = dict(
        nround=100,
        max_depth=8,
        max_leaves=2 ** 8,
        alpha=0.9,
        eta=0.1,
        gamma=0.1,
        learning_rate=0.1,
        subsample=1,
        reg_lambda=1,
        scale_pos_weight=2,
        min_child_weight=30,
        tree_method='gpu_hist',
        n_gpus=gpu_count,
        loss='ls',
        # 'gpu:reg:linear' was the objective used previously.
        objective='reg:squarederror',
        max_features='auto',
        criterion='friedman_mse',
        grow_policy='lossguide',
        verbose=True,
    )

    trainer_spec = {
        TaskSpecSchema.task_id: trainer_name,
        TaskSpecSchema.node_type: 'XgbMortgageTrainer',
        TaskSpecSchema.conf: {
            'delete_dataframes': False,
            'xgb_gpu_params': booster_params
        },
        TaskSpecSchema.inputs: [runner_name],
        TaskSpecSchema.filepath: plugin_module
    }

    graph = TaskGraph([runner_spec, trainer_spec])

    # Only the trainer output is requested; the unpack expects one result.
    (booster,) = graph.run([trainer_name])

    print('XGBOOST BOOSTER:\n', booster)
示例#20
0
            def search_fun(config, checkpoint_dir=None):
                """Run the configured taskgraph once for one search config.

                Every entry of ``data_store`` is fetched with ``ray.get``;
                pandas DataFrames are converted with ``cudf.from_pandas``.
                The graph named by ``self.conf['taskgraph']`` is loaded,
                feeder nodes are spliced in to supply the fetched values,
                and the graph is run asking for ``<train_id>.checkpoint_dir``.
                ``checkpoint_dir`` is accepted but not read here.
                """
                fed_values = {}
                for name in data_store.keys():
                    fetched = ray.get(data_store[name])
                    fed_values[name] = (
                        cudf.from_pandas(fetched)
                        if isinstance(fetched, pandas.DataFrame)
                        else fetched)

                graph = TaskGraph.load_taskgraph(
                    get_file_path(self.conf['taskgraph']))
                graph.build()

                requested = [train_id + '.' + 'checkpoint_dir']
                rewiring = {}
                feeder_specs = []

                def wire_input(inputNode, in_ports):
                    """Add a feeder task for ``inputNode`` and record its
                    replacement input wiring."""
                    inports = inputNode.ports_setup().inports

                    class InputFeed(Node):
                        def meta_setup(self):
                            # { input_port: columns } as reported by each
                            # node currently connected upstream
                            return {
                                link['to_port']: link['from_node']
                                .meta_setup()[link['from_port']]
                                for link in inputNode.inputs
                            }

                        def ports_setup(self):
                            # the target's inports ({ input_port: types })
                            # are exposed as this feeder's outports
                            return NodePorts(inports={}, outports=inports)

                        def conf_schema(self):
                            return ConfSchema()

                        def process(self, empty):
                            # emit only the ports a value was fetched for
                            fed = {}
                            for port in inports.keys():
                                lookup = inputNode.uid + '@' + port
                                if lookup in fed_values:
                                    fed[port] = fed_values[lookup]
                            return fed

                    feeder_id = str(uuid.uuid1())
                    feeder_specs.append({
                        TaskSpecSchema.task_id: feeder_id,
                        TaskSpecSchema.conf: {},
                        TaskSpecSchema.node_type: InputFeed,
                        TaskSpecSchema.inputs: []
                    })

                    # ports with a fetched value are fed by the new feeder
                    new_inputs = {
                        port: feeder_id + '.' + port
                        for port in inports.keys()
                        if inputNode.uid + '@' + port in fed_values
                    }
                    # connections on ports outside in_ports are kept
                    for link in inputNode.inputs:
                        if link['to_port'] in in_ports:
                            continue
                        new_inputs[link['to_port']] = (
                            link['from_node'].uid + '.' + link['from_port'])
                    rewiring[inputNode.uid] = {
                        TaskSpecSchema.inputs: new_inputs
                    }

                def ignore_output(outNode, out_ports):
                    """Output nodes need no handling for the search run."""

                self._make_sub_graph_connection(graph, wire_input,
                                                ignore_output)

                graph.extend(feeder_specs)
                self.update_conf_for_search(rewiring, graph, config)
                graph.run(requested, replace=rewiring)
示例#21
0
    def update(self):
        """Reload the wrapped taskgraph and rewire it to this node's inputs.

        Steps, in order:

        1. Run ``TemplateNodeMixin.update`` and ``self.conf_update``.
        2. If the conf has a 'taskgraph' entry whose resolved path exists,
           load it with ``TaskGraph.load_taskgraph``; otherwise the graph
           stays ``None``.
        3. Reset ``self.all_inputs`` / ``self.all_outputs``, store the graph
           on ``self.task_graph`` and the replacement spec (filled in by
           ``self.update_replace``) on ``self.replacementObj``.
        4. When a graph was loaded: build it, reconnect the nodes named in
           conf 'input' to this node's external inputs, record the nodes
           named in conf 'output', and finish with
           ``breadth_first_update``.

        Returns nothing; all results are stored on ``self``.
        """
        TemplateNodeMixin.update(self)
        self.conf_update()  # update the conf
        task_graph = ""
        replacementObj = {}
        task_graph_obj = None
        if 'taskgraph' in self.conf:
            try:
                task_graph = get_file_path(self.conf['taskgraph'])
            except FileNotFoundError:
                # an unresolvable path is treated like "no taskgraph"
                task_graph = None
            if task_graph is not None and os.path.exists(task_graph):
                # with open(task_graph) as f:
                #     task_graph = hashlib.md5(f.read().encode()).hexdigest()
                task_graph_obj = TaskGraph.load_taskgraph(
                    get_file_path(self.conf['taskgraph']))
        self.all_inputs = []
        self.all_outputs = []
        self.task_graph = task_graph_obj
        # called even when task_graph_obj is None
        self.update_replace(replacementObj, task_graph_obj)
        self.replacementObj = replacementObj
        # nodes treated as already updated / as additional starting points
        # by the breadth_first_update call at the end
        extra_updated = set()
        extra_roots = []
        if self.task_graph is not None:
            self.task_graph._build(replace=self.replacementObj)
            if 'input' in self.conf:
                # group input ports by node id
                self.inp_groups = group_ports(self.conf['input'])
                for inp in self.inp_groups.keys():
                    # ids not present in the loaded graph are skipped
                    if inp in self.task_graph:
                        inputNode = self.task_graph[inp]
                        update_inputs = []
                        replaced_ports = set(self.inp_groups[inp])
                        for oldInput in inputNode.inputs:
                            if oldInput['to_port'] in replaced_ports:
                                # we want to disconnect this old one and
                                # connect to external node
                                # NOTE: if self has no `inputs` attribute or
                                # no external input matches, nothing is
                                # appended and the old connection is dropped.
                                if hasattr(self, 'inputs'):
                                    for externalInput in self.inputs:
                                        # presumably _get_node/_get_port
                                        # split a combined node+port name;
                                        # verify against their definitions
                                        if (_get_node(externalInput['to_port'])
                                                == inputNode.uid and _get_port(
                                                    externalInput['to_port'])
                                                == oldInput['to_port']):
                                            newInput = {}
                                            newInput['to_port'] = _get_port(
                                                externalInput['to_port'])
                                            newInput[
                                                'from_port'] = externalInput[
                                                    'from_port']
                                            newInput[
                                                'from_node'] = externalInput[
                                                    'from_node']
                                            update_inputs.append(newInput)
                            else:
                                # ports that are not replaced keep their
                                # existing connection
                                update_inputs.append(oldInput)
                        inputNode.inputs = update_inputs

                        # add all the `updated` parents to the set
                        # (a parent counts as updated when it carries a
                        # `ports_setup_cache` attribute)
                        for i in inputNode.inputs:
                            if hasattr(i['from_node'], 'ports_setup_cache'):
                                extra_updated.add(i['from_node'])
                        # if all the parents are updated, this is
                        # a new root node
                        # (also true when the node has no inputs at all)
                        if all([
                                i['from_node'] in extra_updated
                                for i in inputNode.inputs
                        ]):
                            extra_roots.append(inputNode)

                        self.all_inputs.append((inputNode, inp))

            if 'output' in self.conf:
                # group output ports by node id
                self.oup_groups = group_ports(self.conf['output'])
                for oup in self.oup_groups.keys():
                    if oup in self.task_graph:
                        outNode = self.task_graph[oup]
                        # we do not disconnect anything here, as we take extra
                        # outputs for composite node.
                        # Note: we rely on the fact that taskgraph.run method
                        # will remove the output collector from taskgraph if
                        # the outputlist is set
                        self.all_outputs.append((outNode, oup))
                        # outNode_fun(outNode, oup_groups[oup])

            # update all the nodes and cache it
            self.task_graph.breadth_first_update(extra_roots=extra_roots,
                                                 extra_updated=extra_updated)
示例#22
0
    def process(self, inputs):
        """
        Composite computation: run the configured sub-taskgraph.

        Arguments
        -------
         inputs:
            container looked up with '<node uid>@<port>' keys; each value
            found is fed to the matching input port of the sub-graph.
        Returns
        -------
        dict
            sub-graph results keyed by the result key with every '.'
            replaced by '@'; an empty dict when the conf has no
            'taskgraph' entry.
        """
        if 'taskgraph' not in self.conf:
            return {}

        graph = TaskGraph.load_taskgraph(
            get_file_path(self.conf['taskgraph']))
        graph.build()

        requested = []
        rewiring = {}
        feeder_specs = []

        def wire_input(inputNode, in_ports):
            """Add a feeder task for ``inputNode`` and record its
            replacement input wiring."""
            inports = inputNode.ports_setup().inports

            class InputFeed(Node):

                def meta_setup(self):
                    # { input_port: columns } as reported by each node
                    # currently connected upstream
                    upstream = {
                        link['to_port']: link['from_node']
                        .meta_setup().outports[link['from_port']]
                        for link in inputNode.inputs
                    }
                    return MetaData(inports={}, outports=upstream)

                def ports_setup(self):
                    # the target's inports ({ input_port: types }) are
                    # exposed as this feeder's outports
                    return NodePorts(inports={}, outports=inports)

                def conf_schema(self):
                    return ConfSchema()

                def process(self, empty):
                    # emit only the ports the caller supplied a value for
                    fed = {}
                    for port in inports.keys():
                        lookup = inputNode.uid + '@' + port
                        if lookup in inputs:
                            fed[port] = inputs[lookup]
                    return fed

            feeder_id = str(uuid.uuid1())
            feeder_specs.append({
                TaskSpecSchema.task_id: feeder_id,
                TaskSpecSchema.conf: {},
                TaskSpecSchema.node_type: InputFeed,
                TaskSpecSchema.inputs: []
            })

            # ports with a supplied value are fed by the new feeder
            new_inputs = {
                port: feeder_id + '.' + port
                for port in inports.keys()
                if inputNode.uid + '@' + port in inputs
            }
            # connections on ports outside in_ports are kept
            for link in inputNode.inputs:
                if link['to_port'] in in_ports:
                    continue
                new_inputs[link['to_port']] = (
                    link['from_node'].uid + '.' + link['from_port'])
            rewiring[inputNode.uid] = {TaskSpecSchema.inputs: new_inputs}

        def collect_outputs(outNode, out_ports):
            """Request every outport of ``outNode`` that is connected."""
            # The ports passed in are not used; all of the node's own
            # outports are checked instead.
            for port in outNode.ports_setup().outports.keys():
                if self.outport_connected(outNode.uid + '@' + port):
                    requested.append(outNode.uid + '.' + port)

        self._make_sub_graph_connection(graph, wire_input, collect_outputs)

        graph.extend(feeder_specs)
        self.update_replace(rewiring, graph)
        result = graph.run(requested, replace=rewiring)

        # 'node.port' result keys are reported back as 'node@port'
        return {
            key.replace('.', '@'): result[key]
            for key in result.get_keys()
        }