def process(self, inputs):
    input_meta = self.get_input_meta()
    predict_col = self.conf.get('prediction', 'predict')
    data_df = inputs[self.INPUT_PORT_NAME]
    if self.INPUT_PORT_MODEL_NAME in input_meta:
        # use external information instead of conf
        filename = get_file_path(inputs[self.INPUT_PORT_MODEL_NAME])
        train_cols = input_meta[self.INPUT_PORT_MODEL_NAME]['train']
        train_cols = list(train_cols.keys())
    else:
        # use the conf information
        filename = get_file_path(self.conf['file'])
        if 'columns' in self.conf:
            if self.conf.get('include', True):
                train_cols = self.conf['columns']
            else:
                train_cols = [
                    col for col in data_df.columns
                    if col not in self.conf['columns']
                ]
    # train_cols.sort()
    fm = ForestInference.load(filename,
                              model_type=self.conf.get("model_type",
                                                       "xgboost"))
    prediction = fm.predict(data_df[train_cols])
    prediction.index = data_df.index
    data_df[predict_col] = prediction
    return {self.OUTPUT_PORT_NAME: data_df}
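# A minimal, hypothetical `conf` sketch for the forest-inference node above,
# built only from the keys the method reads ('file', 'columns', 'include',
# 'prediction', 'model_type'). The path and column names are made-up
# placeholders; the authoritative schema is whatever the node's conf_schema
# defines.
#
# example_conf = {
#     'file': 'data/xgboost_model.mdl',   # hypothetical model file path
#     'columns': ['ma_10', 'ma_30'],      # hypothetical feature column names
#     'include': True,          # treat 'columns' as the training columns
#     'prediction': 'predict',  # name of the appended prediction column
#     'model_type': 'xgboost',  # passed through to ForestInference.load
# }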
def process(self, inputs): """ dump the input datafram to the resulting csv file. the output filepath is defined as `path` in the `conf`. if only a subset of columns is needed for the csv file, enumerate the columns in the `columns` of the `conf` Arguments ------- inputs: list list of input dataframes. Returns ------- dataframe """ raw_input_df = inputs[self.INPUT_PORT_NAME] if 'columns' in self.conf: raw_input_df = raw_input_df[self.conf['columns']] if isinstance(raw_input_df, dask_cudf.DataFrame): input_df = raw_input_df.compute() # get the computed value else: input_df = raw_input_df input_df.to_pandas().to_csv(get_file_path(self.conf['path']), index=False) return {self.OUTPUT_PORT_NAME: raw_input_df}
def process(self, inputs): """ Load the end of day stock CSV data into cuDF dataframe Arguments ------- inputs: list empty list Returns ------- cudf.DataFrame """ output = {} if self.outport_connected(CUDF_PORT_NAME): path = get_file_path(self.conf['file']) df = cudf.read_csv(path) # extract the year, month, day ymd = df['DTE'].astype( 'str').str.extract(r'(\d\d\d\d)(\d\d)(\d\d)') # construct the standard datetime str df['DTE'] = ymd[0].str.cat( ymd[1], '-').str.cat(ymd[2], '-').astype('datetime64[ms]') df = df[['DTE', 'OPEN', 'CLOSE', 'HIGH', 'LOW', 'SM_ID', 'VOLUME']] df['VOLUME'] /= 1000 # change the names df.columns = ['datetime', 'open', 'close', 'high', 'low', "asset", 'volume'] output.update({CUDF_PORT_NAME: df}) if self.outport_connected(PANDAS_PORT_NAME): path = get_file_path(self.conf['file']) df = pd.read_csv(path, converters={'DTE': lambda x: pd.Timestamp(str(x))}) df = df[['DTE', 'OPEN', 'CLOSE', 'HIGH', 'LOW', 'SM_ID', 'VOLUME']] df['VOLUME'] /= 1000 df.columns = ['datetime', 'open', 'close', 'high', 'low', "asset", 'volume'] output.update({PANDAS_PORT_NAME: df}) if self.outport_connected(DASK_CUDF_PORT_NAME): path = get_file_path(self.conf['path']) df = dask_cudf.read_csv(path+'/*.csv', parse_dates=['datetime']) output.update({DASK_CUDF_PORT_NAME: df}) return output
def _compute_hash_key(self): """ if hash changed, the port_setup, meta_setup and conf_json should be different In very rara case, might have the problem of hash collision, It affects the column, port and conf calculation. It won't change the computation result though. It returns the hash code, the loaded task_graph, the replacement conf obj """ task_graph = "" inputs = () replacementObj = {} input_node = "" task_graph_obj = None if 'taskgraph' in self.conf: try: task_graph = get_file_path(self.conf['taskgraph']) except FileNotFoundError: task_graph = None if task_graph is not None and os.path.exists(task_graph): with open(task_graph) as f: task_graph = hashlib.md5(f.read().encode()).hexdigest() task_graph_obj = TaskGraph.load_taskgraph( get_file_path(self.conf['taskgraph'])) self.update_replace(replacementObj, task_graph_obj) if 'input' in self.conf: for inp in self.conf['input']: input_node += inp+"," if hasattr(self, 'inputs'): for i in self.inputs: inputs += (hash(i['from_node']), i['to_port'], i['from_port']) return (hash((self.uid, task_graph, inputs, json.dumps(self.conf), input_node, json.dumps(replacementObj))), task_graph_obj, replacementObj)
def process(self, inputs):
    import dask.distributed
    try:
        client = dask.distributed.client.default_client()
    except ValueError:
        from dask_cuda import LocalCUDACluster
        cluster = LocalCUDACluster()
        from dask.distributed import Client
        client = Client(cluster)  # noqa
        print('start new Cluster')
    filename = get_file_path(self.conf['csvfile'])
    df = cudf.read_csv(filename, parse_dates=[0])
    df.columns = ['date'] + [c for c in df.columns][1:]
    output = {}
    if self.outport_connected('df_out'):
        output.update({'df_out': df})
    return output
def process(self, inputs): """ dump the model into the file Arguments ------- inputs: list list of input dataframes. Returns ------- dataframe """ model = inputs[self.INPUT_PORT_NAME] if isinstance(model, dict): model = model['booster'] pathname = get_file_path(self.conf['path']) model.save_model(pathname) return {self.OUTPUT_PORT_NAME: pathname}
def init(self, class_obj):
    if nemo.core.NeuralModuleFactory.get_default_factory() is None:
        nemo.core.NeuralModuleFactory()
    self.instanceClass = class_obj
    self.instance = None
    self.file_fields = []
    conf_para = get_conf_parameters(class_obj)
    self.fix_type = {}
    self.INPUT_NM = 'in_nm'
    self.OUTPUT_NM = 'out_nm'
    for key in conf_para.keys():
        if key.find('name') >= 0:
            self.fix_type[key] = "string"
        if key.find('model') >= 0:
            self.fix_type[key] = "string"
        if key.find('file') >= 0:
            self.file_fields.append(key)
    for f in self.file_fields:
        self.fix_type[f] = 'string'
        if f in self.conf and self.conf[f]:
            self.conf[f] = get_file_path(self.conf[f])
    if not issubclass(class_obj, DataLayerNM):
        try:
            if issubclass(self.instanceClass, TrainableNM):
                input_meta = self.get_input_meta()
                if self.INPUT_NM in input_meta:
                    if (share_weight in self.conf and
                            self.conf[share_weight] == 'Reuse'):
                        self.conf = input_meta[self.INPUT_NM]
            app = nemo.utils.app_state.AppState()
            ins = None
            for mod in app._module_registry:
                if isinstance(mod, self.instanceClass):
                    ins = mod
                    break
            if ins is None:
                ins = class_obj(**self.conf)
            if self.instance is None:
                self.instance = ins
        except Exception as e:
            print(e)
            pass
def process(self, inputs): """ Load the csv file mapping stock id to symbol name into cudf DataFrame Arguments ------- inputs: list empty list Returns ------- cudf.DataFrame """ output = {} if self.outport_connected(STOCK_NAME_PORT_NAME): path = get_file_path(self.conf['file']) name_df = cudf.read_csv(path)[['SM_ID', 'SYMBOL']] # change the names name_df.columns = ["asset", 'asset_name'] output.update({STOCK_NAME_PORT_NAME: name_df}) if self.outport_connected(STOCK_MAP_PORT_NAME): output.update({STOCK_MAP_PORT_NAME: StockMap()}) return output
def meta_setup(self):
    required = {}
    column_types = {"asset": "int64",
                    "asset_name": "object"}
    out_cols = {
        STOCK_NAME_PORT_NAME: column_types,
    }
    if self.outport_connected(STOCK_MAP_PORT_NAME):
        if 'file' in self.conf:
            hash_key = self._compute_hash_key()
            if hash_key in CACHE_NAME:
                out_cols.update(
                    {STOCK_MAP_PORT_NAME: CACHE_NAME[hash_key]})
            else:
                path = get_file_path(self.conf['file'])
                name_df = cudf.read_csv(path)[['SM_ID', 'SYMBOL']]
                name_df.columns = ["asset", 'asset_name']
                pdf = name_df.to_pandas()
                column_data = pdf.to_dict('list')
                CACHE_NAME[hash_key] = column_data
                out_cols.update({STOCK_MAP_PORT_NAME: column_data})
    metadata = MetaData(inports=required, outports=out_cols)
    return metadata
def process(self, inputs): """ Composite computation Arguments ------- inputs: list list of input dataframes. Returns ------- dataframe """ if 'taskgraph' in self.conf: task_graph = TaskGraph.load_taskgraph( get_file_path(self.conf['taskgraph'])) task_graph.build() outputLists = [] replaceObj = {} input_feeders = [] def inputNode_fun(inputNode, in_ports): inports = inputNode.ports_setup().inports class InputFeed(Node): def meta_setup(self): output = {} for inp in inputNode.inputs: output[inp['to_port']] = inp[ 'from_node'].meta_setup().outports[ inp['from_port']] # it will be something like { input_port: columns } return MetaData(inports={}, outports=output) def ports_setup(self): # it will be something like { input_port: types } return NodePorts(inports={}, outports=inports) def conf_schema(self): return ConfSchema() def process(self, empty): output = {} for key in inports.keys(): if inputNode.uid+'@'+key in inputs: output[key] = inputs[inputNode.uid+'@'+key] return output uni_id = str(uuid.uuid1()) obj = { TaskSpecSchema.task_id: uni_id, TaskSpecSchema.conf: {}, TaskSpecSchema.node_type: InputFeed, TaskSpecSchema.inputs: [] } input_feeders.append(obj) newInputs = {} for key in inports.keys(): if inputNode.uid+'@'+key in inputs: newInputs[key] = uni_id+'.'+key for inp in inputNode.inputs: if inp['to_port'] not in in_ports: # need to keep the old connections newInputs[inp['to_port']] = (inp['from_node'].uid + '.' + inp['from_port']) replaceObj.update({inputNode.uid: { TaskSpecSchema.inputs: newInputs} }) def outNode_fun(outNode, out_ports): out_ports = outNode.ports_setup().outports # fixed_outports = fix_port_name(out_ports, outNode.uid) for key in out_ports.keys(): if self.outport_connected(outNode.uid+'@'+key): outputLists.append(outNode.uid+'.'+key) self._make_sub_graph_connection(task_graph, inputNode_fun, outNode_fun) task_graph.extend(input_feeders) self.update_replace(replaceObj, task_graph) result = task_graph.run(outputLists, replace=replaceObj) output = {} for key in result.get_keys(): splits = key.split('.') output['@'.join(splits)] = result[key] return output else: return {}
def update(self):
    TemplateNodeMixin.update(self)
    self.conf_update()  # update the conf
    task_graph = ""
    replacementObj = {}
    task_graph_obj = None
    if 'taskgraph' in self.conf:
        try:
            task_graph = get_file_path(self.conf['taskgraph'])
        except FileNotFoundError:
            task_graph = None
        if task_graph is not None and os.path.exists(task_graph):
            # with open(task_graph) as f:
            #     task_graph = hashlib.md5(f.read().encode()).hexdigest()
            task_graph_obj = TaskGraph.load_taskgraph(
                get_file_path(self.conf['taskgraph']))
    self.all_inputs = []
    self.all_outputs = []
    self.task_graph = task_graph_obj
    self.update_replace(replacementObj, task_graph_obj)
    self.replacementObj = replacementObj
    extra_updated = set()
    extra_roots = []
    if self.task_graph is not None:
        self.task_graph._build(replace=self.replacementObj)
        if 'input' in self.conf:
            # group input ports by node id
            self.inp_groups = group_ports(self.conf['input'])
            for inp in self.inp_groups.keys():
                if inp in self.task_graph:
                    inputNode = self.task_graph[inp]
                    update_inputs = []
                    replaced_ports = set(self.inp_groups[inp])
                    for oldInput in inputNode.inputs:
                        if oldInput['to_port'] in replaced_ports:
                            # we want to disconnect this old one and
                            # connect to external node
                            if hasattr(self, 'inputs'):
                                for externalInput in self.inputs:
                                    if (_get_node(
                                            externalInput['to_port'])
                                            == inputNode.uid and
                                            _get_port(
                                                externalInput['to_port'])
                                            == oldInput['to_port']):
                                        newInput = {}
                                        newInput['to_port'] = _get_port(
                                            externalInput['to_port'])
                                        newInput[
                                            'from_port'] = externalInput[
                                                'from_port']
                                        newInput[
                                            'from_node'] = externalInput[
                                                'from_node']
                                        update_inputs.append(newInput)
                        else:
                            update_inputs.append(oldInput)
                    inputNode.inputs = update_inputs

                    # add all the `updated` parents to the set
                    for i in inputNode.inputs:
                        if hasattr(i['from_node'], 'ports_setup_cache'):
                            extra_updated.add(i['from_node'])
                    # if all the parents are updated, this is
                    # a new root node
                    if all([
                            i['from_node'] in extra_updated
                            for i in inputNode.inputs
                    ]):
                        extra_roots.append(inputNode)
                    self.all_inputs.append((inputNode, inp))

        if 'output' in self.conf:
            self.oup_groups = group_ports(self.conf['output'])
            for oup in self.oup_groups.keys():
                if oup in self.task_graph:
                    outNode = self.task_graph[oup]
                    # we do not disconnect anything here, as we take extra
                    # outputs for composite node.
                    # Note, we rely on the fact that taskgraph.run method
                    # will remove the output collector from taskgraph if
                    # the outputlist is set
                    self.all_outputs.append((outNode, oup))
                    # outNode_fun(outNode, oup_groups[oup])

        # update all the nodes and cache it
        self.task_graph.breadth_first_update(extra_roots=extra_roots,
                                             extra_updated=extra_updated)
def search_fun(config, checkpoint_dir=None):
    myinputs = {}
    for key in data_store.keys():
        v = ray.get(data_store[key])
        if isinstance(v, pandas.DataFrame):
            myinputs[key] = cudf.from_pandas(v)
        else:
            myinputs[key] = v
    task_graph = TaskGraph.load_taskgraph(
        get_file_path(self.conf['taskgraph']))
    task_graph.build()

    outputLists = [train_id + '.' + 'checkpoint_dir']
    replaceObj = {}
    input_feeders = []

    def inputNode_fun(inputNode, in_ports):
        inports = inputNode.ports_setup().inports

        class InputFeed(Node):

            def meta_setup(self):
                output = {}
                for inp in inputNode.inputs:
                    output[inp['to_port']] = inp[
                        'from_node'].meta_setup()[inp['from_port']]
                # it will be something like { input_port: columns }
                return output

            def ports_setup(self):
                # it will be something like { input_port: types }
                return NodePorts(inports={}, outports=inports)

            def conf_schema(self):
                return ConfSchema()

            def process(self, empty):
                output = {}
                for key in inports.keys():
                    if (inputNode.uid + '@' + key in myinputs):
                        output[key] = myinputs[inputNode.uid
                                               + '@' + key]
                return output

        uni_id = str(uuid.uuid1())
        obj = {
            TaskSpecSchema.task_id: uni_id,
            TaskSpecSchema.conf: {},
            TaskSpecSchema.node_type: InputFeed,
            TaskSpecSchema.inputs: []
        }
        input_feeders.append(obj)
        newInputs = {}
        for key in inports.keys():
            if inputNode.uid + '@' + key in myinputs:
                newInputs[key] = uni_id + '.' + key
        for inp in inputNode.inputs:
            if inp['to_port'] not in in_ports:
                # need to keep the old connections
                newInputs[inp['to_port']] = (inp['from_node'].uid
                                             + '.' + inp['from_port'])
        replaceObj.update(
            {inputNode.uid: {
                TaskSpecSchema.inputs: newInputs
            }})

    def outNode_fun(outNode, out_ports):
        pass

    self._make_sub_graph_connection(task_graph,
                                    inputNode_fun, outNode_fun)

    task_graph.extend(input_feeders)
    self.update_conf_for_search(replaceObj, task_graph, config)
    task_graph.run(outputLists, replace=replaceObj)
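# search_fun follows Ray Tune's function-trainable signature
# (config, checkpoint_dir=None), so it is presumably handed to tune.run by
# the surrounding hyperparameter-search node. A hedged sketch of such a call,
# with a made-up search space, might look like:
#
# from ray import tune
# analysis = tune.run(search_fun,
#                     config={'depth': tune.grid_search([3, 5, 7])})  # hypothetical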