def process(self, inputs):
    """
    Run inference on the input dataframe with a saved forest model
    (loaded via cuML ForestInference) and append the prediction column.
    """
    input_meta = self.get_input_meta()
    predict_col = self.conf.get('prediction', 'predict')
    data_df = inputs[self.INPUT_PORT_NAME]
    if self.INPUT_PORT_MODEL_NAME in input_meta:
        # use external information instead of conf
        filename = get_file_path(inputs[self.INPUT_PORT_MODEL_NAME])
        train_cols = input_meta[self.INPUT_PORT_MODEL_NAME]['train']
        train_cols = list(train_cols.keys())
    else:
        # use the conf information
        filename = get_file_path(self.conf['file'])
        if 'columns' in self.conf:
            if self.conf.get('include', True):
                train_cols = self.conf['columns']
            else:
                train_cols = [
                    col for col in data_df.columns
                    if col not in self.conf['columns']
                ]
    train_cols.sort()
    fm = ForestInference.load(
        filename, model_type=self.conf.get("model_type", "xgboost"))
    prediction = fm.predict(data_df[train_cols])
    prediction.index = data_df.index
    data_df[predict_col] = prediction
    return {self.OUTPUT_PORT_NAME: data_df}
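# Example configuration for this node (a minimal sketch; the file path and
# column names below are hypothetical, not part of the node itself):
#
#     conf = {
#         "file": "xgboost_model.mdl",            # saved forest model file
#         "columns": ["feature_0", "feature_1"],  # columns used for training
#         "include": True,                        # False would exclude them instead
#         "prediction": "predict",                # name of the output column
#         "model_type": "xgboost"
#     }
#
# When the model input port is connected, "file" and "columns" are ignored and
# the filename and train columns are taken from the upstream metadata instead.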
def _compute_hash_key(self):
    """
    If the hash changes, ports_setup, meta_setup and conf_json should be
    recomputed. In a very rare case there might be a hash collision; it
    would affect the column, port and conf calculation, but it would not
    change the computation result.

    Returns the hash code, the loaded task_graph and the replacement conf obj.
    """
    task_graph = ""
    inputs = ()
    replacementObj = {}
    input_node = ""
    task_graph_obj = None
    if 'taskgraph' in self.conf:
        task_graph = get_file_path(self.conf['taskgraph'])
        if os.path.exists(task_graph):
            with open(task_graph) as f:
                task_graph = hashlib.md5(f.read().encode()).hexdigest()
            task_graph_obj = TaskGraph.load_taskgraph(
                get_file_path(self.conf['taskgraph']))
    self.update_replace(replacementObj, task_graph_obj)
    if 'input' in self.conf:
        for inp in self.conf['input']:
            input_node += inp + ","
    if hasattr(self, 'inputs'):
        for i in self.inputs:
            inputs += (hash(i['from_node']), i['to_port'], i['from_port'])
    return (hash((self.uid, task_graph, inputs, json.dumps(self.conf),
                  input_node, json.dumps(replacementObj))),
            task_graph_obj, replacementObj)
def ports_setup(self):
    # only the hash part of the tuple returned by _compute_hash_key is
    # hashable, so use it as the cache key
    cache_key, _, _ = self._compute_hash_key()
    if cache_key in cache_ports:
        # print('cache hit')
        return cache_ports[cache_key]
    inports = {}
    outports = {}
    if 'taskgraph' in self.conf:
        task_graph = TaskGraph.load_taskgraph(
            get_file_path(self.conf['taskgraph']))
        replacementObj = {}
        self.update_replace(replacementObj)
        task_graph.build(replace=replacementObj)

        def inputNode_fun(inputNode, in_ports):
            inport = {}
            before_fix = inputNode.ports_setup().inports
            for key in before_fix.keys():
                if key in in_ports:
                    inport[key] = before_fix[key]
            inports.update(fix_port_name(inport, inputNode.uid))

        def outNode_fun(outNode, out_ports):
            ouport = {}
            before_fix = outNode.ports_setup().outports
            for key in before_fix.keys():
                if key in out_ports:
                    ouport[key] = before_fix[key]
            outports.update(fix_port_name(ouport, outNode.uid))

        self._make_sub_graph_connection(task_graph,
                                        inputNode_fun, outNode_fun)
    output_port = NodePorts(inports=inports, outports=outports)
    cache_ports[cache_key] = output_port
    return output_port
def process(self, inputs): """ Load the end of day stock CSV data into cuDF dataframe Arguments ------- inputs: list empty list Returns ------- cudf.DataFrame """ output = {} if self.outport_connected(CUDF_PORT_NAME): path = get_file_path(self.conf['file']) df = cudf.read_csv(path) # extract the year, month, day ymd = df['DTE'].astype( 'str').str.extract(r'(\d\d\d\d)(\d\d)(\d\d)') # construct the standard datetime str df['DTE'] = ymd[0].str.cat( ymd[1], '-').str.cat(ymd[2], '-').astype('datetime64[ms]') df = df[['DTE', 'OPEN', 'CLOSE', 'HIGH', 'LOW', 'SM_ID', 'VOLUME']] df['VOLUME'] /= 1000 # change the names df.columns = ['datetime', 'open', 'close', 'high', 'low', "asset", 'volume'] output.update({CUDF_PORT_NAME: df}) if self.outport_connected(PANDAS_PORT_NAME): path = get_file_path(self.conf['file']) df = pd.read_csv(path, converters={'DTE': lambda x: pd.Timestamp(str(x))}) df = df[['DTE', 'OPEN', 'CLOSE', 'HIGH', 'LOW', 'SM_ID', 'VOLUME']] df['VOLUME'] /= 1000 df.columns = ['datetime', 'open', 'close', 'high', 'low', "asset", 'volume'] output.update({PANDAS_PORT_NAME: df}) if self.outport_connected(DASK_CUDF_PORT_NAME): path = get_file_path(self.conf['path']) df = dask_cudf.read_csv(path+'/*.csv', parse_dates=['datetime']) output.update({DASK_CUDF_PORT_NAME: df}) return output
def init(self, class_obj):
    # make sure a default NeuralModuleFactory exists before any module is built
    if nemo.core.NeuralModuleFactory.get_default_factory() is None:
        nemo.core.NeuralModuleFactory()
    self.instanceClass = class_obj
    self.instance = None
    self.file_fields = []
    conf_para = get_conf_parameters(class_obj)
    self.fix_type = {}
    self.INPUT_NM = 'in_nm'
    self.OUTPUT_NM = 'out_nm'
    # force name/model/file parameters to be treated as strings in the schema
    for key in conf_para.keys():
        if key.find('name') >= 0:
            self.fix_type[key] = "string"
        if key.find('model') >= 0:
            self.fix_type[key] = "string"
        if key.find('file') >= 0:
            self.file_fields.append(key)
    for f in self.file_fields:
        self.fix_type[f] = 'string'
        if f in self.conf and self.conf[f]:
            self.conf[f] = get_file_path(self.conf[f])
    if not issubclass(class_obj, DataLayerNM):
        try:
            if issubclass(self.instanceClass, TrainableNM):
                # reuse the upstream module's conf if weight sharing is requested
                input_meta = self.get_input_meta()
                if self.INPUT_NM in input_meta:
                    if (share_weight in self.conf and
                            self.conf[share_weight] == 'Reuse'):
                        self.conf = input_meta[self.INPUT_NM]
            # reuse an already-registered module instance if one exists
            app = nemo.utils.app_state.AppState()
            ins = None
            for mod in app._module_registry:
                if isinstance(mod, self.instanceClass):
                    ins = mod
                    break
            if ins is None:
                ins = class_obj(**self.conf)
            if self.instance is None:
                self.instance = ins
        except Exception as e:
            print(e)
            pass
def columns_setup(self):
    self.required = {}
    column_types = {"asset": "int64",
                    "asset_name": "object"}
    out_cols = {
        STOCK_NAME_PORT_NAME: column_types,
    }
    if self.outport_connected(STOCK_MAP_PORT_NAME):
        if 'file' in self.conf:
            hash_key = self._compute_hash_key()
            if hash_key in cache_columns:
                out_cols.update(
                    {STOCK_MAP_PORT_NAME: cache_columns[hash_key]})
            else:
                path = get_file_path(self.conf['file'])
                name_df = cudf.read_csv(path)[['SM_ID', 'SYMBOL']]
                name_df.columns = ["asset", 'asset_name']
                pdf = name_df.to_pandas()
                column_data = pdf.to_dict('list')
                cache_columns[hash_key] = column_data
                out_cols.update({STOCK_MAP_PORT_NAME: column_data})
    return out_cols
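# The cached column data is the plain-dict form of the mapping dataframe
# produced by pdf.to_dict('list'), e.g. (values are hypothetical):
#
#     {"asset": [1001, 1002], "asset_name": ["AAPL", "MSFT"]}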
def process(self, inputs): """ Load the csv file mapping stock id to symbol name into cudf DataFrame Arguments ------- inputs: list empty list Returns ------- cudf.DataFrame """ output = {} if self.outport_connected(STOCK_NAME_PORT_NAME): path = get_file_path(self.conf['file']) name_df = cudf.read_csv(path)[['SM_ID', 'SYMBOL']] # change the names name_df.columns = ["asset", 'asset_name'] output.update({STOCK_NAME_PORT_NAME: name_df}) if self.outport_connected(STOCK_MAP_PORT_NAME): output.update({STOCK_MAP_PORT_NAME: StockMap()}) return output
def columns_setup(self):
    # only the hash part of the tuple returned by _compute_hash_key is
    # hashable, so use it as the cache key
    cache_key, _, _ = self._compute_hash_key()
    if cache_key in cache_columns:
        # print('cache hit')
        return cache_columns[cache_key]
    required = {}
    out_columns = {}
    if 'taskgraph' in self.conf:
        task_graph = TaskGraph.load_taskgraph(
            get_file_path(self.conf['taskgraph']))
        replacementObj = {}
        self.update_replace(replacementObj)
        task_graph.build(replace=replacementObj)

        def inputNode_fun(inputNode, in_ports):
            req = {}
            # run columns_setup so the required columns are ready
            inputNode.columns_setup()
            for key in inputNode.required.keys():
                if key in in_ports:
                    req[key] = inputNode.required[key]
            required.update(fix_port_name(req, inputNode.uid))

        def outNode_fun(outNode, out_ports):
            oucols = {}
            before_fix = outNode.columns_setup()
            for key in before_fix.keys():
                if key in out_ports:
                    oucols[key] = before_fix[key]
            out_columns.update(fix_port_name(oucols, outNode.uid))

        self._make_sub_graph_connection(task_graph,
                                        inputNode_fun, outNode_fun)
    self.required = required
    cache_columns[cache_key] = out_columns
    return out_columns
def process(self, inputs): """ Composite computation Arguments ------- inputs: list list of input dataframes. Returns ------- dataframe """ if 'taskgraph' in self.conf: task_graph = TaskGraph.load_taskgraph( get_file_path(self.conf['taskgraph'])) task_graph.build() outputLists = [] replaceObj = {} input_feeders = [] def inputNode_fun(inputNode, in_ports): inports = inputNode.ports_setup().inports class InputFeed(Node): def meta_setup(self): output = {} for inp in inputNode.inputs: output[inp['to_port']] = inp[ 'from_node'].meta_setup().outports[ inp['from_port']] # it will be something like { input_port: columns } return MetaData(inports={}, outports=output) def ports_setup(self): # it will be something like { input_port: types } return NodePorts(inports={}, outports=inports) def conf_schema(self): return ConfSchema() def process(self, empty): output = {} for key in inports.keys(): if inputNode.uid+'@'+key in inputs: output[key] = inputs[inputNode.uid+'@'+key] return output uni_id = str(uuid.uuid1()) obj = { TaskSpecSchema.task_id: uni_id, TaskSpecSchema.conf: {}, TaskSpecSchema.node_type: InputFeed, TaskSpecSchema.inputs: [] } input_feeders.append(obj) newInputs = {} for key in inports.keys(): if inputNode.uid+'@'+key in inputs: newInputs[key] = uni_id+'.'+key for inp in inputNode.inputs: if inp['to_port'] not in in_ports: # need to keep the old connections newInputs[inp['to_port']] = (inp['from_node'].uid + '.' + inp['from_port']) replaceObj.update({inputNode.uid: { TaskSpecSchema.inputs: newInputs} }) def outNode_fun(outNode, out_ports): out_ports = outNode.ports_setup().outports # fixed_outports = fix_port_name(out_ports, outNode.uid) for key in out_ports.keys(): if self.outport_connected(outNode.uid+'@'+key): outputLists.append(outNode.uid+'.'+key) self._make_sub_graph_connection(task_graph, inputNode_fun, outNode_fun) task_graph.extend(input_feeders) self.update_replace(replaceObj, task_graph) result = task_graph.run(outputLists, replace=replaceObj) output = {} for key in result.get_keys(): splits = key.split('.') output['@'.join(splits)] = result[key] return output else: return {}
def search_fun(config, checkpoint_dir=None):
    # pull the stored inputs out of the ray object store, converting pandas
    # dataframes back to cudf
    myinputs = {}
    for key in data_store.keys():
        v = ray.get(data_store[key])
        if isinstance(v, pandas.DataFrame):
            myinputs[key] = cudf.from_pandas(v)
        else:
            myinputs[key] = v
    task_graph = TaskGraph.load_taskgraph(
        get_file_path(self.conf['taskgraph']))
    task_graph.build()

    outputLists = [train_id + '.' + 'checkpoint_dir']
    replaceObj = {}
    input_feeders = []

    def inputNode_fun(inputNode, in_ports):
        inports = inputNode.ports_setup().inports

        class InputFeed(Node):

            def meta_setup(self):
                output = {}
                for inp in inputNode.inputs:
                    output[inp['to_port']] = inp[
                        'from_node'].meta_setup()[inp['from_port']]
                # it will be something like { input_port: columns }
                return output

            def ports_setup(self):
                # it will be something like { input_port: types }
                return NodePorts(inports={}, outports=inports)

            def conf_schema(self):
                return ConfSchema()

            def process(self, empty):
                output = {}
                for key in inports.keys():
                    if inputNode.uid + '@' + key in myinputs:
                        output[key] = myinputs[inputNode.uid + '@' + key]
                return output

        uni_id = str(uuid.uuid1())
        obj = {
            TaskSpecSchema.task_id: uni_id,
            TaskSpecSchema.conf: {},
            TaskSpecSchema.node_type: InputFeed,
            TaskSpecSchema.inputs: []
        }
        input_feeders.append(obj)
        newInputs = {}
        for key in inports.keys():
            if inputNode.uid + '@' + key in myinputs:
                newInputs[key] = uni_id + '.' + key
        for inp in inputNode.inputs:
            if inp['to_port'] not in in_ports:
                # need to keep the old connections
                newInputs[inp['to_port']] = (inp['from_node'].uid
                                             + '.' + inp['from_port'])
        replaceObj.update(
            {inputNode.uid: {
                TaskSpecSchema.inputs: newInputs
            }})

    def outNode_fun(outNode, out_ports):
        pass

    self._make_sub_graph_connection(task_graph,
                                    inputNode_fun, outNode_fun)
    task_graph.extend(input_feeders)
    self.update_conf_for_search(replaceObj, task_graph, config)
    task_graph.run(outputLists, replace=replaceObj)
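# search_fun follows Ray Tune's function-trainable signature
# (config, checkpoint_dir=None), so a sketch of launching the search could
# look like the following (the search space is hypothetical):
#
#     from ray import tune
#     analysis = tune.run(search_fun,
#                         config={"lr": tune.loguniform(1e-4, 1e-1)})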
def conf_schema(self):
    # only the hash part of the tuple returned by _compute_hash_key is
    # hashable, so use it as the cache key
    cache_key, _, _ = self._compute_hash_key()
    if cache_key in cache_schema:
        # print('cache hit')
        return cache_schema[cache_key]
    json = {
        "title": "Composite Node configure",
        "type": "object",
        "description": """Use a sub taskgraph as a composite node""",
        "properties": {
            "taskgraph": {
                "type": "string",
                "description": "the taskgraph filepath"
            },
            "input": {
                "type": "array",
                "description": "the input node ids",
                "items": {
                    "type": "string"
                }
            },
            "output": {
                "type": "array",
                "description": "the output node ids",
                "items": {
                    "type": "string"
                }
            },
            "subnode_ids": {
                "title": self.uid + " subnode ids",
                "type": "array",
                "items": {
                    "type": "string"
                },
                "description": """sub graph node ids that need to be
                reconfigured"""
            },
            "subnodes_conf": {
                "title": self.uid + " subnodes configuration",
                "type": "object",
                "properties": {}
            }
        },
        "required": ["taskgraph"],
    }
    ui = {
        "taskgraph": {"ui:widget": "TaskgraphSelector"},
        "subnodes_conf": {}
    }
    if 'taskgraph' in self.conf:
        task_graphh = TaskGraph.load_taskgraph(
            get_file_path(self.conf['taskgraph']))
        replacementObj = {}
        self.update_replace(replacementObj)
        task_graphh.build(replace=replacementObj)

        def inputNode_fun(inputNode, in_ports):
            pass

        def outNode_fun(outNode, out_ports):
            pass

        self._make_sub_graph_connection(task_graphh,
                                        inputNode_fun, outNode_fun)

        ids_in_graph = []
        in_ports = []
        out_ports = []
        for t in task_graphh:
            node_id = t.get('id')
            if node_id != '':
                node = task_graphh[node_id]
                all_ports = node.ports_setup()
                for port in all_ports.inports.keys():
                    in_ports.append(node_id + '.' + port)
                for port in all_ports.outports.keys():
                    out_ports.append(node_id + '.' + port)
                ids_in_graph.append(node_id)
        json['properties']['input']['items']['enum'] = in_ports
        json['properties']['output']['items']['enum'] = out_ports
        json['properties']['subnode_ids']['items']['enum'] = ids_in_graph
        if 'subnode_ids' in self.conf:
            for subnodeId in self.conf['subnode_ids']:
                if subnodeId in task_graphh:
                    nodeObj = task_graphh[subnodeId]
                    schema = nodeObj.conf_schema()
                    json['properties']["subnodes_conf"]['properties'][
                        subnodeId] = {
                            "type": "object",
                            "properties": {
                                "conf": schema.json
                            }
                    }
                    ui["subnodes_conf"].update({
                        subnodeId: {
                            'conf': schema.ui
                        }
                    })
    out_schema = ConfSchema(json=json, ui=ui)
    cache_schema[cache_key] = out_schema
    return out_schema
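# Example conf matching this schema (a sketch; the node ids, taskgraph path
# and subnode conf values are hypothetical):
#
#     conf = {
#         "taskgraph": "taskgraphs/sub_graph.gq.yaml",
#         "input": ["preprocess.stock_in"],
#         "output": ["train_node.model_out"],
#         "subnode_ids": ["train_node"],
#         "subnodes_conf": {
#             "train_node": {"conf": {"num_of_rounds": 100}}
#         }
#     }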