def _init_node_parm(self, key):
    """Load WDNN net-config and data-conf parameters onto this node.

    :param key: workflow key of the form "<nnid>_<ver>_<node>"
    :return: None
    """
    net_conf = WorkFlowNetConfWdnn(key)
    self.model_path = net_conf.model_path
    self.hidden_layers = net_conf.hidden_layers
    self.activation_function = net_conf.activation_function
    self.batch_size = net_conf.batch_size
    self.epoch = net_conf.epoch
    self.model_type = net_conf.model_type
    self.train = net_conf.train
    self.auto_demension = net_conf.auto_demension

    # Sibling node keys share the "<nnid>_<ver>" prefix.
    # TODO: ask Seungwoo how this should really be retrieved.
    prefix = key.split('_')[0] + '_' + key.split('_')[1]
    data_conf_node = wf_data_conf(prefix + '_' + 'dataconf_node')
    self.data_conf = data_conf_node.conf
    self.label = data_conf_node.label
    self.cell_feature = data_conf_node.cell_feature
    self.cross_cell = data_conf_node.cross_cell
    self.extend_cell_feature = data_conf_node.extend_cell_feature
    self.label_values = data_conf_node.label_values

    data_node = wf_data_node(prefix + '_' + 'data_node')
    self.multi_read_flag = data_node.multi_node_flag
    self.predict_path = data_node.predict_path
def _init_node_parm(self, key):
    """Load WDNN net-config and data-conf parameters onto this node.

    :param key: workflow key of the form "<nnid>_<ver>_<node>"
    :return: None
    """
    wf_net_conf = WorkFlowNetConfWdnn(key)
    self.wf_state_id = wf_net_conf.get_state_id(key).pk
    # bug fix: `netconfig` was assigned but never used; the call is kept
    # in case get_view_obj has side effects — confirm and drop if not.
    wf_net_conf.get_view_obj(key)
    self.model_path = wf_net_conf.model_path
    self.hidden_layers = wf_net_conf.hidden_layers
    self.activation_function = wf_net_conf.activation_function
    self.batch_size = wf_net_conf.batch_size
    self.epoch = wf_net_conf.epoch
    self.model_type = wf_net_conf.model_type
    # TODO: ask Seungwoo how this should really be retrieved.
    _wf_data_conf = wf_data_conf(key.split('_')[0] + '_' + key.split('_')[1] + '_' + 'dataconf_node')
    self.data_conf = _wf_data_conf.conf
    self.label = _wf_data_conf.label
    self.cell_feature = _wf_data_conf.cell_feature
    self.cross_cell = _wf_data_conf.cross_cell
    self.extend_cell_feature = _wf_data_conf.extend_cell_feature
    self.label_values = _wf_data_conf.label_values
    # bug fix: the original branched on "'test' in get_prev_node()[0].node_name"
    # but both branches executed identical statements — collapsed to one path.
    _wf_data_node = wf_data_node(key.split('_')[0] + '_' + key.split('_')[1] + '_' + 'data_node')
    self.multi_read_flag = _wf_data_node.multi_node_flag
def make_label_values(self, _data_dfconf_list, _df_csv_read):
    """Merge the label column's unique values into the DataConf.

    Args:
        _data_dfconf_list: workflow key ("<nnid>_<ver>_<node>") of the
            dataconf node for this nnid.
        _df_csv_read: source DataFrame (train or eval).

    Returns:
        tuple: (_label, _label_type) — label column name and its type, or
        (None, None) when the dataconf has no ``label`` attribute yet.
        (bug fix: the original returned unbound names in that case,
        raising NameError.)
    """
    _key = _data_dfconf_list
    _nnid = _key.split('_')[0]
    _ver = _key.split('_')[1]
    _node = 'dataconf_node'
    _label, _label_type = None, None  # bug fix: were unbound when no label exists
    _wf_data_conf = wf_data_conf(_key)
    if hasattr(_wf_data_conf, 'label'):
        _label = _wf_data_conf.label
        _label_type = _wf_data_conf.label_type
        # First run: no stored label values yet, start from an empty list.
        origin_labels_list = _wf_data_conf.label_values if hasattr(_wf_data_conf, 'label_values') else list()
        compare_labels_list = self.set_dataconf_for_labels(_df_csv_read, _label)
        # Merge the stored and freshly-seen values, then persist to the DB.
        self.combined_label_list = utils.get_combine_label_list(origin_labels_list, compare_labels_list)
        _data_conf = dict()
        _data_conf['label_values'] = self.combined_label_list
        if _label_type == 'CONTINUOUS':
            # Continuous labels have no discrete value set.
            _data_conf['label_values'] = list()
        _wf_data_conf.put_step_source(_nnid, _ver, _node, _data_conf)
    return _label, _label_type
def _init_node_parm(self, key):
    """Populate WDNN training parameters from the workflow configuration.

    :param key: workflow key of the form "<nnid>_<ver>_<node>"
    :return: None
    """
    net_conf = WorkFlowNetConfWdnn(key)
    self.model_path = net_conf.model_path
    self.hidden_layers = net_conf.hidden_layers
    self.activation_function = net_conf.activation_function
    self.batch_size = net_conf.batch_size
    self.epoch = net_conf.epoch
    self.model_type = net_conf.model_type
    self.train = net_conf.train
    self.auto_demension = net_conf.auto_demension
    self.optimizer_type = net_conf.optimizer_type
    self.learning_rates = net_conf.learning_rates

    # Sibling node keys share the "<nnid>_<ver>" prefix.
    # TODO: ask Seungwoo how this should really be retrieved.
    nnid, ver = key.split('_')[0], key.split('_')[1]
    data_conf_node = wf_data_conf(nnid + '_' + ver + '_' + 'dataconf_node')
    self.data_conf = data_conf_node.conf
    self.label = data_conf_node.label
    self.cell_feature = data_conf_node.cell_feature
    self.cross_cell = data_conf_node.cross_cell
    self.extend_cell_feature = data_conf_node.extend_cell_feature
    self.label_values = data_conf_node.label_values

    data_node = wf_data_node(nnid + '_' + ver + '_' + 'data_node')
    self.multi_read_flag = data_node.multi_node_flag
    self.predict_path = data_node.predict_path
def _init_node_parm(self, key):
    """Load WDNN net-config and data-conf parameters onto this node.

    :param key: workflow key of the form "<nnid>_<ver>_<node>"
    :return: None
    """
    wf_net_conf = WorkFlowNetConfWdnn(key)
    self.wf_state_id = wf_net_conf.get_state_id(key).pk
    # bug fix: the view object was assigned to an unused local; the call is
    # kept in case get_view_obj has side effects — confirm and drop if not.
    wf_net_conf.get_view_obj(key)
    self.model_path = wf_net_conf.model_path
    self.hidden_layers = wf_net_conf.hidden_layers
    self.activation_function = wf_net_conf.activation_function
    self.batch_size = wf_net_conf.batch_size
    self.epoch = wf_net_conf.epoch
    self.model_type = wf_net_conf.model_type
    # TODO: ask Seungwoo how this should really be retrieved.
    _wf_data_conf = wf_data_conf(
        key.split('_')[0] + '_' + key.split('_')[1] + '_' + 'dataconf_node')
    self.data_conf = _wf_data_conf.conf
    self.label = _wf_data_conf.label
    self.cell_feature = _wf_data_conf.cell_feature
    self.cross_cell = _wf_data_conf.cross_cell
    self.extend_cell_feature = _wf_data_conf.extend_cell_feature
    self.label_values = _wf_data_conf.label_values
    # bug fix: both branches of the original test/train if-else loaded the
    # same "data_node" key and set the same flag — collapsed to one path.
    _wf_data_node = wf_data_node(
        key.split('_')[0] + '_' + key.split('_')[1] + '_' + 'data_node')
    self.multi_read_flag = _wf_data_node.multi_node_flag
def make_column_types(self, df, node_id, data_dfconf_list):
    """Compute column types from the CSV DataFrame and store them in the
    data conf (used when the data conf is still empty).

    :param df: source DataFrame
    :param node_id: id of the data node being processed
    :param data_dfconf_list: workflow key of the dataconf node
    :return: data conf dict merged with per-column unique counts
    :raises Exception: re-raised after logging on any failure
    """
    try:
        type_conf, unique_json = self.set_dataconf_for_checktype(df, node_id, data_dfconf_list)
        unique_cnt = self.make_unique_value_each_column(df, node_id)
        type_conf.update(unique_cnt)
        # Result was unused in the original; the call is kept as-is.
        _ = self._get_forward_node_with_type(node_id, 'dataconf')
        conf_node = wf_data_conf(data_dfconf_list)
        if self.dataconf_first_time_check(conf_node, node_id):
            # First run: persist types, unique counts, then unique values.
            for payload in (type_conf, unique_cnt, unique_json):
                self.set_default_dataconf_from_csv(conf_node, node_id, payload)
        if self.dataconf_eval_time_check(conf_node, node_id):
            # Eval run: only the unique values need refreshing.
            self.set_default_dataconf_from_csv(conf_node, node_id, unique_json)
        return type_conf
    except Exception as e:
        logging.info("make column type Error {0} line no({1})".format(
            e, e.__traceback__.tb_lineno))
        raise Exception(e)
def _init_node_parm(self, key):
    """Load ML net-config and data-conf parameters onto this node.

    :param key: workflow key of the form "<nnid>_<ver>_<node>"
    :return: None
    """
    net_conf = WorkFlowNetConfML(key)
    self.model_path = net_conf.model_path
    self.ml_class = net_conf.ml_class
    self.config = net_conf.config
    self.batch_size = 10000  # fixed batch size for the ML path
    self.model_type = net_conf.model_type

    # Sibling node keys share the "<nnid>_<ver>" prefix.
    # TODO: ask Seungwoo how this should really be retrieved.
    nnid, ver = key.split('_')[0], key.split('_')[1]
    data_conf_node = wf_data_conf(nnid + '_' + ver + '_' + 'dataconf_node')
    self.data_conf = data_conf_node.conf
    self.label = data_conf_node.label
    self.cell_feature = data_conf_node.cell_feature
    self.cross_cell = data_conf_node.cross_cell
    self.extend_cell_feature = data_conf_node.extend_cell_feature
    self.label_values = data_conf_node.label_values

    data_node = wf_data_node(nnid + '_' + ver + '_' + 'data_node')
    self.multi_read_flag = data_node.multi_node_flag
    self.predict_path = data_node.predict_path
def make_label_values(self, _data_dfconf_list, _df_csv_read):
    """Merge the label column's unique values into the DataConf.

    Args:
        _data_dfconf_list: workflow key ("<nnid>_<ver>_<node>") of the
            dataconf node for this nnid.
        _df_csv_read: source DataFrame (train or eval).

    Returns:
        tuple: (_label, _label_type) — label column name and its type, or
        (None, None) when the dataconf has no ``label`` attribute yet.
        (bug fix: the original hit NameError in that case, since both
        names were only bound inside the hasattr branch.)
    """
    _key = _data_dfconf_list
    _nnid = _key.split('_')[0]
    _ver = _key.split('_')[1]
    _node = 'dataconf_node'
    _label, _label_type = None, None  # bug fix: were unbound when no label exists
    _wf_data_conf = wf_data_conf(_key)
    if hasattr(_wf_data_conf, 'label'):
        _label = _wf_data_conf.label
        _label_type = _wf_data_conf.label_type
        # First run: no stored label values yet, start from an empty list.
        origin_labels_list = _wf_data_conf.label_values if hasattr(_wf_data_conf, 'label_values') else list()
        compare_labels_list = self.set_dataconf_for_labels(_df_csv_read, _label)
        # Merge the stored and freshly-seen values, then persist to the DB.
        self.combined_label_list = utils.get_combine_label_list(origin_labels_list, compare_labels_list)
        _data_conf = dict()
        _data_conf['label_values'] = self.combined_label_list
        if _label_type == 'CONTINUOUS':
            # Continuous labels have no discrete value set.
            _data_conf['label_values'] = list()
        _wf_data_conf.put_step_source(_nnid, _ver, _node, _data_conf)
    return _label, _label_type
def _init_node_parm(self, key):
    """Initialise data-frame parameters from the workflow data conf.

    :param key: workflow key of the dataconf node
    :raises Exception: when any required parameter is missing
    """
    try:
        conf = wf_data_conf(key)
        self.label = conf.label
        self.cell_feature = conf.cell_feature
        self.cross_cell = conf.cross_cell
        self.extend_cell_feature = conf.extend_cell_feature
        self.label_values = conf.label_values
        self.label_type = conf.label_type
        # bugfix: guard so a missing node_name does not raise
        if hasattr(self, "node_name"):
            # Test nodes read the eval data frame, others the train frame.
            suffix = 'evaldata' if 'test' in self.__dict__.get("node_name") else 'data_node'
            frame = wf_data_frame(key.split('_')[0] + '_' + key.split('_')[1] + '_' + suffix)
            self.multi_node_flag = frame.multi_node_flag
    except Exception as e:
        raise Exception("WorkFlowDataFrame parms are not set " + str(e))
def set_dataconf_for_checktype(self, df, node_id, data_dfconf_list):
    """Read the CSV DataFrame, infer each column's type, and store the result
    in data_conf (used when data_conf is still empty).

    Numeric dtypes are tagged CONTINUOUS; everything else CATEGORICAL, and for
    categorical columns the unique values are merged into any previously
    stored ``cell_feature_unique`` entry (consumed by Keras).

    :param df: source DataFrame
    :param node_id: id of the data node being processed (unused here)
    :param data_dfconf_list: workflow key of the dataconf node
    :return: (data_conf_json, data_conf_unique_json), both round-tripped
        through JSON; implicitly None when an exception was logged below.
    """
    try:
        # TODO: clean up the parameters of set_default_dataconf_from_csv
        data_conf = dict()
        data_conf_unique_v = dict()
        data_conf_col_unique_v = dict()
        data_conf_col_type = dict()
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        # For WDNN exactly one data_dfconf exists, so this logic is possible.
        if len(data_dfconf_list) > 0:
            _wf_data_conf = wf_data_conf(data_dfconf_list)
            #_cell_feature = _wf_data_conf.cell_feature if hasattr(_wf_data_conf,'cell_feature') else list()
            # First run: nothing stored yet -> start from an empty container.
            _cell_feature_unique = _wf_data_conf.cell_feature_unique if hasattr(_wf_data_conf, 'cell_feature_unique') else list()
            for i, v in df.dtypes.iteritems():
                column_dtypes = dict()
                column_unique_value = dict()
                col_type = ''
                if (str(v) in numerics):  # maybe need float
                    col_type = 'CONTINUOUS'
                    columns_unique_value = list()
                else:
                    col_type = 'CATEGORICAL'
                    #columns_unique_value = pd.unique(df[i].values.ravel()).tolist()
                    # NaN must be filled before collecting uniques.
                    columns_unique_value = pd.unique(df[i].fillna('').values.ravel()).tolist()
                column_dtypes['column_type'] = col_type
                # Unique values previously stored for this categorical column.
                origin_feature_unique = _cell_feature_unique[i].get('column_u_values') if (i in _cell_feature_unique) else list()
                # Newly seen values are appended after the stored ones.
                combined_col_u_list = utils.get_combine_label_list(origin_feature_unique, columns_unique_value)
                column_unique_value['column_u_values'] = combined_col_u_list
                data_conf_col_type[i] = column_dtypes
                data_conf_col_unique_v[i] = column_unique_value
        # NOTE(review): source formatting was collapsed — these four lines are
        # assumed to sit after (outside) the `if` above; confirm against VCS.
        data_conf['cell_feature'] = data_conf_col_type
        data_conf_unique_v['cell_feature_unique'] = data_conf_col_unique_v
        # Round-trip through JSON to normalise keys/values to plain JSON types.
        data_conf_json_str = json.dumps(data_conf)
        data_conf_json = json.loads(data_conf_json_str)
        data_conf_unique_json_str = json.dumps(data_conf_unique_v)
        data_conf_unique_json = json.loads(data_conf_unique_json_str)
        return data_conf_json, data_conf_unique_json
    except Exception as e:
        # NOTE(review): the exception is swallowed here, so callers receive
        # None and fail on tuple unpacking — consider re-raising.
        logging.error("set_dataconf_for_checktype {0} {1}".format(e, e.__traceback__.tb_lineno))
def set_dataconf_for_checktype(self, df, node_id, data_dfconf_list):
    """Infer each CSV column's type and unique values for the data conf.

    Numeric dtypes are tagged CONTINUOUS; everything else CATEGORICAL, with
    the column's unique values merged into any previously stored ones
    (``cell_feature_unique``, consumed by Keras).

    :param df: source DataFrame
    :param node_id: id of the data node being processed (unused here; kept
        for interface compatibility)
    :param data_dfconf_list: workflow key of the dataconf node ('' when absent)
    :return: (data_conf_json, data_conf_unique_json) — plain-JSON dicts
    :raises Exception: re-raised after logging on any failure
    """
    try:
        # TODO: clean up the parameters of set_default_dataconf_from_csv
        data_conf = dict()
        data_conf_unique_v = dict()
        data_conf_col_unique_v = dict()
        data_conf_col_type = dict()
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        # For WDNN exactly one dataconf node exists, so a single key is enough.
        if len(data_dfconf_list) > 0:
            _wf_data_conf = wf_data_conf(data_dfconf_list)
            # First run: nothing stored yet -> start from an empty container.
            _cell_feature_unique = _wf_data_conf.cell_feature_unique if hasattr(_wf_data_conf, 'cell_feature_unique') else list()
            for col, dtype in df.dtypes.iteritems():
                column_dtypes = dict()
                column_unique_value = dict()
                if str(dtype) in numerics:  # maybe need float
                    col_type = 'CONTINUOUS'
                    columns_unique_value = list()
                else:
                    col_type = 'CATEGORICAL'
                    # NaN must be filled before collecting uniques.
                    columns_unique_value = pd.unique(df[col].fillna('').values.ravel()).tolist()
                column_dtypes['column_type'] = col_type
                # Unique values previously stored for this categorical column.
                origin_feature_unique = _cell_feature_unique[col].get('column_u_values') if (col in _cell_feature_unique) else list()
                # Newly seen values are appended after the stored ones.
                combined_col_u_list = utils.get_combine_label_list(origin_feature_unique, columns_unique_value)
                column_unique_value['column_u_values'] = combined_col_u_list
                data_conf_col_type[col] = column_dtypes
                data_conf_col_unique_v[col] = column_unique_value
        # Robustness: when data_dfconf_list is empty this now returns empty
        # (but well-formed) conf dicts instead of dying on an unbound name.
        data_conf['cell_feature'] = data_conf_col_type
        data_conf_unique_v['cell_feature_unique'] = data_conf_col_unique_v
        # Round-trip through JSON to normalise keys/values to plain JSON types.
        data_conf_json = json.loads(json.dumps(data_conf))
        data_conf_unique_json = json.loads(json.dumps(data_conf_unique_v))
        return data_conf_json, data_conf_unique_json
    except Exception as e:
        logging.error("set_dataconf_for_checktype {0} {1}".format(e, e.__traceback__.tb_lineno))
        # bug fix: the original swallowed the exception and implicitly returned
        # None, making callers fail on tuple unpacking — re-raise instead.
        raise
def make_column_types(self, df, node_id, data_dfconf_list):
    """Compute column types from the CSV DataFrame and store them in the
    data conf (used when the data conf is still empty).

    :param df: source DataFrame
    :param node_id: id of the data node being processed
    :param data_dfconf_list: workflow key of the dataconf node
    :return: data conf dict merged with per-column unique counts
    :raises Exception: re-raised after logging on any failure
    """
    try:
        data_conf, data_conf_unique_json = self.set_dataconf_for_checktype(df, node_id, data_dfconf_list)
        data_conf_unique_cnt = self.make_unique_value_each_column(df, node_id)
        data_conf.update(data_conf_unique_cnt)
        # Result was unused in the original; call kept in case of side effects.
        self._get_forward_node_with_type(node_id, 'dataconf')
        wf_data_conf_node = wf_data_conf(data_dfconf_list)
        if self.dataconf_first_time_check(wf_data_conf_node, node_id):
            # First run: persist types, unique counts, then unique values.
            self.set_default_dataconf_from_csv(wf_data_conf_node, node_id, data_conf)
            self.set_default_dataconf_from_csv(wf_data_conf_node, node_id, data_conf_unique_cnt)
            self.set_default_dataconf_from_csv(wf_data_conf_node, node_id, data_conf_unique_json)
        if self.dataconf_eval_time_check(wf_data_conf_node, node_id):
            # Eval run: only the unique values need refreshing.
            self.set_default_dataconf_from_csv(wf_data_conf_node, node_id, data_conf_unique_json)
        return data_conf
    except Exception as e:
        # bug fix: this failure path was logged at info level — use error.
        logging.error("make column type Error {0} line no({1})".format(e, e.__traceback__.tb_lineno))
        raise Exception(e)
def _init_node_parm(self, key):
    """Initialise data-frame parameters from the workflow data conf.

    :param key: workflow key of the dataconf node
    :raises Exception: when any required parameter is missing
    """
    try:
        _conf = wf_data_conf(key)
        # Copy the required conf attributes; a missing one raises and is
        # wrapped in the except below, exactly like direct attribute access.
        for _name in ('label', 'cell_feature', 'cross_cell',
                      'extend_cell_feature', 'label_values', 'label_type'):
            setattr(self, _name, getattr(_conf, _name))
        if hasattr(self, "node_name"):  # bugfix: skip when node_name is absent
            if 'test' in self.__dict__.get("node_name"):
                # Test nodes read the eval data frame.
                _frame = wf_data_frame(key.split('_')[0] + '_' + key.split('_')[1] + '_' + 'evaldata')
            else:
                _frame = wf_data_frame(key.split('_')[0] + '_' + key.split('_')[1] + '_' + 'data_node')
            self.multi_node_flag = _frame.multi_node_flag
    except Exception as e:
        raise Exception("WorkFlowDataFrame parms are not set " + str(e))
def _init_node_parm(self, key):
    """Load ML net-config and data-conf parameters onto this node.

    :param key: workflow key of the form "<nnid>_<ver>_<node>"
    :return: None
    """
    net_conf = WorkFlowNetConfML(key)
    self.model_path = net_conf.model_path
    self.ml_class = net_conf.ml_class
    self.config = net_conf.config
    self.batch_size = 10000  # fixed batch size for the ML path
    self.model_type = net_conf.model_type

    # Sibling node keys share the "<nnid>_<ver>" prefix.
    # TODO: ask Seungwoo how this should really be retrieved.
    prefix = key.split('_')[0] + '_' + key.split('_')[1]
    data_conf_node = wf_data_conf(prefix + '_' + 'dataconf_node')
    self.data_conf = data_conf_node.conf
    self.label = data_conf_node.label
    self.cell_feature = data_conf_node.cell_feature
    self.cross_cell = data_conf_node.cross_cell
    self.extend_cell_feature = data_conf_node.extend_cell_feature
    self.label_values = data_conf_node.label_values

    data_node = wf_data_node(prefix + '_' + 'data_node')
    self.multi_read_flag = data_node.multi_node_flag
    self.predict_path = data_node.predict_path
def src_local_handler(self, conf_data):
    """Convert local source CSVs into HDF5 (and optionally TFRecord) files.

    Each CSV under ``self.data_src_path`` is loaded, its column types and
    label values are pushed into the data conf, an HDF5 store is written,
    and the source file is backed up then removed.

    Arguments:
        conf_data : dict with at least 'node_id'; also data_source_path etc.
    Returns:
        None
    """
    try:
        logging.info("Data node starting : {0}".format(conf_data['node_id']))
        fp_list = utils.get_filepaths(self.data_src_path, file_type='csv')
        _multi_node_flag = self.multi_node_flag
        # NOTE(review): eval_data is built but never used below — confirm.
        eval_data = dict((_i, _k) for _i, _k in self.cls_list.items() if 'evaldata' in _i)
        try:
            # Find the dataconf node id among the workflow nodes.
            data_conf_node_id = ''
            for _i, _k in self.cls_list.items():
                if 'dataconf' in _i:
                    data_conf_node_id = _i
            # Needed to fetch the eval-category data; not needed when the
            # eval node itself is running.
            if 'data_node' not in conf_data['node_id']:
                self.get_eval_node_file_list(conf_data)
            data_dfconf_list = data_conf_node_id
            for file_path in fp_list:
                df_csv_read = self.load_csv_by_pandas(file_path)
                if 'dataconf' in data_dfconf_list:
                    # Derive and persist the column types of the csv.
                    self.data_conf = self.make_column_types(df_csv_read, conf_data['node_id'], data_conf_node_id)
                    # Eval data should also be included when computing unique values.
                    #self.make_unique_value_each_column(df_csv_read,conf_data['node_id'])
                self.create_hdf5(self.data_store_path, df_csv_read)
                # TODO: extract the WDNN-specific block below into a function.
                # For WDNN exactly one data_dfconf exists, so this logic is possible.
                if len(data_dfconf_list) > 0:
                    # TODO: can be simplified
                    _key = data_dfconf_list
                    _nnid = _key.split('_')[0]
                    _ver = _key.split('_')[1]
                    _node = 'dataconf_node'
                    _wf_data_conf = wf_data_conf(_key)
                    if hasattr(_wf_data_conf, 'label') == True:  # label check
                        _label = _wf_data_conf.label
                        _labe_type = _wf_data_conf.label_type
                        # First run: no stored label values yet -> empty list.
                        origin_labels_list = _wf_data_conf.label_values if hasattr(_wf_data_conf, 'label_values') else list()
                        compare_labels_list = self.set_dataconf_for_labels(df_csv_read, _label)
                        # Merge the lists, then update the DB.
                        self.combined_label_list = utils.get_combine_label_list(origin_labels_list, compare_labels_list)
                        _data_conf = dict()
                        _data_conf['label_values'] = self.combined_label_list
                        if _labe_type == 'CONTINUOUS':
                            _data_conf['label_values'] = list()
                        _wf_data_conf.put_step_source(_nnid, _ver, _node, _data_conf)
                        # Make a tfrecord for multi-threaded reading.
                        # NOTE(review): source formatting was collapsed — this
                        # branch is assumed to be inside the label check above
                        # (it uses _label/_labe_type); confirm against VCS.
                        if _multi_node_flag == True:
                            skip_header = False  # Todo Have to remove if production
                            self.save_tfrecord(file_path, self.data_store_path, skip_header, df_csv_read, _label, _labe_type)
                # Back up the processed csv, then delete the original.
                dir = self.data_src_path + "/backup"  # NOTE(review): shadows builtin `dir`
                if not os.path.exists(dir):
                    os.makedirs(dir)
                    #os.mkdir(self.data_src_path+"/backup")
                file_name_bk = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + ".csvbk"
                shutil.copy(file_path, self.data_src_path + "/backup/" + file_name_bk)
                os.remove(file_path)
                # Seungwoo's part
        except Exception as e:
            # NOTE(review): format string has no placeholder — the exception
            # detail `e` is silently dropped from this log line.
            logging.error("Datanode making h5 or tfrecord error".format(e))
            raise Exception(e)
        logging.info("Data node end : {0}".format(conf_data['node_id']))
        return None
    except Exception as e:
        raise Exception(e)