def run_train_job(self, framework, conf: ExperimentConf, no_experiment: int, model_input_features: list, n_rows: int):
    # 1. create train conf
    job_name = f"train_job_{conf.dataset_name}_{framework}_{util.human_datetime()}"
    model_name = util.model_name(conf.dataset_name, no_experiment)
    model_dir = util.model_dir(conf.dataset_name, model_name)
    os.makedirs(model_dir)

    train_source_code_path = P.join(model_dir, 'train.py')
    train_log = P.join(model_dir, 'train.log')

    train_job_conf = TrainJobConf(framework=framework,
                                  name=job_name,
                                  model_name=model_name,
                                  searcher=TrainJobConf.Searcher.MCTSSearcher,
                                  max_trails=consts.TRAIN_MODE_MAX_TRAILS_MAPPING[conf.train_mode],
                                  search_space=TrainJobConf.SearchSpace.Minimal)

    # 2. insert into db
    with db.open_session() as s:
        self.create_temporary_model(s, model_name, no_experiment, model_input_features, conf, train_job_conf)

    # 3. generate train source code
    train_source_code, notebook_content = self.generate_code(model_name, model_input_features, n_rows, train_job_conf, conf)
    with open(train_source_code_path, 'w', encoding='utf-8') as f:
        f.write(train_source_code)

    notebook_file_path = P.join(model_dir, 'train.ipynb')
    with open(notebook_file_path, 'w', encoding='utf-8') as f:
        f.write(notebook_content)

    # 4. run train process
    # Note: do not append "&" to the command; bash would then fork the job into
    # the background and the pid recorded below would belong to bash, not the
    # training process (off by one), which caused a bug.
    command = f"nohup {sys.executable} {train_source_code_path} 1>{train_log} 2>&1"
    log.info(f"Run train job command: \n{command}")
    log.info(f"Log file:\ntail -f {train_log}")
    log.info(f"Train source code:\n{train_source_code_path}")
    train_process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.PIPE)

    with db.open_session() as s:
        self.model_dao.update_model_by_name(s, model_name, {"pid": train_process.pid})

    return train_job_conf.to_dict()
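
# A minimal, self-contained sketch (not part of the service) illustrating the
# pid note in run_train_job above: with a trailing "&", bash forks the command
# into the background and Popen.pid refers to bash itself, so the recorded
# training pid would be wrong; run in the foreground and bash execs the
# command, making Popen.pid the command's own pid. "sleep" stands in for the
# training script here.
def _demo_pid_tracking():
    import subprocess
    # foreground: bash execs the command, so the pid is usable for monitoring
    p = subprocess.Popen(["bash", "-c", "nohup sleep 1 1>/dev/null 2>&1"])
    print("tracked pid:", p.pid)  # the pid of the sleep process itself
    p.wait()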
def delete(self, dataset_name):
    with db.open_session() as s:
        # 1. delete record
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        is_temporary = dataset.is_temporary
        self.dataset_dao.delete(s, dataset_name)

        # 2. delete files, but only for non-temporary datasets
        if is_temporary is False:
            if "/" not in dataset_name and len(dataset_name) > 1:
                if len(consts.PATH_DATASET) > 1:
                    dataset_dir = P.join(consts.PATH_DATASET, dataset_name)
                    if P.exists(dataset_dir) and P.isdir(dataset_dir):
                        logger.info(f"Remove file at: {dataset_dir}")
                        shutil.rmtree(dataset_dir)
                    else:
                        raise ValueError(f"Dataset dir {dataset_dir} does not exist or is not a directory; this may be a bug.")
                else:
                    raise ValueError("Data dir is too short, can not delete.")
            else:
                raise ValueError("Dataset name contains '/' or is too short.")
def retrieve(self, dataset_name, n_top_value):
    with db.open_session() as s:
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        dict_value = util.sqlalchemy_obj_to_dict(dataset)
        dict_value['file_path'] = util.relative_path(dataset.file_path)

        # if dataset.status == DatasetEntity.Status.Analyzed:
        #     for i, f in enumerate(dict_value['features']):
        #         if f['type'] in [FeatureType.Categorical, FeatureType.Continuous]:
        #             if f['unique']['value'] > n_top_value:
        #                 # calc top {n_count_value}
        #                 extension = f['extension']
        #                 sorted(extension['value_count'], key=lambda _: _['value'])
        #
        #                 top_value_count = extension['value_count'][: n_top_value]
        #                 remain_value_count = extension['value_count'][n_top_value:]
        #                 remain_count = 0
        #                 for remain_dict in remain_value_count:
        #                     remain_count = remain_count + remain_dict['value']
        #
        #                 top_value_count.append(
        #                     FeatureValueCount(type="Remained_SUM", value=remain_count).to_dict()
        #                 )
        #                 dict_value['features'][i]['extension']['value_count'] = top_value_count
        #                 extension['value_count'] = top_value_count
        # dict_value['detail'] = dict_value['extension']
        extension = dict_value.pop('extension')
        dict_value['extension'] = {"sample_conf": extension['sample_conf']}
        return dict_value
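
# A self-contained sketch of the disabled top-N logic in retrieve() above:
# keep the n most frequent values of a feature and collapse the tail into one
# "Remained_SUM" bucket, so a response never carries thousands of value
# counts. Plain dicts stand in for FeatureValueCount objects, and the sort is
# descending here (the commented-out code discarded its sort result).
def _demo_top_n_value_count(n_top_value=2):
    value_count = [{"type": "a", "value": 5}, {"type": "b", "value": 3}, {"type": "c", "value": 1}]
    value_count.sort(key=lambda d: d["value"], reverse=True)  # most frequent first
    top = value_count[:n_top_value]
    remain_count = sum(d["value"] for d in value_count[n_top_value:])
    top.append({"type": "Remained_SUM", "value": remain_count})
    return top  # [{'type': 'a', 'value': 5}, {'type': 'b', 'value': 3}, {'type': 'Remained_SUM', 'value': 1}]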
def brevity_dataset_pagination(self, req_dict):
    # 1. read params
    page_num = util.require_in_dict(req_dict, 'page_num', int, default=1)
    page_size = util.require_in_dict(req_dict, 'page_size', int, default=10)
    query = util.get_from_dict(req_dict, 'query', str)
    order_by = util.require_in_dict(req_dict, 'order_by', str, default="create_datetime")
    order = util.require_in_dict(req_dict, 'order', str, default="desc")

    allow_order_by_fields = ["create_datetime", "n_experiments", "size", "n_rows", "n_cols"]
    if order_by not in allow_order_by_fields:
        raise ValueError(f"Order by field should be in {','.join(allow_order_by_fields)}, but input is: {order_by}")

    allow_order_strategies = ["desc", "asc"]
    if order not in allow_order_strategies:
        raise ValueError(f"Order strategy should be in {','.join(allow_order_strategies)}, but input is: {order}")

    if page_num < 1:
        raise ValueError("Param page_num should be >= 1.")
    if page_size < 1:
        raise ValueError("Param page_size should be > 0.")

    def _handle(model_dao, session, dataset: DatasetEntity):
        d = util.sqlalchemy_obj_to_dict(dataset)
        d['file_path'] = util.relative_path(dataset.file_path)
        d['create_datetime'] = util.to_timestamp(dataset.create_datetime)
        d['n_experiments'] = model_dao.query_n_experiment(session, dataset.name)
        del d['features']
        del d['feature_summary']
        del d['extension']
        return d

    # 2. query
    with db.open_session() as s:
        datasets, total = self.dataset_dao.pagination(s, page_num, page_size, query, order_by, order)
        datasets = [_handle(self.model_dao, s, d) for d in datasets]
        return datasets, total
def retrieve_model(self, model_name):
    def _replace_NaN(v):
        # NaN is not JSON-serializable; map it to None (None and non-floats pass through)
        if isinstance(v, float) and math.isnan(v):
            return None
        return v

    with db.open_session() as s:
        model = self.require_model(s, model_name)

        # handle trails
        if model.trails is not None and len(model.trails) > 0:
            param_names = [k for k in model.trails[0].params]
            trail_data_dict_list = []
            trail_params_values = []
            trail_index = []
            for t in model.trails:
                param_values = [_replace_NaN(t.params.get(n)) for n in param_names]
                trail_params_values.append(param_values)
                trail_index.append(t.trail_no)

            df_train_params = pd.DataFrame(data=trail_params_values, columns=param_names)
            # drop params that are None in every trail
            df_train_params.dropna(axis=1, how='all', inplace=True)

            for i, t in enumerate(model.trails):
                trail_data_dict = {
                    "reward": t.reward,
                    "params": [_replace_NaN(_) for _ in df_train_params.iloc[i].tolist()],
                    "elapsed": t.elapsed
                }
                trail_data_dict_list.append(trail_data_dict)

            if len(df_train_params.columns.values) > 0:  # ensure not all params are None
                trails_dict = {
                    "param_names": df_train_params.columns.tolist(),
                    "data": trail_data_dict_list
                }
            else:
                trails_dict = {}
        else:
            trails_dict = {}

        model_dict = model.to_dict()
        # update trails
        model_dict['trails'] = trails_dict
        model_dict['model_path'] = util.relative_path(model_dict['model_path'])
        model_dict['escaped'] = model.escaped_time_by_seconds()
        model_dict['log_file_path'] = model.log_file_path()
        model_dict['train_source_code_path'] = model.train_source_code_path()
        model_dict['train_notebook_uri'] = model.train_notebook_uri()
        return model_dict
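
# A small sketch of the param-matrix cleanup in retrieve_model() above: the
# params of all trails are loaded into a DataFrame so that columns which are
# None in every trail can be dropped in one call. Param names and values here
# are illustrative.
def _demo_drop_all_none_params():
    import pandas as pd
    df = pd.DataFrame([{"learning_rate": 0.1, "depth": None},
                       {"learning_rate": 0.2, "depth": None}])
    df.dropna(axis=1, how="all", inplace=True)  # "depth" is None everywhere, dropped
    return df.columns.tolist()  # ['learning_rate']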
def create_dataset(self, dataset_name, temporary_dataset_name):
    with db.open_session() as s:
        # 1. check temporary dataset
        temporary_dataset = s.query(DatasetEntity).filter(
            DatasetEntity.name == temporary_dataset_name).first()
        if temporary_dataset is None:
            raise EntityNotExistsException(DatasetEntity, temporary_dataset_name)
        if temporary_dataset.status != DatasetEntity.Status.Analyzed:
            raise IllegalParamException('dataset_name', temporary_dataset_name,
                                        f'Dataset is not ready, status is {temporary_dataset.status}')

        # 2. check dataset name
        new_dataset_dir = P.join(consts.PATH_DATASET, dataset_name)
        if P.exists(new_dataset_dir):
            raise IllegalParamException('dataset_name', dataset_name,
                                        f'File path {new_dataset_dir} of dataset already exists')

        # 3. read temporary dataset
        # temporary_dataset_dict = util.loads(temporary_dataset.extension)

        # 4. make dataset dir; this can not be rolled back, so the code below should be robust
        os.makedirs(new_dataset_dir, exist_ok=False)

        # 5. move file
        file_path = temporary_dataset.get_abs_file_path()
        new_dataset_file_path = P.join(new_dataset_dir, f'data{util.get_file_suffix(file_path)}')
        shutil.copy(file_path, new_dataset_file_path)

        # 6. create meta.json
        # temporary_dataset_dict['name'] = dataset_name
        # temporary_dataset_dict['create_datetime'] = util.get_now_long()
        # with open(P.join(new_dataset_dir, 'meta.json'), 'w') as f:
        #     f.write(util.dumps(temporary_dataset_dict))
        properties = {
            "is_temporary": False,
            "name": dataset_name,
            "file_path": new_dataset_file_path
        }

        # 7. change status
        affect_rows = s.query(DatasetEntity).filter(
            DatasetEntity.name == temporary_dataset_name,
            DatasetEntity.is_temporary == True).update(properties)
        if affect_rows != 1:
            raise Exception("Update dataset failed.")
def predict(self, dataset_name, model_name, req_dict: dict):
    # 1. read params
    file_path = util.require_in_dict(req_dict, 'file_path', str)
    reserved_cols = util.get_from_dict(req_dict, 'reserved_cols', list)
    upload_took = util.require_in_dict(req_dict, 'upload_took', float)

    if reserved_cols is None or len(reserved_cols) < 1:
        reserved_cols_str = ""
    else:
        reserved_cols_str = ",".join(reserved_cols)

    # 2. check params
    with db.open_session() as s:
        self.model_dao.require_by_name(s, model_name).to_model_bean()
        dataset_stats = self.dataset_dao.require_by_name(s, dataset_name).to_dataset_stats()

    predict_job_name = util.predict_job_name(dataset_name)
    abs_file_path = P.join(consts.DATA_DIR, file_path)
    if not P.exists(abs_file_path):
        raise ValueError(f"Input file does not exist: {abs_file_path}")
    if not P.isfile(abs_file_path):
        raise ValueError(f"Input path is not a file: {abs_file_path}")

    # 3. add upload step
    upload_extension = {"file_size": P.getsize(abs_file_path)}
    upload_step = JobStep(type=PredictStepType.Upload,
                          status=JobStep.Status.Succeed,
                          took=upload_took,
                          datetime=util.get_now_long(),
                          extension=upload_extension)
    self.add_predict_process_step(model_name, predict_job_name, upload_step)

    # 4. execute command
    model_dir = util.model_dir(dataset_name, model_name)
    predict_log_path = P.join(model_dir, f"{predict_job_name}.log")
    if not dataset_stats.has_header:
        default_headers = ",".join([f.name for f in dataset_stats.features])
    else:
        default_headers = None

    command = f"nohup {sys.executable} {consts.PATH_INSTALL_HOME}/cooka/core/batch_predict_job.py --input_file_path={abs_file_path} --reserved_cols={reserved_cols_str} --model_name={model_name} --dataset_name={dataset_name} --job_name={predict_job_name} --has_header={dataset_stats.has_header} --default_headers={default_headers} --server_portal={consts.SERVER_PORTAL} 1>{predict_log_path} 2>&1 &"

    logger.info(f"Run predict job command: \n{command}")
    logger.info(f"Log file:\ntail -f {predict_log_path}")
    os.system(command)  # fire-and-forget background job

    return predict_job_name
def train_process_terminated(self, model_name):
    with db.open_session() as s:
        # The monitor polls the database every second, so a model may still be
        # read as running before handle_models has finished updating it.
        # 1. check status: only a running model can be marked as finished
        model = self.model_dao.find_by_name(s, model_name)
        if model.status == ModelStatusType.Running:
            _now = util.get_now_datetime()
            properties = {
                "status": ModelStatusType.Failed,
                "finish_datetime": _now,
                "last_update_datetime": _now
            }
            self.model_dao.update_model_by_name(s, model_name, properties)
        else:
            log.warning(f"Train process is already finished, model = {model_name}")
def get(self, dataset_name, model_name, batch_predict_job_name, *args, **kwargs):
    # 1. query all messages of the job  # todo: move to service
    with db.open_session() as s:
        messages = s.query(MessageEntity)\
            .filter(MessageEntity.author == batch_predict_job_name)\
            .order_by(MessageEntity.create_datetime.asc()).all()
        messages_dict_list = [util.loads(m.content) for m in messages]

    # 2. response
    response = {
        "batch_predict_job_name": batch_predict_job_name,
        "steps": messages_dict_list
    }
    self.response_json(response)
def _handle_label_col(self, dataset_name, label_col, file_path):
    # calculate correlation with the label
    # 1. update label_col first, so nothing can observe a stale label_col
    #    before the analyze process starts
    with db.open_session() as s:
        self.dataset_dao.update_by_name(s, dataset_name, {"label_col": label_col})

    # 2. start a process
    analyze_pearson_job_name = util.analyze_data_job_name(P.basename(file_path))
    std_log = P.join(util.dataset_dir(dataset_name), f"{analyze_pearson_job_name}.log")
    command = f"nohup {sys.executable} {util.script_path('analyze_correlation_job.py')} --dataset_name={dataset_name} --label_col={label_col} --job_name={analyze_pearson_job_name} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1"
    calc_correlation_process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.PIPE)
    log.info(f"Run calculate pearson command: \n{command}")
    log.info(f"Log file:\ntail -f {std_log}")
    log.info(f"Process id is {calc_correlation_process.pid}")
def post(self, dataset_name, *args, **kwargs):
    # check the name in db
    with db.open_session() as s:
        dataset = self.dataset_service.dataset_dao.find_by_name(s, dataset_name)
        if dataset is not None:
            raise ValueError(f"Dataset {dataset_name} already exists.")

    # check the name in the file system
    dataset_dir = util.dataset_dir(dataset_name)
    if P.exists(dataset_dir):
        raise ValueError(f"Path {dataset_dir} already exists even though dataset {dataset_name} does not.")

    self.response_json({})
def get_experiments(self, dataset_name, page_num, page_size):
    # 1. validate params
    if page_num < 1:
        raise ValueError("Param page_num should be >= 1")
    if page_size < 1:
        raise ValueError("Param page_size should be >= 1")

    def f(model: Model):
        # extension must contain experiment_conf, and its train_mode must not be None
        model_extension = model.extension
        experiment_conf = model_extension['experiment_conf']
        train_mode = experiment_conf["train_mode"]
        max_trails = consts.TRAIN_MODE_MAX_TRAILS_MAPPING[train_mode]
        d = {
            "name": model.name,
            "no_experiment": model.no_experiment,
            "train_mode": train_mode,
            "target_col": experiment_conf['label_col'],
            "metric_name": consts.TASK_TYPE_OPTIMIZE_METRIC_MAPPING[model.task_type],
            "status": model.status,
            "score": model.score,
            "engine": model_extension['train_job_conf']['framework'],
            "escaped": model.escaped_time_by_seconds(),
            "log_file_path": model.log_file_path(),
            "train_source_code_path": model.train_source_code_path(),
            "train_notebook_uri": model.train_notebook_uri(),
            "train_trail_no": 0 if model.train_trail_no is None else model.train_trail_no,
            "max_train_trail_no": max_trails,
            "estimated_remaining_time": ExperimentService.calc_remain_time(model),
            "model_file_size": model.model_file_size
        }
        return d

    with db.open_session() as s:
        # check dataset
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        models, total = self.model_dao.find_by_dataset_name(s, dataset_name, page_num, page_size)
        experiments = [f(m) for m in models]
        # return sorted(experiments, key=lambda x: x['no_experiment'], reverse=True)
        return experiments, total
def get(self, dataset_name, analyze_job_name, *args, **kwargs):
    # 1. validate param
    if analyze_job_name is None:
        raise IllegalParamException("analyze_job_name", None, "not empty")

    # 2. query all messages of the job  # todo: move to service
    with db.open_session() as s:
        messages = s.query(MessageEntity)\
            .filter(MessageEntity.author == analyze_job_name)\
            .order_by(MessageEntity.create_datetime.asc()).all()
        messages_dict_list = [util.loads(m.content) for m in messages]

    # 3. response
    response = {
        "analyze_job_name": analyze_job_name,
        "steps": messages_dict_list
    }
    self.response_json(response)
def recommended_train_configuration(self, dataset_name, req_dict):
    datetime_series_col = None  # todo: support datetime series
    target_col = req_dict.get('target_col')

    # 1. read the last train job
    with db.open_session() as s:
        # 1.1. check dataset name
        dataset_stats = self.dataset_dao.require_by_name(s, dataset_name).to_dataset_stats()
        # 1.2. query models
        last_model = self.model_dao.checkout_one(self.model_dao.find_by_dataset_name(s, dataset_name, 1, 1)[0])

    # 2. infer conf
    if last_model is None:
        experiment_conf = self.get_recommended_conf_as_new(dataset_name, dataset_stats, target_col)
    else:
        _experiment_conf: ExperimentConf = self.get_recommended_conf_from_history(last_model)
        if target_col is not None and _experiment_conf.label_col != target_col:
            log.info(f"Change label from {_experiment_conf.label_col} to {target_col}, "
                     f"and recommend params using the new target.")
            experiment_conf = self.get_recommended_conf_as_new(dataset_name, dataset_stats, target_col)
        else:
            # use history
            experiment_conf = _experiment_conf

    train_validation_holdout = experiment_conf.train_validation_holdout
    cross_validation = experiment_conf.cross_validation

    conf = {
        "label_col": experiment_conf.label_col,
        "task_type": experiment_conf.task_type,
        "pos_label": experiment_conf.pos_label,
        "train_mode": experiment_conf.train_mode,
        "partition_strategy": ExperimentConf.PartitionStrategy.TrainValidationHoldout,
        "train_validation_holdout": train_validation_holdout.to_dict() if train_validation_holdout is not None else None,
        "cross_validation": cross_validation.to_dict() if cross_validation is not None else None,
        "datetime_series_col": datetime_series_col
    }
    return conf
def add_predict_process_step(self, model_name: str, job_name: str, step: JobStep):
    step_type = step.type
    with db.open_session() as s:
        # 1. check the temporary model exists
        model = self.model_dao.require_by_name(s, model_name)

        # 2. check the event type: one record per type
        messages = s.query(MessageEntity).filter(MessageEntity.author == job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                raise Exception(f"Event type = {step_type} already exists.")

        # 3. create a new message
        content = util.dumps(step.to_dict())
        message = MessageEntity(id=util.short_uuid(),
                                author=job_name,
                                content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)
def post(self, dataset_name, *args, **kwargs):
    req_dict = self.get_request_as_dict_if_json()
    feature_name = util.require_in_dict(req_dict, 'feature_name', str)

    with db.open_session() as s:
        dataset = self.dataset_service.dataset_dao.require_by_name(s, dataset_name)
        features = Feature.load_dict_list(dataset.features)
        target_f = None
        for f in features:
            if f.name == feature_name:
                target_f = f
                break
        if target_f is None:
            raise ValueError(f"Feature name = {feature_name} not found.")
        task_type = self.experiment_service._infer_task_type(target_f)
        resp = {"task_type": task_type, "feature_name": feature_name}
        self.response_json(resp)
def create_temporary_dataset_from_file(self, data_path):
    """This can not be done in setup_class(cls), setUpClass(cls) or setUp,
    because the http server is not ready yet.
    """
    super(WithTemporaryDatasetTestCase, self).setUp()  # must be invoked, otherwise the http server is not created
    from cooka.common import consts
    consts.SERVER_PORTAL = self.get_url('')  # use the temporary server

    # 1. upload
    boundary = uuid.uuid4().hex
    headers = {"Content-Type": "multipart/form-data; boundary=%s" % boundary}
    producer = partial(self.multipart_producer, boundary, data_path)
    upload_response = self.fetch(path='/api/resource',
                                 method="POST",
                                 body_producer=producer,
                                 headers=headers)
    upload_response_body = self.assert_response_and_get(upload_response)
    upload_file_path = upload_response_body.get('path')
    upload_took = upload_response_body.get('took')
    assert upload_file_path is not None
    assert upload_took is not None

    # 2. send the create request and validate the response code
    body = {
        "sample_strategy": "random_rows",
        "percentage": 30,
        "n_rows": 1000,
        "file_path": upload_file_path,
        "upload_took": upload_took,
        "source_type": "upload",
    }
    str_body = util.dumps(body)
    create_response_body = self.assert_response_and_get(
        self.fetch(path='/api/temporary-dataset', method="POST", body=str_body, headers=headers))
    print(f"create response body:\n{create_response_body}")

    # 3. poll dataset messages
    temporary_dataset_name = create_response_body["temporary_dataset_name"]
    analyze_job_name = create_response_body["analyze_job_name"]
    expected_events = [AnalyzeStep.Types.Upload, AnalyzeStep.Types.Load, AnalyzeStep.Types.Analyzed]
    analyze_passed = False
    poll_job_response_body = None
    for i in range(10):  # poll up to 10 times, one second apart
        poll_job_response = self.fetch(
            f'/api/dataset/{temporary_dataset_name}/analyze-job/{analyze_job_name}', method="GET")
        poll_job_response_body = self.assert_response_and_get(poll_job_response)
        events = poll_job_response_body['steps']
        events.sort(key=lambda x: x['datetime'])
        events_type = [event["type"] for event in events]
        if expected_events == events_type:  # has all expected types, ordered by datetime
            analyze_passed = True
            break
        if AnalyzeStep.Types.End in events_type:
            break
        time.sleep(1)
    assert analyze_passed, f"{poll_job_response_body}"

    # 4. retrieve dataset and check details
    with db.open_session() as s:
        temporary_dataset = s.query(DatasetEntity).filter(
            DatasetEntity.name == temporary_dataset_name).first()
        assert temporary_dataset is not None, f'Temporary dataset = {temporary_dataset_name} create failed'
        assert len(temporary_dataset.extension) > 0
        assert temporary_dataset.status == DatasetEntity.Status.Analyzed

    return temporary_dataset_name
def find_running_model(self):
    with db.open_session() as s:
        return self.model_dao.find_running_model(s)
def add_analyze_process_step(self, dataset_name, analyze_job_name, step: JobStep):
    step_type = step.type
    with db.open_session() as s:
        # 1.1. check dataset exists
        d = s.query(DatasetEntity).filter(DatasetEntity.name == dataset_name).first()
        if d is None:
            raise EntityNotExistsException(DatasetEntity, dataset_name)

        # 1.2. check the event type: one record per type
        messages = s.query(MessageEntity).filter(MessageEntity.author == analyze_job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                raise Exception(f"Event type = {step_type} already exists.")

    # 2. handle event
    with db.open_session() as s:
        # 2.1. create a new message
        content = util.dumps(step.to_dict())
        message = MessageEntity(id=util.short_uuid(),
                                author=analyze_job_name,
                                content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)

        # 2.2. handle analyze event
        if step_type == AnalyzeStep.Types.Analyzed:
            # update temporary dataset
            # todo: handle failed analyze
            if step.status == JobStep.Status.Succeed:
                hints = step.extension.pop("hints")
                d_stats = DatasetStats.load_dict(step.extension)
                features_str = [f.to_dict() for f in d_stats.features]
                update_fields = {
                    "has_header": d_stats.has_header,
                    "extension": step.extension,
                    "n_cols": d_stats.n_cols,
                    "n_rows": d_stats.n_rows,
                    "features": features_str,
                    "hints": hints,
                    "feature_summary": d_stats.feature_summary.to_dict(),
                    "status": DatasetEntity.Status.Analyzed
                }
            else:
                update_fields = {"status": DatasetEntity.Status.Failed}
            self.dataset_dao.update_by_name(s, dataset_name, update_fields)
        elif step_type == AnalyzeStep.Types.PatchCorrelation:
            # 1. check dataset status: correlation can only be calculated for analyzed datasets
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            if dataset.status != AnalyzeStep.Types.Analyzed:
                raise ValueError(f"Dataset {dataset_name} status is not {AnalyzeStep.Types.Analyzed}.")
            request_label_col = step.extension.get("label_col")
            if request_label_col != dataset.label_col:
                raise ValueError(f"Dataset {dataset_name} label col is {dataset.label_col} "
                                 f"but the received result is for {request_label_col}")

            # 2. read extension
            corr_dict = step.extension.get('corr')

            # 3. load & update features
            features = dataset.to_dataset_stats().features
            for f in features:
                correlation = corr_dict.get(f.name)
                f.correlation = FeatureCorrelation(
                    value=correlation,
                    status=FeatureCorrelation.calc_status(correlation, request_label_col == f.name))

            # 4. sort features by absolute correlation
            features = sorted(features, key=lambda f: abs(f.correlation.value), reverse=True)
            feature_dict_list = [f.to_dict() for f in features]

            # 5. push back to the database
            self.dataset_dao.update_by_name(s, dataset_name, {"features": feature_dict_list})
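
# A minimal sketch of step 4 in the PatchCorrelation branch above: features
# are ranked by the absolute value of their correlation with the label, so a
# strong negative correlation outranks a weak positive one. Tuples stand in
# for Feature objects carrying a correlation value.
def _demo_sort_by_abs_correlation():
    features = [("age", -0.82), ("income", 0.31), ("zip_code", 0.05)]
    return sorted(features, key=lambda f: abs(f[1]), reverse=True)
    # [('age', -0.82), ('income', 0.31), ('zip_code', 0.05)]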
def add_train_process_step(self, train_job_name, req_dict):
    # 1. read & check params
    step_type = util.require_in_dict(req_dict, 'type', str)
    step_status = util.require_in_dict(req_dict, 'status', str)
    step_extension = util.get_from_dict(req_dict, 'extension', dict)

    if step_type not in [TrainStep.Types.Load, TrainStep.Types.Optimize, TrainStep.Types.OptimizeStart,
                         TrainStep.Types.Persist, TrainStep.Types.Evaluate, TrainStep.Types.FinalTrain,
                         TrainStep.Types.Searched]:
        raise ValueError(f"Unknown step type = {step_type}")
    if step_status not in [JobStep.Status.Succeed, JobStep.Status.Failed]:
        raise ValueError(f"Unknown status = {step_status}")

    # 2. save message
    with db.open_session() as s:
        # 2.1. check the temporary model exists
        model = self.model_dao.find_by_train_job_name(s, train_job_name)
        model_name = model.name

        # 2.2. check the event type: one record per type, except the repeatable optimize steps
        messages = s.query(MessageEntity).filter(MessageEntity.author == train_job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                if step_type not in [TrainStep.Types.OptimizeStart, TrainStep.Types.Optimize]:
                    raise Exception(f"Event type = {step_type} already exists.")

        # 2.3. create a new message
        content = util.dumps(req_dict)
        message = MessageEntity(id=util.short_uuid(),
                                author=train_job_name,
                                content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)

        # 2.4. handle the train event
        current_progress = model.progress  # todo: check in code body
        self._check_progress_change(step_type, current_progress)  # add failed status

        if step_type == TrainStep.Types.Evaluate:
            if step_status == JobStep.Status.Succeed:
                self._update_model(s, model_name, step_type, {"performance": step_extension['performance']})
            else:
                self._update_model(s, model_name, step_type,
                                   {"status": ModelStatusType.Failed, "finish_datetime": util.get_now_datetime()})
        elif step_type == TrainStep.Types.Load:
            if step_status == JobStep.Status.Succeed:
                self._update_model(s, model_name, step_type, {"status": ModelStatusType.Running})
            else:
                self._update_model(s, model_name, step_type,
                                   {"status": ModelStatusType.Failed, "finish_datetime": util.get_now_datetime()})
        elif step_type == TrainStep.Types.OptimizeStart:
            pass
            # train_trail_no = step_extension.get('trail_no')
            # if train_trail_no is None or not isinstance(train_trail_no, int):
            #     raise ValueError(f"Param trail_no can not be None and should be int but is: {train_trail_no}")
            # # update trail number
            # self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no})
        elif step_type == TrainStep.Types.Optimize:
            train_trail_no = step_extension.get('trail_no')
            # update trails: load the current trails and append the new one
            trails = model.trails
            if model.trails is None:
                trails = []
            trails.append(step_extension)
            self._update_model(s, model_name, step_type,
                               {"train_trail_no": train_trail_no,
                                "score": step_extension.get('reward'),
                                "trails": trails})
        elif step_type == TrainStep.Types.Persist:
            model_file_size = step_extension['model_file_size']
            self._update_model(s, model_name, step_type,
                               {"model_file_size": model_file_size,
                                "status": ModelStatusType.Succeed,
                                "finish_datetime": util.get_now_datetime()})
        else:
            self._update_model(s, model_name, step_type, {})
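
# A tiny sketch of the dedup rule in step 2.2 above: each step type may only
# be recorded once per job, except the repeatable optimize steps. Plain
# strings stand in for persisted message contents.
def _demo_step_dedup(existing_types, step_type, repeatable=("optimize_start", "optimize")):
    if step_type in existing_types and step_type not in repeatable:
        raise Exception(f"Event type = {step_type} already exists.")
    return existing_types + [step_type]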
def is_dataset_exists(self, dataset_name):
    with db.open_session() as s:
        d = self.dataset_dao.find_by_name(s, dataset_name)
        exists_in_db = d is not None
        exists_file = P.exists(P.join(consts.PATH_DATABASE, dataset_name))
        return exists_in_db or exists_file
def _create_temporary_dataset(self, source_type, file_path, took, sample_conf: SampleConf):
    # 1. prepare names
    now = util.get_now_datetime()
    file_name = P.basename(file_path)
    temporary_dataset_name = self.choose_temporary_dataset_name(file_name)  # use a long name
    analyze_job_name = util.analyze_data_job_name(util.cut_suffix(file_name), now)
    file_size = P.getsize(file_path)

    # 2. create record
    td = DatasetEntity(name=temporary_dataset_name,
                       file_size=file_size,
                       is_temporary=True,
                       status=DatasetEntity.Status.Created,
                       source_type=source_type,
                       file_path=file_path,
                       file_name=file_name,
                       create_datetime=now,
                       last_update_datetime=now)
    with db.open_session() as s:
        s.add(td)

    # 3. send file transfer step
    if source_type == DatasetEntity.SourceType.Upload:
        step = JobStep(type=AnalyzeStep.Types.Upload,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)
    elif source_type == DatasetEntity.SourceType.Import:
        step = JobStep(type=AnalyzeStep.Types.Copy,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)

    # 4. create analyze config
    conf = AnalyzeJobConf(job_name=analyze_job_name,
                          dataset_name=temporary_dataset_name,
                          sample_conf=sample_conf,
                          path=file_path,
                          temporary_dataset=True,
                          label_col=None)

    # 5. start a new process
    analyze_config_string = util.dumps(conf.to_dict())
    logger.info(f"Analyze job conf: {analyze_config_string}")
    python_executable = sys.executable
    temporary_dataset_dir = util.temporary_dataset_dir(temporary_dataset_name)
    os.makedirs(temporary_dataset_dir, exist_ok=True)
    std_log = P.join(temporary_dataset_dir, f"{analyze_job_name}.log")
    command = f"nohup {python_executable} {util.script_path('analyze_job.py')} --file_path={file_path} --job_name={analyze_job_name} --dataset_name={temporary_dataset_name} --sample_strategy={sample_conf.sample_strategy} --n_rows={self.replace_None(sample_conf.n_rows)} --percentage={self.replace_None(sample_conf.percentage)} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1 &"

    logger.info(f"Run analyze job command: \n{command}")
    logger.info(f"Log file:\ntail -f {std_log}")
    # JobManager.instance().run_job(job)
    os.system(command)  # fire-and-forget background job

    return temporary_dataset_name, analyze_job_name
def preview(self, dataset_name: str, page_num: int, page_size: int) -> RespPreviewDataset:
    """Preview one page of a dataset.

    Args:
        dataset_name: name of the dataset to preview
        page_num: page number, starting from 1
        page_size: rows per page

    Returns:
        A RespPreviewDataset with headers, rows, total count and file path.
    """
    # 1. validate params
    if page_num < 1:
        raise ValueError("Param page_num should be >= 1")
    if page_size < 1:
        raise ValueError("Param page_size should be >= 1")

    # 2. retrieve dataset
    with db.open_session() as s:
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        file_path = dataset.file_path
        if not P.exists(file_path):
            raise FileNotFoundError(file_path)
        dataset_stats = dataset.to_dataset_stats()

    relative_file_path = util.relative_path(dataset_stats.file_path)

    # 3. read data
    dataset_headers = [f.name for f in dataset_stats.features]
    dataset_headers.insert(0, "No. ")
    if dataset_stats.has_header:
        iterator_df = pd.read_csv(file_path, chunksize=page_size)
    else:
        iterator_df = pd.read_csv(file_path, chunksize=page_size, header=None)

    # 4. seek pages; page_num starts from 1.
    # E.g. if page_num == 1 the while loop runs 0 times and next(iterator_df)
    # below returns the data directly.
    current_page = 1
    while current_page < page_num:
        try:
            next(iterator_df)  # no reference kept, the chunk will be gc'd
            current_page = current_page + 1
        except StopIteration:
            # page_num is too large, no data to return
            return RespPreviewDataset(headers=dataset_headers, rows=None,
                                      count=dataset_stats.n_rows, file_path=relative_file_path)

    # 5. hit data
    try:
        page_df: pd.DataFrame = next(iterator_df)
        # 5.1. make the index start from the first line number of the page
        start_line_no = (current_page - 1) * page_size + 1
        page_df.index = pd.RangeIndex(start_line_no, start_line_no + page_df.shape[0])
        values = page_df.to_records(index=True).tolist()
        return RespPreviewDataset(headers=dataset_headers, rows=values,
                                  count=dataset_stats.n_rows, file_path=relative_file_path)
    except StopIteration:
        return RespPreviewDataset(headers=dataset_headers, rows=None,
                                  count=dataset_stats.n_rows, file_path=relative_file_path)
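
# A self-contained sketch of the paging trick in preview() above:
# pd.read_csv(..., chunksize=page_size) returns an iterator of DataFrames, so
# skipping page_num - 1 chunks and taking the next one serves page page_num
# without loading the whole file. The in-memory CSV is illustrative.
def _demo_chunked_page(page_num=2, page_size=3):
    import io
    import pandas as pd
    csv_text = "a,b\n" + "\n".join(f"{i},{i * i}" for i in range(10))
    iterator_df = pd.read_csv(io.StringIO(csv_text), chunksize=page_size)
    for _ in range(page_num - 1):
        next(iterator_df)  # skipped chunks are discarded
    page_df = next(iterator_df)
    start_line_no = (page_num - 1) * page_size + 1
    page_df.index = pd.RangeIndex(start_line_no, start_line_no + page_df.shape[0])
    return page_df.to_records(index=True).tolist()  # [(4, 3, 9), (5, 4, 16), (6, 5, 25)]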
def experiment(self, req_dict: dict):
    # 1. read params
    label_col = util.require_in_dict(req_dict, 'label_col', str)
    pos_label = util.get_from_dict(req_dict, 'pos_label', object)
    train_mode = util.get_from_dict(req_dict, 'train_mode', str)
    partition_strategy = util.require_in_dict(req_dict, 'partition_strategy', str)
    dataset_name = util.require_in_dict(req_dict, 'dataset_name', str)
    holdout_percentage = util.require_in_dict(req_dict, 'holdout_percentage', int)
    datetime_series_col = util.get_from_dict(req_dict, 'datetime_series_col', str)  # todo: check datetime_series_col
    experiment_engine = util.require_in_dict(req_dict, 'experiment_engine', str)
    if experiment_engine not in [FrameworkType.GBM, FrameworkType.DeepTables]:
        raise ValueError(f"Unknown experiment_engine {experiment_engine}")

    # 2. check partition_strategy
    cross_validation = None
    train_validation_holdout = None
    if partition_strategy == ExperimentConf.PartitionStrategy.CrossValidation:
        cross_validation_dict = util.require_in_dict(req_dict, 'cross_validation', dict)
        n_folds = util.require_in_dict(cross_validation_dict, 'n_folds', int)
        if 1 < n_folds <= 50:
            cross_validation = CrossValidation(n_folds=n_folds, holdout_percentage=holdout_percentage)
        else:
            raise ValueError(f"Expected 1 < n_folds <= 50 but got: {n_folds}")
    elif partition_strategy == ExperimentConf.PartitionStrategy.TrainValidationHoldout:
        train_validation_holdout_dict = util.require_in_dict(req_dict, 'train_validation_holdout', dict)
        train_percentage = util.require_in_dict(train_validation_holdout_dict, 'train_percentage', int)
        validation_percentage = util.require_in_dict(train_validation_holdout_dict, 'validation_percentage', int)
        if train_percentage + validation_percentage + holdout_percentage != 100:
            raise ValueError("train_percentage plus validation_percentage plus holdout_percentage should equal 100.")
        train_validation_holdout = TrainValidationHoldout(train_percentage=train_percentage,
                                                          validation_percentage=validation_percentage,
                                                          holdout_percentage=holdout_percentage)
    else:
        raise ValueError(f"Unknown partition strategy = {partition_strategy}")

    # 3. retrieve data
    with db.open_session() as s:
        # 3.1. check dataset
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        if dataset is None:
            raise ValueError(f"Dataset={dataset_name} does not exist.")
        dataset_stats = dataset.to_dataset_stats()

        # 3.2. generate a new experiment number
        no_experiment = self.model_dao.get_max_experiment(s, dataset_name) + 1

    # 4. ensure the dataset label is up to date
    if dataset_stats.label_col is None:
        log.info(f"Dataset {dataset_name} label_col not set yet, update to {label_col}")
        self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)
    elif dataset_stats.label_col != label_col:
        log.info(f"Dataset {dataset_name} label_col currently is {dataset_stats.label_col}, "
                 f"but this experiment updates it to {label_col}")
        self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)

    # 5. calc task type
    # 5.1. find the label feature
    label_f = self._find_feature(dataset_stats.features, label_col)
    if label_f is None:
        raise ValueError(f"Label col = {label_col} is not in dataset {dataset_name}.")
    task_type = self._infer_task_type(label_f)

    # 5.2. check pos_label
    if task_type == TaskType.BinaryClassification:
        if pos_label is None:
            raise ValueError("Pos label can not be None for binary classification")
        if isinstance(pos_label, str) and len(pos_label) < 1:
            raise ValueError("Pos label can not be empty for binary classification")

    # 6. run experiment
    if not dataset_stats.has_header:
        dataset_default_headers = [f.name for f in dataset_stats.features]
    else:
        dataset_default_headers = None

    conf = ExperimentConf(dataset_name=dataset_name,
                          dataset_has_header=dataset_stats.has_header,
                          dataset_default_headers=dataset_default_headers,
                          train_mode=train_mode,
                          label_col=label_col,
                          pos_label=pos_label,
                          task_type=task_type,
                          partition_strategy=partition_strategy,
                          cross_validation=cross_validation,
                          train_validation_holdout=train_validation_holdout,
                          datetime_series_col=datetime_series_col,
                          file_path=dataset_stats.file_path)

    model_input_features = [ModelFeature(name=f.name, type=f.type, data_type=f.data_type).to_dict()
                            for f in dataset_stats.features if f.name != label_f.name]

    if experiment_engine == FrameworkType.GBM:
        train_conf = self.run_train_job(FrameworkType.GBM, conf, no_experiment, model_input_features, dataset_stats.n_rows)
    else:
        train_conf = self.run_train_job(FrameworkType.DeepTables, conf, no_experiment, model_input_features, dataset_stats.n_rows)

    return {
        "no_experiment": no_experiment,
        "experiment_conf": conf.to_dict(),
        "train_job_conf": train_conf
    }