def predict(self, dataset_name, model_name, req_dict: dict):
    # 1. read params
    file_path = util.require_in_dict(req_dict, 'file_path', str)
    reserved_cols = util.get_from_dict(req_dict, 'reserved_cols', list)
    upload_took = util.require_in_dict(req_dict, 'upload_took', float)

    if reserved_cols is None or len(reserved_cols) < 1:
        reserved_cols_str = ""
    else:
        reserved_cols_str = ",".join(reserved_cols)

    # 2. check params
    with db.open_session() as s:
        self.model_dao.require_by_name(s, model_name).to_model_bean()
        dataset_stats = self.dataset_dao.require_by_name(s, dataset_name).to_dataset_stats()

    predict_job_name = util.predict_job_name(dataset_name)

    abs_file_path = P.join(consts.DATA_DIR, file_path)
    if not P.exists(abs_file_path):
        raise ValueError(f"Input file does not exist: {abs_file_path}")
    if not P.isfile(abs_file_path):
        raise ValueError(f"Input path is not a file: {abs_file_path}")

    # 3. add upload step
    upload_extension = {"file_size": P.getsize(abs_file_path)}
    upload_step = JobStep(type=PredictStepType.Upload,
                          status=JobStep.Status.Succeed,
                          took=upload_took,
                          datetime=util.get_now_long(),
                          extension=upload_extension)
    self.add_predict_process_step(model_name, predict_job_name, upload_step)

    # 4. execute command
    model_dir = util.model_dir(dataset_name, model_name)
    predict_log_path = P.join(model_dir, f"{predict_job_name}.log")

    if not dataset_stats.has_header:
        default_headers = ",".join([f.name for f in dataset_stats.features])
    else:
        default_headers = None

    command = f"nohup {sys.executable} {consts.PATH_INSTALL_HOME}/cooka/core/batch_predict_job.py" \
              f" --input_file_path={abs_file_path}" \
              f" --reserved_cols={reserved_cols_str}" \
              f" --model_name={model_name}" \
              f" --dataset_name={dataset_name}" \
              f" --job_name={predict_job_name}" \
              f" --has_header={dataset_stats.has_header}" \
              f" --default_headers={default_headers}" \
              f" --server_portal={consts.SERVER_PORTAL}" \
              f" 1>{predict_log_path} 2>&1 &"

    logger.info(f"Run batch predict job command: \n{command}")
    logger.info(f"Log file:\ntail -f {predict_log_path}")

    # Launch the batch predict job as a detached background process;
    # progress is reported back to the server via callbacks.
    os.system(command)

    return predict_job_name
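# A minimal, hypothetical usage sketch of predict(). The service instance name
# and the dataset/model/file names below are illustrative assumptions, not taken
# from the source; only the req_dict keys mirror what predict() actually reads.
#
# example_req = {
#     "file_path": "upload/scoring_input.csv",   # relative to consts.DATA_DIR
#     "reserved_cols": ["id"],                   # columns passed through to the batch predict job
#     "upload_took": 1.25,                       # seconds the client spent uploading the file
# }
# predict_job_name = experiment_service.predict("bank_data", "model_1", example_req)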
def callback(url, type, status, took, extension, **kwargs):
    req_body_dict = {
        "type": type,
        "status": status,
        "took": took,
        "datetime": util.get_now_long(),
        "extension": extension
    }
    req_body = util.dumps(req_body_dict)
    logger.info(f"Send process event: \n{url}\n{req_body}")
    # Note: the HTTP body should be bytes; a str body would be encoded by "requests"
    # using ISO-8859-1.
    response = requests.post(url, data=req_body.encode('utf-8'), timeout=TIMEOUT, headers=HEADERS)
    _checkout_response_json(response)
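# Why callback() encodes the body to UTF-8 bytes before posting: as the note
# above says, a str body is encoded as ISO-8859-1 on the way out, so non-latin-1
# characters fail or get mangled, whereas bytes pass through verbatim. A small
# offline check of that behavior (nothing is sent; the helper name and URL are
# illustrative assumptions):
def _demo_body_encoding():
    prepared = requests.PreparedRequest()
    prepared.prepare(method="POST",
                     url="http://localhost:8000/api/callback",
                     data=util.dumps({"status": "succeed"}).encode("utf-8"),
                     headers=HEADERS)
    # A bytes body is kept exactly as given and sent verbatim.
    assert isinstance(prepared.body, bytes)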
def _create_temporary_dataset(self, source_type, file_path, took, sample_conf: SampleConf):
    # 1. prepare names and metadata
    now = util.get_now_datetime()
    file_name = P.basename(file_path)
    temporary_dataset_name = self.choose_temporary_dataset_name(file_name)  # use a long name
    analyze_job_name = util.analyze_data_job_name(util.cut_suffix(file_name), now)
    file_size = P.getsize(file_path)

    # 2. create record
    td = DatasetEntity(name=temporary_dataset_name,
                       file_size=file_size,
                       is_temporary=True,
                       status=DatasetEntity.Status.Created,
                       source_type=source_type,
                       file_path=file_path,
                       file_name=file_name,
                       create_datetime=now,
                       last_update_datetime=now)
    with db.open_session() as s:
        s.add(td)

    # 3. send file transfer step
    if source_type == DatasetEntity.SourceType.Upload:
        step = JobStep(type=AnalyzeStep.Types.Upload,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)
    elif source_type == DatasetEntity.SourceType.Import:
        step = JobStep(type=AnalyzeStep.Types.Copy,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)

    # 4. create analyze config
    conf = AnalyzeJobConf(job_name=analyze_job_name,
                          dataset_name=temporary_dataset_name,
                          sample_conf=sample_conf,
                          path=file_path,
                          temporary_dataset=True,
                          label_col=None)

    # 5. start new process
    analyze_config_string = util.dumps(conf.to_dict())
    logger.info(f"Analyze job conf: {analyze_config_string}")

    python_executable = sys.executable
    temporary_dataset_dir = util.temporary_dataset_dir(temporary_dataset_name)
    os.makedirs(temporary_dataset_dir, exist_ok=True)
    std_log = P.join(temporary_dataset_dir, f"{analyze_job_name}.log")

    command = f"nohup {python_executable} {util.script_path('analyze_job.py')}" \
              f" --file_path={file_path}" \
              f" --job_name={analyze_job_name}" \
              f" --dataset_name={temporary_dataset_name}" \
              f" --sample_strategy={sample_conf.sample_strategy}" \
              f" --n_rows={self.replace_None(sample_conf.n_rows)}" \
              f" --percentage={self.replace_None(sample_conf.percentage)}" \
              f" --server_portal={consts.SERVER_PORTAL}" \
              f" 1>{std_log} 2>&1 &"

    logger.info(f"Run analyze job command: \n{command}")
    logger.info(f"Log file:\ntail -f {std_log}")

    # Launch the analyze job as a detached background process.
    os.system(command)

    return temporary_dataset_name, analyze_job_name
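# A hedged usage sketch of _create_temporary_dataset(): the service instance,
# the uploaded file path, and the SampleConf constructor arguments are
# illustrative assumptions; only the attribute names (sample_strategy, n_rows,
# percentage) are taken from the function above.
#
# sample_conf = SampleConf(sample_strategy="random_rows", n_rows=1000, percentage=None)
# dataset_name, job_name = dataset_service._create_temporary_dataset(
#     source_type=DatasetEntity.SourceType.Upload,
#     file_path="/tmp/cooka/upload/bank_data.csv",
#     took=2.4,
#     sample_conf=sample_conf)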