def handle_models(self, models: list):
    for m in models:
        m: Model = m  # type hint for IDE support
        pid = m.pid
        if pid is None:
            pass  # logger.warning(f"Model {m.name}, training process pid is None.")
        else:
            try:
                status = psutil.Process(pid).status()
                if pid not in self.process_status_mapping:
                    self.process_status_mapping[pid] = status
                    logger.info(f"Model {m.name}, pid is {pid}, process status is {status}")
                else:
                    if self.process_status_mapping[pid] != status:
                        logger.info(f"Model {m.name}, pid is {pid}, process status changed from {self.process_status_mapping[pid]} to {status}")
                        self.process_status_mapping[pid] = status
            except Exception:  # usually psutil.NoSuchProcess
                # the training process has finished or exited; update its state
                logger.warning(f"Model {m.name}, training process pid = {pid} does not exist.")
                self.experiment_service.train_process_terminated(m.name)
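# For reference, a minimal standalone sketch of the psutil pattern the monitor relies on:
# psutil.Process(pid) raises psutil.NoSuchProcess once the process has exited.
# The helper name below is hypothetical and not part of Cooka.
import psutil


def process_status_or_none(pid: int):
    """Return the process status string, or None if the process no longer exists."""
    try:
        return psutil.Process(pid).status()
    except psutil.NoSuchProcess:
        return None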
def delete(self, dataset_name):
    with db.open_session() as s:
        # 1. delete the record
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        is_temporary = dataset.is_temporary
        self.dataset_dao.delete(s, dataset_name)

    # 2. delete files only if the dataset is not temporary
    if is_temporary is False:
        if "/" not in dataset_name and len(dataset_name) > 1:
            if len(consts.PATH_DATASET) > 1:
                dataset_dir = P.join(consts.PATH_DATASET, dataset_name)
                if P.exists(dataset_dir) and P.isdir(dataset_dir):
                    logger.info(f"Remove file at: {dataset_dir}")
                    shutil.rmtree(dataset_dir)
                else:
                    raise ValueError(f"Dataset dir {dataset_dir} is not a dir or does not exist, may be a bug here.")
            else:
                raise ValueError("Data dir too short, can not delete.")
        else:
            raise ValueError("Dataset name contains '/' or is too short.")
def run(self) -> None:
    logger.info("[MonitorThread] loop running...")
    while True:
        time.sleep(1)
        # 1. select all running models
        models = self.experiment_service.find_running_model()
        # 2. check the process of each running model
        self.handle_models(models)
def prepare(self):
    # open a temporary file
    self.temporary_file_path = util.temporary_upload_file_path('upload_chunk')
    if not P.exists(P.dirname(self.temporary_file_path)):
        os.makedirs(P.dirname(self.temporary_file_path))
    self.temporary_file = open(self.temporary_file_path, 'wb')  # todo limit max length of file
    logger.info(f"Open path {self.temporary_file_path} to store upload file.")
    self.start_time = time.time()
    self.writed_size = 0
def start_server():
    # 1. create the web app
    application = CookaWebApplication(consts.PATH_DATABASE)
    application.listen(consts.SERVER_PORT)

    # 2. start the process monitor thread
    pm = ProcessMonitor()
    pm.start()

    # 3. start the IO loop
    logger.info(f"Cooka running at: http://0.0.0.0:{consts.SERVER_PORT}")
    tornado.ioloop.IOLoop.instance().start()
def predict(self, dataset_name, model_name, req_dict: dict):
    # 1. read params
    file_path = util.require_in_dict(req_dict, 'file_path', str)
    reserved_cols = util.get_from_dict(req_dict, 'reserved_cols', list)
    upload_took = util.require_in_dict(req_dict, 'upload_took', float)

    if reserved_cols is None or len(reserved_cols) < 1:
        reserved_cols_str = ""
    else:
        reserved_cols_str = ",".join(reserved_cols)

    # 2. check params
    with db.open_session() as s:
        self.model_dao.require_by_name(s, model_name).to_model_bean()
        dataset_stats = self.dataset_dao.require_by_name(s, dataset_name).to_dataset_stats()

    predict_job_name = util.predict_job_name(dataset_name)

    abs_file_path = P.join(consts.DATA_DIR, file_path)
    if not P.exists(abs_file_path):
        raise ValueError(f"Input file does not exist: {abs_file_path}")
    if not P.isfile(abs_file_path):
        raise ValueError(f"Input file is not a file: {abs_file_path}")

    # 3. add upload step
    upload_extension = {"file_size": P.getsize(abs_file_path)}
    upload_step = JobStep(type=PredictStepType.Upload,
                          status=JobStep.Status.Succeed,
                          took=upload_took,
                          datetime=util.get_now_long(),
                          extension=upload_extension)
    self.add_predict_process_step(model_name, predict_job_name, upload_step)

    # 4. execute command
    model_dir = util.model_dir(dataset_name, model_name)
    predict_log_path = P.join(model_dir, f"{predict_job_name}.log")

    if not dataset_stats.has_header:
        default_headers = ",".join([f.name for f in dataset_stats.features])
    else:
        default_headers = None

    command = f"nohup {sys.executable} {consts.PATH_INSTALL_HOME}/cooka/core/batch_predict_job.py --input_file_path={abs_file_path} --reserved_cols={reserved_cols_str} --model_name={model_name} --dataset_name={dataset_name} --job_name={predict_job_name} --has_header={dataset_stats.has_header} --default_headers={default_headers} --server_portal={consts.SERVER_PORTAL} 1>{predict_log_path} 2>&1 &"

    logger.info(f"Run batch predict job command: \n{command}")
    logger.info(f"Log file:\ntail -f {predict_log_path}")

    os.system(command)  # fire-and-forget background process; the job reports progress back over HTTP

    return predict_job_name
async def get(self, path, **kwargs):
    if path in self.MissingResource:
        raise tornado.web.HTTPError(404, f"File {path} is missing")

    if path in ['', '/']:
        resource_path = "index.html"
    else:
        absolute_path = self.get_absolute_path(self.root, self.parse_url_path(path))
        if not P.exists(absolute_path):
            logger.info(f"URI {path} not found, use index.html instead")
            resource_path = "index.html"  # handle 404
        else:
            resource_path = path

    await super(AssetsHandler, self).get(resource_path)
def __init__(self, database_path):
    # 1. init handlers
    handlers = self.init_handlers()

    # 2. check the database
    if not P.exists(database_path):
        database_dir = P.dirname(database_path)
        if not P.exists(database_dir):
            os.makedirs(database_dir, exist_ok=True)
        initialize_database()
        logger.info(f"Initialize database file {database_path}")

    static_path = P.join(P.dirname(P.abspath(__file__)), 'assets')
    # super(CookaApp, self).__init__(handlers, debug=True, static_path=static_path, static_url_prefix='/')
    super(CookaWebApplication, self).__init__(handlers, debug=False)
def handle_tornado_upload_file(http_handler, tornado_http_files, upload_start_time):
    # 1. check and read params
    uploaded_files = tornado_http_files.get("file")
    if uploaded_files is None or len(uploaded_files) < 1:
        raise MissingParamException("file")
    tornado_http_file = uploaded_files[0]

    file_name = tornado_http_file['filename']
    file_body = tornado_http_file['body']
    file_size = util.human_data_size(len(file_body))
    file_suffix = util.get_file_suffix(file_name)
    assert file_suffix in ['.csv', '.tsv'], \
        'Please check whether your file suffix is in [.csv, .tsv], current is: %s' % file_suffix

    # make the name readable in urls and on disk
    origin_file_name = util.make_dataset_name(util.cut_suffix(file_name)) + file_suffix

    # 2. open a temporary file and write the upload to local disk
    temporary_file_path = util.temporary_upload_file_path(origin_file_name)
    if not P.exists(P.dirname(temporary_file_path)):
        os.makedirs(P.dirname(temporary_file_path))
    logger.info(f"Open path {temporary_file_path} to store upload file.")
    with open(temporary_file_path, 'wb') as f:
        f.write(file_body)

    logger.info(f"Uploaded file finished at {temporary_file_path}, file size {file_size}.")
    upload_took = util.time_diff(time.time(), upload_start_time)

    # 3. response
    # relative_path = temporary_file_path[len(consts.PATH_DATA_ROOT)+1:]  # relative path does not start with /
    response = {
        "path": util.relative_path(P.abspath(temporary_file_path)),
        "size": file_size,
        "took": upload_took
    }
    http_handler.response_json(response)
def callback(url, type, status, took, extension, **kwargs):
    req_body_dict = {
        "type": type,
        "status": status,
        "took": took,
        "datetime": util.get_now_long(),
        "extension": extension
    }
    req_body = util.dumps(req_body_dict)
    logger.info(f"Send process event: \n{url}\n{req_body}")
    # Note: the http body should be bytes, otherwise it gets encoded by "requests" using iso-8859-1
    response = requests.post(url, data=req_body.encode('utf-8'), timeout=TIMEOUT, headers=HEADERS)
    _checkout_response_json(response)
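# Standalone illustration of the encoding note in callback() above; this is plain
# str.encode behavior, not Cooka-specific code. latin-1 cannot represent most
# non-ASCII characters, so the JSON string is encoded to utf-8 bytes explicitly.
payload = '{"dataset_name": "数据集"}'
payload.encode("utf-8")  # what callback() sends on the wire
try:
    payload.encode("iso-8859-1")  # what a plain str body may be encoded with
except UnicodeEncodeError as e:
    print(f"latin-1 cannot encode this payload: {e}")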
def post(self, *args, **kwargs):
    # 1. close the file
    file_size = util.human_data_size(self.writed_size)
    logger.info(f"Uploaded file finished at {self.temporary_file_path}, file size {file_size}.")
    if self.temporary_file is not None:
        self.temporary_file.flush()
        self.temporary_file.close()

    # 2. parse the multipart body that was streamed to the temporary file
    content_type = self.request.headers['Content-Type']
    with open(self.temporary_file_path, 'rb') as f:
        fields = content_type.split(";")
        for field in fields:
            k, sep, v = field.strip().partition("=")
            if k == "boundary" and v:
                from tornado.escape import utf8
                files = {}
                httputil.parse_multipart_form_data(utf8(v), f.read(), {}, files)
                handle_tornado_upload_file(self, files, self.start_time)
                return

    raise Exception("Handle upload failed.")
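# Minimal standalone sketch of how tornado fills the files dict that
# handle_tornado_upload_file() consumes; the boundary and body below are made up
# for illustration, everything else is the real tornado API.
from tornado import httputil
from tornado.escape import utf8

boundary = "simpleboundary"
body = (
    "--simpleboundary\r\n"
    'Content-Disposition: form-data; name="file"; filename="demo.csv"\r\n'
    "\r\n"
    "a,b\r\n1,2\r\n"
    "--simpleboundary--\r\n"
)
arguments, files = {}, {}
httputil.parse_multipart_form_data(utf8(boundary), utf8(body), arguments, files)
print(files["file"][0]["filename"])  # -> demo.csv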
def recommended_train_configuration(self, dataset_name, req_dict):
    datetime_series_col = None  # todo support datetime_series
    target_col = req_dict.get('target_col')

    # 1. read the last train job
    with db.open_session() as s:
        # 1.1. check the dataset name
        dataset_stats = self.dataset_dao.require_by_name(s, dataset_name).to_dataset_stats()
        # 1.2. query models
        last_model = self.model_dao.checkout_one(self.model_dao.find_by_dataset_name(s, dataset_name, 1, 1)[0])

    # 2. infer conf
    if last_model is None:
        experiment_conf = self.get_recommended_conf_as_new(dataset_name, dataset_stats, target_col)
    else:
        _experiment_conf: ExperimentConf = self.get_recommended_conf_from_history(last_model)
        if target_col is not None and _experiment_conf.label_col != target_col:
            log.info(f"Change label from {_experiment_conf.label_col} to {target_col}, "
                     + f"and recommend params using the new target.")
            experiment_conf = self.get_recommended_conf_as_new(dataset_name, dataset_stats, target_col)
        else:
            # use history
            experiment_conf = _experiment_conf

    train_validation_holdout = experiment_conf.train_validation_holdout
    cross_validation = experiment_conf.cross_validation

    conf = {
        "label_col": experiment_conf.label_col,
        "task_type": experiment_conf.task_type,
        "pos_label": experiment_conf.pos_label,
        "train_mode": experiment_conf.train_mode,
        "partition_strategy": ExperimentConf.PartitionStrategy.TrainValidationHoldout,
        "train_validation_holdout": train_validation_holdout.to_dict() if train_validation_holdout is not None else None,
        "cross_validation": cross_validation.to_dict() if cross_validation is not None else None,
        "datetime_series_col": datetime_series_col
    }
    return conf
def run_train_job(self, framework, conf: ExperimentConf, no_experiment: int, model_input_features: list, n_rows: int):
    # 1. create train conf
    job_name = f"train_job_{conf.dataset_name}_{framework}_{util.human_datetime()}"
    brevity_framework_dict = {FrameworkType.DeepTables: "dt", FrameworkType.GBM: "gbm"}
    model_name = util.model_name(conf.dataset_name, no_experiment)  # f"{conf.dataset_name}_{no_experiment}"
    model_dir = util.model_dir(conf.dataset_name, model_name)
    os.makedirs(model_dir)

    train_source_code_path = P.join(model_dir, 'train.py')
    train_log = P.join(model_dir, "train.log")

    train_job_conf = TrainJobConf(framework=framework,
                                  name=job_name,
                                  model_name=model_name,
                                  searcher=TrainJobConf.Searcher.MCTSSearcher,
                                  max_trails=consts.TRAIN_MODE_MAX_TRAILS_MAPPING[conf.train_mode],
                                  search_space=TrainJobConf.SearchSpace.Minimal)

    # 2. insert into db
    with db.open_session() as s:
        self.create_temporary_model(s, model_name, no_experiment, model_input_features, conf, train_job_conf)

    # 3. generate train source code
    train_source_code, notebook_content = self.generate_code(model_name, model_input_features, n_rows, train_job_conf, conf)
    with open(train_source_code_path, 'w', encoding='utf-8') as f:
        f.write(train_source_code)

    notebook_file_path = P.join(model_dir, 'train.ipynb')
    with open(notebook_file_path, 'w', encoding='utf-8') as f:
        f.write(notebook_content)

    # 4. run the train process
    # Note: if '&' is appended to the command, the training process id will be off by one (plus 1), which causes a bug
    command = f"nohup {sys.executable} {train_source_code_path} 1>{train_log} 2>&1"

    log.info(f"Run train job command: \n{command}")
    log.info(f"Log file:\ntail -f {train_log}")
    log.info(f"Train source code:\n{train_source_code_path}")

    train_process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.PIPE)
    with db.open_session() as s:
        self.model_dao.update_model_by_name(s, model_name, {"pid": train_process.pid})

    return train_job_conf.to_dict()
def _handle_label_col(self, dataset_name, label_col, file_path):
    # calc correlation
    # 1. update the label col first, to avoid the correlation job's http request arriving before label_col is updated
    with db.open_session() as s:
        self.dataset_dao.update_by_name(s, dataset_name, {"label_col": label_col})

    # 2. start a process
    analyze_pearson_job_name = util.analyze_data_job_name(P.basename(file_path))
    std_log = P.join(util.dataset_dir(dataset_name), f"{analyze_pearson_job_name}.log")
    command = f"nohup {sys.executable} {util.script_path('analyze_correlation_job.py')} --dataset_name={dataset_name} --label_col={label_col} --job_name={analyze_pearson_job_name} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1"

    calc_correlation_process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.PIPE)

    log.info(f"Run calculate pearson command: \n{command}")
    log.info(f"Log file:\ntail -f {std_log}")
    log.info(f"Process id is {calc_correlation_process.pid}")
def _infer_task_type(self, f: Feature):
    n_unique = f.unique.value
    if n_unique == 2:
        log.info('2 classes detected, so inferred as a [binary classification] task')
        return TaskType.BinaryClassification
    elif 'float' in f.data_type:
        log.info('Target column type is float, so inferred as a [regression] task.')
        return TaskType.Regression
    elif n_unique > 1000:
        if 'int' in f.data_type:
            log.info('The number of classes exceeds 1000 and the column type is int, so inferred as a [regression] task')
            return TaskType.Regression
        else:
            raise ValueError('The number of classes exceeds 1000, please confirm whether your predict target is correct')
    else:
        log.info(f'{n_unique} classes detected, inferred as a [multiclass classification] task')
        return TaskType.MultiClassification
# 3. read df
df = util.read_csv(util.abs_path(dataset_stats.file_path),
                   dataset_stats.has_header,
                   dataset_stats.features_names)
df = dataset_util.cast_df(df, dataset_detail['features'], True)
y = df[label_col]

# 4. encode y if it is categorical
# Do not calculate if the label is categorical
# for f in dataset_stats.features:
#     if f.name == label_col:
#         if f.type == FeatureType.Categorical:
#             logger.info(f"Encode label column {label_col} because type is {f.type}. ")
#             y = pd.Series(LabelEncoder().fit_transform(y), name=label_col)

# 5. encode categorical features
pearson_corr_dict = {}
for f in dataset_stats.features:
    if f.type == FeatureType.Categorical:
        logger.info(f"Skip categorical feature {f.name}")
        # lb = LabelEncoder()
        # encoded_series = pd.Series(lb.fit_transform(df[f.name]), name=f.name)
        # pearson_corr_dict[f.name] = y.corr(encoded_series, method='pearson')
        pearson_corr_dict[f.name] = None
    elif f.type in [FeatureType.Continuous, FeatureType.Datetime]:
        pearson_corr_dict[f.name] = y.corr(df[f.name], method='pearson')
    else:
        logger.info(f"Feature {f.name} type is {f.type}, skipped calculating corr.")
        pearson_corr_dict[f.name] = None  # text features are not supported

extension = {"corr": pearson_corr_dict, "label_col": label_col}

# 6. send back the calc result
# 2. retrieve dataset info
dataset_detail = client.retrieve_dataset(server_portal, dataset_name)
dataset_stats = DatasetStats.load_dict(dataset_detail)

# 3. read df
df = util.read_csv(util.abs_path(dataset_stats.file_path),
                   dataset_stats.has_header,
                   dataset_stats.features_names)
df = dataset_util.cast_df(df, dataset_detail['features'], True)
y = df[label_col]

# 4. encode y if it is categorical
for f in dataset_stats.features:
    if f.name == label_col:
        if f.type == FeatureType.Categorical:
            logger.info(f"Encode label column {label_col} because type is {f.type}.")
            y = pd.Series(LabelEncoder().fit_transform(y), name=label_col)

# 5. encode categorical features
pearson_corr_dict = {}
for f in dataset_stats.features:
    if f.type == FeatureType.Categorical:
        lb = LabelEncoder()
        logger.info(f"Encode categorical feature {f.name}")
        encoded_series = pd.Series(lb.fit_transform(df[f.name]), name=f.name)
        pearson_corr_dict[f.name] = y.corr(encoded_series, method='pearson')
    elif f.type in [FeatureType.Continuous, FeatureType.Datetime]:
        pearson_corr_dict[f.name] = y.corr(df[f.name], method='pearson')
    else:
        logger.info(f"Feature {f.name} type is {f.type}, skipped calculating corr.")
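# Standalone sketch of the correlation computed above, assuming only pandas and
# scikit-learn: encode a categorical column with LabelEncoder, then take the
# Pearson correlation against the label. The toy data is made up for illustration.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

y = pd.Series([0, 1, 1, 0, 1], name="label")
color = pd.Series(["red", "blue", "blue", "red", "green"], name="color")

encoded = pd.Series(LabelEncoder().fit_transform(color), name="color")
print(y.corr(encoded, method="pearson"))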
def experiment(self, req_dict: dict):
    # 1. read params
    label_col = util.require_in_dict(req_dict, 'label_col', str)
    pos_label = util.get_from_dict(req_dict, 'pos_label', object)
    train_mode = util.get_from_dict(req_dict, 'train_mode', str)
    partition_strategy = util.require_in_dict(req_dict, 'partition_strategy', str)
    dataset_name = util.require_in_dict(req_dict, 'dataset_name', str)
    holdout_percentage = util.require_in_dict(req_dict, 'holdout_percentage', int)
    # todo check datetime_series_col
    datetime_series_col = util.get_from_dict(req_dict, 'datetime_series_col', str)
    experiment_engine = util.require_in_dict(req_dict, 'experiment_engine', str)
    if experiment_engine not in [FrameworkType.GBM, FrameworkType.DeepTables]:
        raise ValueError(f"Unsupported experiment_engine {experiment_engine}")

    # 2. check partition_strategy
    cross_validation = None
    train_validation_holdout = None
    if partition_strategy == ExperimentConf.PartitionStrategy.CrossValidation:
        cross_validation_dict = util.require_in_dict(req_dict, 'cross_validation', dict)
        n_folds = util.require_in_dict(cross_validation_dict, 'n_folds', int)
        if 1 < n_folds <= 50:
            cross_validation = CrossValidation(n_folds=n_folds, holdout_percentage=holdout_percentage)
        else:
            raise ValueError(f"Expected 1 < n_folds <= 50 but got: {n_folds}")
    elif partition_strategy == ExperimentConf.PartitionStrategy.TrainValidationHoldout:
        train_validation_holdout_dict = util.require_in_dict(req_dict, 'train_validation_holdout', dict)
        train_percentage = util.require_in_dict(train_validation_holdout_dict, 'train_percentage', int)
        validation_percentage = util.require_in_dict(train_validation_holdout_dict, 'validation_percentage', int)
        if train_percentage + validation_percentage + holdout_percentage != 100:
            raise ValueError("train_percentage plus validation_percentage plus holdout_percentage should equal 100.")
        train_validation_holdout = TrainValidationHoldout(train_percentage=train_percentage,
                                                          validation_percentage=validation_percentage,
                                                          holdout_percentage=holdout_percentage)
    else:
        raise ValueError(f"Unknown partition strategy = {partition_strategy}")

    # 3. retrieve data
    with db.open_session() as s:
        # 3.1. check the dataset
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        if dataset is None:
            raise ValueError(f"Dataset={dataset_name} not exists.")
        dataset_stats = dataset.to_dataset_stats()
        # 3.2. generate a new experiment number
        no_experiment = self.model_dao.get_max_experiment(s, dataset_name) + 1

    # 4. ensure the dataset label is the latest
    if dataset_stats.label_col is None:
        log.info(f"Dataset {dataset_name} label_col not set yet, update to {label_col}")
        self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)
    elif dataset_stats.label_col != label_col:
        log.info(f"Dataset {dataset_name} label_col currently is {dataset_stats.label_col}, but this experiment updates it to {label_col}")
        self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)

    # 5. calc task type
    # 5.1. find the label
    label_f = self._find_feature(dataset_stats.features, label_col)
    if label_f is None:
        raise ValueError(f"Label col = {label_col} is not in dataset {dataset_name}.")
    task_type = self._infer_task_type(label_f)

    # 5.2. check pos_label
    if task_type == TaskType.BinaryClassification:
        if pos_label is None:
            raise ValueError("Pos label can not be None for a binary classification task")
        elif isinstance(pos_label, str) and len(pos_label) < 1:
            raise ValueError("Pos label can not be empty for a binary classification task")
    # 6. run the experiment
    if not dataset_stats.has_header:
        dataset_default_headers = [f.name for f in dataset_stats.features]
    else:
        dataset_default_headers = None

    conf = ExperimentConf(dataset_name=dataset_name,
                          dataset_has_header=dataset_stats.has_header,
                          dataset_default_headers=dataset_default_headers,
                          train_mode=train_mode,
                          label_col=label_col,
                          pos_label=pos_label,
                          task_type=task_type,
                          partition_strategy=partition_strategy,
                          cross_validation=cross_validation,
                          train_validation_holdout=train_validation_holdout,
                          datetime_series_col=datetime_series_col,
                          file_path=dataset_stats.file_path)

    model_input_features = list(map(lambda _: ModelFeature(name=_.name, type=_.type, data_type=_.data_type).to_dict(),
                                    filter(lambda _: _.name != label_f.name, dataset_stats.features)))

    if experiment_engine == FrameworkType.GBM:
        train_conf = self.run_train_job(FrameworkType.GBM, conf, no_experiment, model_input_features, dataset_stats.n_rows)
    else:
        train_conf = self.run_train_job(FrameworkType.DeepTables, conf, no_experiment, model_input_features, dataset_stats.n_rows)

    return {
        "no_experiment": no_experiment,
        "experiment_conf": conf.to_dict(),
        "train_job_conf": train_conf
    }
def create_temporary_dataset(self, req_dict):
    sample_strategy = util.require_in_dict(req_dict, 'sample_strategy', str, 'random_rows')
    if SampleConf.Strategy.Percentage == sample_strategy:
        percentage = util.get_from_dict(req_dict, 'percentage', int, 30)
        n_rows = None
    elif SampleConf.Strategy.RandomRows == sample_strategy:
        n_rows = util.get_from_dict(req_dict, 'n_rows', int, 1000)
        percentage = None
    elif SampleConf.Strategy.WholeData == sample_strategy:
        n_rows = None
        percentage = None
    else:
        raise ValueError(f"Unsupported sample strategy: {sample_strategy}")

    upload_took = util.require_in_dict(req_dict, 'upload_took', float)
    file_path = util.require_in_dict(req_dict, 'file_path', str)
    source_type = util.require_in_dict(req_dict, 'source_type', str)

    sample_conf = SampleConf(sample_strategy=sample_strategy, percentage=percentage, n_rows=n_rows)

    # 1. validate params
    if source_type not in [DatasetEntity.SourceType.Upload, DatasetEntity.SourceType.Import]:
        raise IllegalParamException('source_type', source_type,
                                    f'Should be in {",".join([DatasetEntity.SourceType.Upload, DatasetEntity.SourceType.Import])}')

    if source_type == DatasetEntity.SourceType.Upload:
        upload_file_prefix = P.join(consts.FIELD_TMP, consts.FIELD_UPLOAD)
        if not file_path.startswith(upload_file_prefix):
            raise ValueError(f"For an uploaded file the path should start with {upload_file_prefix} but it's {file_path}")
        else:
            # fix relative path
            file_path = P.join(consts.DATA_DIR, file_path)

    if not P.exists(file_path):
        raise ValueError(f"File={file_path} not exists")
    if not P.isfile(file_path):
        raise ValueError(f"File={file_path} is not a file")

    util.validate_sample_conf(sample_conf)

    # 2. create
    if source_type == DatasetEntity.SourceType.Upload:
        return self._create_temporary_dataset(source_type, file_path, upload_took, sample_conf)
    elif source_type == DatasetEntity.SourceType.Import:
        t1 = time.time()
        internal_path = util.temporary_upload_file_path(P.basename(file_path))
        os.makedirs(P.dirname(internal_path), exist_ok=True)
        shutil.copy(file_path, internal_path)
        took = time.time() - t1
        logger.info(f"Copy file to {internal_path}")
        return self._create_temporary_dataset(source_type, internal_path, took, sample_conf)
    else:
        raise IllegalParamException('source_type', source_type,
                                    f'Should be one of {",".join([DatasetEntity.SourceType.Upload, DatasetEntity.SourceType.Import])}')
def _create_temporary_dataset(self, source_type, file_path, took, sample_conf: SampleConf):
    # 1. prepare names and file info
    now = util.get_now_datetime()
    file_name = P.basename(file_path)
    temporary_dataset_name = self.choose_temporary_dataset_name(file_name)  # use a long name
    analyze_job_name = util.analyze_data_job_name(util.cut_suffix(file_name), now)
    file_size = P.getsize(file_path)

    # 2. create the record
    td = DatasetEntity(name=temporary_dataset_name,
                       file_size=file_size,
                       is_temporary=True,
                       status=DatasetEntity.Status.Created,
                       source_type=source_type,
                       file_path=file_path,
                       file_name=file_name,
                       create_datetime=now,
                       last_update_datetime=now)
    with db.open_session() as s:
        s.add(td)

    # 3. send the file transfer step
    if source_type == DatasetEntity.SourceType.Upload:
        step = JobStep(type=AnalyzeStep.Types.Upload,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)
    elif source_type == DatasetEntity.SourceType.Import:
        step = JobStep(type=AnalyzeStep.Types.Copy,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)

    # 4. create the analyze config
    conf = AnalyzeJobConf(job_name=analyze_job_name,
                          dataset_name=temporary_dataset_name,
                          sample_conf=sample_conf,
                          path=file_path,
                          temporary_dataset=True,
                          label_col=None)

    # 5. start a new process
    analyze_config_string = util.dumps(conf.to_dict())
    logger.info(f"Analyze job conf: {analyze_config_string}")

    python_executable = sys.executable

    temporary_dataset_dir = util.temporary_dataset_dir(temporary_dataset_name)
    os.makedirs(temporary_dataset_dir, exist_ok=True)
    std_log = P.join(temporary_dataset_dir, f"{analyze_job_name}.log")

    command = f"nohup {python_executable} {util.script_path('analyze_job.py')} --file_path={file_path} --job_name={analyze_job_name} --dataset_name={temporary_dataset_name} --sample_strategy={sample_conf.sample_strategy} --n_rows={self.replace_None(sample_conf.n_rows)} --percentage={self.replace_None(sample_conf.percentage)} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1 &"

    logger.info(f"Run analyze job command: \n{command}")
    logger.info(f"Log file:\ntail -f {std_log}")

    # JobManager.instance().run_job(job)
    os.system(command)  # start the analyze job as a detached background process

    return temporary_dataset_name, analyze_job_name