Example #1
    def predict(self, dataset_name, model_name, req_dict: dict):
        # 1. read params
        file_path = util.require_in_dict(req_dict, 'file_path', str)
        reserved_cols = util.get_from_dict(req_dict, 'reserved_cols', list)
        upload_took = util.require_in_dict(req_dict, 'upload_took', float)

        if reserved_cols is None or len(reserved_cols) < 1:
            reserved_cols_str = ""
        else:
            reserved_cols_str = ",".join(reserved_cols)

        # 2. check params
        with db.open_session() as s:
            self.model_dao.require_by_name(s, model_name).to_model_bean()
            dataset_stats = self.dataset_dao.require_by_name(
                s, dataset_name).to_dataset_stats()
        predict_job_name = util.predict_job_name(dataset_name)
        abs_file_path = P.join(consts.DATA_DIR, file_path)
        if not P.exists(abs_file_path):
            raise ValueError(f"Input file does not exist: {abs_file_path}")
        if not P.isfile(abs_file_path):
            raise ValueError(f"Input path is not a file: {abs_file_path}")

        # 3. add upload step
        upload_extension = {"file_size": P.getsize(abs_file_path)}
        upload_step = JobStep(type=PredictStepType.Upload,
                              status=JobStep.Status.Succeed,
                              took=upload_took,
                              datetime=util.get_now_long(),
                              extension=upload_extension)
        self.add_predict_process_step(model_name, predict_job_name,
                                      upload_step)

        # 4. execute command
        model_dir = util.model_dir(dataset_name, model_name)
        predict_log_path = P.join(model_dir, f"{predict_job_name}.log")
        if not dataset_stats.has_header:
            default_headers = ",".join(
                [f.name for f in dataset_stats.features])
        else:
            default_headers = None

        command = f"nohup {sys.executable} {consts.PATH_INSTALL_HOME}/cooka/core/batch_predict_job.py --input_file_path={abs_file_path} --reserved_cols={reserved_cols_str} --model_name={model_name} --dataset_name={dataset_name} --job_name={predict_job_name} --has_header={dataset_stats.has_header} --default_headers={default_headers}  --server_portal={consts.SERVER_PORTAL} 1>{predict_log_path} 2>&1 &"

        logger.info(f"Run analyze job command: \n{command}")
        logger.info(f"Log file:\ntail -f {predict_log_path}")
        os.system(command)  # launch the batch predict job as a detached background process

        return predict_job_name
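
The parameter helpers util.require_in_dict and util.get_from_dict are used above but not shown. The sketch below captures the behavior they are assumed to have (a required key raises on absence or type mismatch, an optional key returns None when missing); the real util module may use different exception types.

def require_in_dict(d: dict, key: str, expected_type):
    # Assumed behavior: the key must exist and match the expected type.
    if key not in d:
        raise ValueError(f"Missing required param: {key}")
    value = d[key]
    if not isinstance(value, expected_type):
        raise ValueError(
            f"Param '{key}' should be {expected_type.__name__}, "
            f"got {type(value).__name__}")
    return value


def get_from_dict(d: dict, key: str, expected_type):
    # Assumed behavior: optional key, returns None when absent.
    value = d.get(key)
    if value is not None and not isinstance(value, expected_type):
        raise ValueError(
            f"Param '{key}' should be {expected_type.__name__}, "
            f"got {type(value).__name__}")
    return value
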
Example #2
def callback(url, type, status, took, extension, **kwargs):
    req_body_dict = {
        "type": type,
        "status": status,
        "took": took,
        "datetime": util.get_now_long(),
        "extension": extension
    }

    req_body = util.dumps(req_body_dict)
    logger.info(f"Send process event: \n{url}\n{req_body}")
    # Note: the body must be passed as bytes; otherwise "requests" encodes the str itself using ISO-8859-1
    response = requests.post(url,
                             data=req_body.encode('utf-8'),
                             timeout=TIMEOUT,
                             headers=HEADERS)
    _checkout_response_json(response)
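
_checkout_response_json is called above but not defined in this snippet. A minimal sketch of the kind of check it presumably performs follows; the success convention used here (HTTP 200 plus a JSON body with "code" == 0 and a "data" field) is an assumption for illustration, not the project's confirmed response schema.

def _checkout_response_json(response):
    # Assumed behavior: fail fast on HTTP errors, then validate the JSON body.
    if response.status_code != 200:
        raise Exception(
            f"Request failed, status={response.status_code}, body={response.text}")
    body = response.json()
    if body.get("code") != 0:  # assumed success flag
        raise Exception(f"Server returned an error: {body}")
    return body.get("data")
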
Example #3
    def _create_temporary_dataset(self, source_type, file_path, took,
                                  sample_conf: SampleConf):
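        # 1. prepare job names and basic file info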
        now = util.get_now_datetime()
        file_name = P.basename(file_path)
        temporary_dataset_name = self.choose_temporary_dataset_name(
            file_name)  # use a long name
        analyze_job_name = util.analyze_data_job_name(
            util.cut_suffix(file_name), now)
        file_size = P.getsize(file_path)

        # 2. create record
        td = DatasetEntity(name=temporary_dataset_name,
                           file_size=file_size,
                           is_temporary=True,
                           status=DatasetEntity.Status.Created,
                           source_type=source_type,
                           file_path=file_path,
                           file_name=file_name,
                           create_datetime=now,
                           last_update_datetime=now)
        with db.open_session() as s:
            s.add(td)

        # 3. send file transfer step
        if source_type == DatasetEntity.SourceType.Upload:
            step = JobStep(type=AnalyzeStep.Types.Upload,
                           status=AnalyzeStep.Status.Succeed,
                           extension={
                               "file_size": file_size,
                               "file_path": file_path
                           },
                           took=took,
                           datetime=util.get_now_long())
            self.add_analyze_process_step(temporary_dataset_name,
                                          analyze_job_name, step)
        elif source_type == DatasetEntity.SourceType.Import:
            step = JobStep(type=AnalyzeStep.Types.Copy,
                           status=AnalyzeStep.Status.Succeed,
                           extension={
                               "file_size": file_size,
                               "file_path": file_path
                           },
                           took=took,
                           datetime=util.get_now_long())
            self.add_analyze_process_step(temporary_dataset_name,
                                          analyze_job_name, step)

        # 4. create analyze config
        conf = AnalyzeJobConf(job_name=analyze_job_name,
                              dataset_name=temporary_dataset_name,
                              sample_conf=sample_conf,
                              path=file_path,
                              temporary_dataset=True,
                              label_col=None)

        # 5. start new process
        analyze_config_string = util.dumps(conf.to_dict())
        logger.info(f"Analyze job conf: {analyze_config_string}")

        python_executable = sys.executable

        temporary_dataset_dir = util.temporary_dataset_dir(
            temporary_dataset_name)

        os.makedirs(temporary_dataset_dir, exist_ok=True)

        std_log = P.join(temporary_dataset_dir, f"{analyze_job_name}.log")

        command = f"nohup {python_executable} {util.script_path('analyze_job.py')} --file_path={file_path} --job_name={analyze_job_name} --dataset_name={temporary_dataset_name} --sample_strategy={sample_conf.sample_strategy} --n_rows={self.replace_None(sample_conf.n_rows)} --percentage={self.replace_None(sample_conf.percentage)} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1 &"

        logger.info(f"Run analyze job command: \n{command}")
        logger.info(f"Log file:\ntail -f {std_log}")

        # JobManager.instance().run_job(job)
        os.system(command)  # launch the analyze job as a detached background process

        return temporary_dataset_name, analyze_job_name
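
Both examples start their background job with os.system, nohup, and shell redirection. The sketch below shows an alternative launch via subprocess.Popen (not what the snippets above do) that avoids shell quoting issues and returns the child PID; launch_detached and its parameters are illustrative names, not part of the project.

import subprocess
import sys


def launch_detached(script_path: str, args: dict, log_path: str) -> int:
    # Build the argument vector explicitly instead of interpolating a shell string.
    cmd = [sys.executable, script_path] + [f"--{k}={v}" for k, v in args.items()]
    with open(log_path, "ab") as log:
        # start_new_session=True detaches the child from the caller's session,
        # similar in effect to the nohup ... & pattern used above.
        proc = subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT,
                                start_new_session=True)
    return proc.pid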