Example #1
    def retrieve(self, dataset_name, n_top_value):
        with db.open_session() as s:
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            dict_value = util.sqlalchemy_obj_to_dict(dataset)
            dict_value['file_path'] = util.relative_path(dataset.file_path)
            # if dataset.status == DatasetEntity.Status.Analyzed:
            #     for i, f in enumerate(dict_value['features']):
            #         if f['type'] in [FeatureType.Categorical, FeatureType.Continuous]:
            #             if f['unique']['value'] > n_top_value:
            #                 # calc top {n_top_value}
            #                 extension = f['extension']
            #                 extension['value_count'] = sorted(extension['value_count'], key=lambda _: _['value'])
            #
            #                 top_value_count = extension['value_count'][: n_top_value]
            #                 remain_value_count = extension['value_count'][n_top_value:]
            #                 remain_count = 0
            #                 for remain_dict in remain_value_count:
            #                     remain_count = remain_count + remain_dict['value']
            #
            #                 top_value_count.append(
            #                     FeatureValueCount(type="Remained_SUM", value=remain_count).to_dict()
            #                 )
            #                 dict_value['features'][i]['extension']['value_count'] = top_value_count

            # dict_value['detail'] = dict_value['extension']
            extension = dict_value.pop('extension')
            dict_value['extension'] = {"sample_conf": extension['sample_conf']}
            return dict_value
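
A minimal standalone sketch of the top-N value_count reduction outlined in the commented-out block above, assuming each entry is a dict of the shape produced by FeatureValueCount.to_dict() (a "type" label plus a "value" occurrence count); the helper name top_n_value_count and the descending sort order are assumptions, not taken from the source.

def top_n_value_count(value_count, n_top_value):
    # rank categories by occurrence count, most frequent first (assumed reading of "top")
    ranked = sorted(value_count, key=lambda vc: vc['value'], reverse=True)
    top = ranked[:n_top_value]
    # fold everything beyond the top N into a single "Remained_SUM" bucket
    remain_count = sum(vc['value'] for vc in ranked[n_top_value:])
    top.append({"type": "Remained_SUM", "value": remain_count})
    return top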
Example #2
    def retrieve_model(self, model_name):

        def _replace_NaN(v):
            # NaN is not JSON serializable; map it to None and keep everything else
            if isinstance(v, float) and math.isnan(v):
                return None
            return v

        with db.open_session() as s:
            model = self.require_model(s, model_name)

        # handle trails
        if model.trails is not None and len(model.trails) > 0:
            param_names = list(model.trails[0].params)
            trail_data_dict_list = []

            trail_params_values = []
            trail_index = []
            for t in model.trails:
                param_values = [_replace_NaN(t.params.get(n)) for n in param_names]
                trail_params_values.append(param_values)
                trail_index.append(t.trail_no)
            df_train_params = pd.DataFrame(data=trail_params_values, columns=param_names)
            # drop parameter columns whose values are all None
            df_train_params.dropna(axis=1, how='all', inplace=True)

            for i, t in enumerate(model.trails):
                trail_data_dict = {
                    "reward": t.reward,
                    "params": [_replace_NaN(_) for _ in df_train_params.iloc[i].tolist()],
                    "elapsed": t.elapsed,
                }
                trail_data_dict_list.append(trail_data_dict)

            if len(df_train_params.columns) > 0:  # ensure not every param is None
                trails_dict = {
                    "param_names": df_train_params.columns.tolist(),
                    "data": trail_data_dict_list
                }
            else:
                trails_dict = {}
        else:
            trails_dict = {}

        model_dict = model.to_dict()
        # update trails
        model_dict['trails'] = trails_dict
        model_dict['model_path'] = util.relative_path(model_dict['model_path'])
        model_dict['escaped'] = model.escaped_time_by_seconds()
        model_dict['log_file_path'] = model.log_file_path()
        model_dict['train_source_code_path'] = model.train_source_code_path()
        model_dict['train_notebook_uri'] = model.train_notebook_uri()

        return model_dict
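
For reference, a small self-contained illustration of the pandas step used above: per-trial parameter dicts are turned into a DataFrame and dropna(axis=1, how='all') removes hyperparameters that were None in every trial. The parameter names and values below are made up.

import pandas as pd

params = [
    {"max_depth": 5, "learning_rate": 0.1, "gamma": None},
    {"max_depth": 7, "learning_rate": None, "gamma": None},
]
df = pd.DataFrame(params)
df.dropna(axis=1, how='all', inplace=True)   # 'gamma' is missing in every row and gets dropped
print(df.columns.tolist())                   # ['max_depth', 'learning_rate']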
Example #3
        def _handle(model_dao, session, dataset: DatasetEntity):
            d = util.sqlalchemy_obj_to_dict(dataset)
            d['file_path'] = util.relative_path(dataset.file_path)
            d['create_datetime'] = util.to_timestamp(dataset.create_datetime)
            d['n_experiments'] = model_dao.query_n_experiment(
                session, dataset.name)

            del d['features']
            del d['feature_summary']
            del d['extension']

            return d
Example #4
def handle_tornado_upload_file(http_handler, tornado_http_files,
                               upload_start_time):
    # 1. check and read params
    http_file_list = tornado_http_files.get("file")
    if not http_file_list:
        raise MissingParamException("file")
    tornado_http_file = http_file_list[0]

    file_name = tornado_http_file['filename']
    file_body = tornado_http_file['body']
    file_size = util.human_data_size(len(file_body))
    file_suffix = util.get_file_suffix(file_name)

    assert file_suffix in [
        '.csv', '.tsv'
    ], "Unsupported file suffix '%s', expected one of [.csv, .tsv]" % file_suffix

    origin_file_name = util.make_dataset_name(util.cut_suffix(
        file_name)) + file_suffix  # keep the name URL-safe and readable as a disk path

    # 2. write the upload body to a local temporary file
    temporary_file_path = util.temporary_upload_file_path(origin_file_name)

    os.makedirs(P.dirname(temporary_file_path), exist_ok=True)

    logger.info(f"Open path {temporary_file_path} to store upload file.")

    with open(temporary_file_path, 'wb') as f:
        f.write(file_body)
    logger.info(
        f"Upload finished at {temporary_file_path}, file size {file_size}."
    )

    upload_took = util.time_diff(time.time(), upload_start_time)

    # 3. response
    # relative_path = temporary_file_path[len(consts.PATH_DATA_ROOT)+1:]  # relative path not start with /
    response = {
        "path": util.relative_path(P.abspath(temporary_file_path)),
        "size": file_size,
        "took": upload_took
    }
    http_handler.response_json(response)
Example #5
    def train_notebook_uri(self):
        # exists only after training has started
        train_notebook_path = P.join(str(self.model_path), 'train.ipynb')
        return util.relative_path(train_notebook_path)
Example #6
    def train_source_code_path(self):
        # exists only after training has started
        return util.relative_path(P.join(str(self.model_path), 'train.py'))
Example #7
    def log_file_path(self):
        # exists only after training has started
        return util.relative_path(P.join(str(self.model_path), 'train.log'))
Example #8
    def preview(self, dataset_name: str, page_num: int,
                page_size: int) -> RespPreviewDataset:
        """
        Args:
            dataset_name:
            page_num: start from 1
            page_size:

        Returns:

        """
        # 1. validate params
        if page_num < 1:
            raise ValueError("Param page_num should be >= 1")

        if page_size < 1:
            raise ValueError("Param page_size should be >= 1")

        # 2. retrieve dataset
        with db.open_session() as s:
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            file_path = dataset.file_path
            if not P.exists(file_path):
                raise FileNotFoundError(file_path)
            dataset_stats = dataset.to_dataset_stats()

        relative_file_path = util.relative_path(dataset_stats.file_path)

        # 3. read data
        dataset_headers = [f.name for f in dataset_stats.features]
        dataset_headers.insert(0, "No. ")
        # dataset_headers.insert(0, "number")
        if dataset_stats.has_header:
            iterator_df = pd.read_csv(file_path, chunksize=page_size)
        else:
            iterator_df = pd.read_csv(file_path,
                                      chunksize=page_size,
                                      header=None)

        # 4. seek pages; page_num starts from 1
        # e.g. if page_num == 1 the while loop runs 0 times and the code below calls next(iterator_df) directly to get the data
        current_page = 1
        while current_page < page_num:
            try:
                next(iterator_df)  # skipped chunk keeps no reference and will be garbage collected
                current_page = current_page + 1
            except StopIteration:
                # page_num is beyond the last page, return no rows
                return RespPreviewDataset(headers=dataset_headers,
                                          rows=None,
                                          count=dataset_stats.n_rows,
                                          file_path=relative_file_path)

        # 5. hit data
        try:
            page_df: pd.DataFrame = next(iterator_df)
            # 5.1. make index, line numbers start from 1
            start_line_no = (current_page - 1) * page_size + 1
            page_df.index = pd.RangeIndex(start_line_no,
                                          start_line_no + page_df.shape[0])
            values = page_df.to_records(index=True).tolist()
            return RespPreviewDataset(headers=dataset_headers,
                                      rows=values,
                                      count=dataset_stats.n_rows,
                                      file_path=relative_file_path)
        except StopIteration:
            return RespPreviewDataset(headers=dataset_headers,
                                      rows=None,
                                      count=dataset_stats.n_rows,
                                      file_path=relative_file_path)
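
A minimal sketch of the chunk-based paging that preview() relies on: skip page_num - 1 chunks of page_size rows, then return the next chunk. The function name read_page and the bare DataFrame return value are illustrative only; the real method wraps the result in RespPreviewDataset.

import pandas as pd

def read_page(csv_path, page_num, page_size):
    chunks = pd.read_csv(csv_path, chunksize=page_size)
    for _ in range(page_num - 1):
        try:
            next(chunks)      # skipped chunks are discarded immediately
        except StopIteration:
            return None       # page_num is beyond the last page
    try:
        return next(chunks)   # the requested page as a DataFrame
    except StopIteration:
        return None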
Example #9
t_write_result_start = time.time()
write_result_status = AnalyzeStep.Status.Succeed
write_result_extension = None  # must be bound before the try so the finally block can reference it
try:
    df = X  # remaining columns
    if reserved_cols is not None and len(reserved_cols) > 0:
        result_df = df_origin[reserved_cols].copy()  # copy to avoid SettingWithCopyWarning
        result_df[BATCH_PREDICTION_COL] = y_pred
    else:
        result_df = pd.DataFrame(data={BATCH_PREDICTION_COL: y_pred})

    output_path = P.join(consts.PATH_TMP_PREDICT,
                         f"{model_name}_{util.human_datetime()}.csv")
    os.makedirs(consts.PATH_TMP_PREDICT, exist_ok=True)

    result_df.to_csv(output_path, index=False)
    logger.info(f"Write result finished at: {output_path}")
    write_result_extension = {"output_path": util.relative_path(output_path)}
except Exception:
    write_result_status = AnalyzeStep.Status.Failed
    raise
finally:
    client.batch_predict_callback(portal=server_portal,
                                  dataset_name=dataset_name,
                                  model_name=model_name,
                                  batch_predict_job_name=job_name,
                                  type=PredictStepType.WriteResult,
                                  status=write_result_status,
                                  took=time.time() - t_write_result_start,
                                  extension=write_result_extension)
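
A tiny illustration of the result assembly in the try block above: keep the reserved input columns and append the prediction column. The column names, the "prediction" label and the sample values are made up; the real code uses reserved_cols and BATCH_PREDICTION_COL.

import pandas as pd

df_origin = pd.DataFrame({"id": [1, 2], "age": [30, 41], "income": [5.0, 7.5]})
reserved_cols = ["id", "age"]
y_pred = [0, 1]

result_df = df_origin[reserved_cols].copy()   # copy so the new column does not touch df_origin
result_df["prediction"] = y_pred
print(result_df)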