def retrieve(self, dataset_name, n_top_value):
    with db.open_session() as s:
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        dict_value = util.sqlalchemy_obj_to_dict(dataset)
        dict_value['file_path'] = util.relative_path(dataset.file_path)

        # if dataset.status == DatasetEntity.Status.Analyzed:
        #     for i, f in enumerate(dict_value['features']):
        #         if f['type'] in [FeatureType.Categorical, FeatureType.Continuous]:
        #             if f['unique']['value'] > n_top_value:
        #                 # calc top {n_count_value}
        #                 extension = f['extension']
        #                 sorted(extension['value_count'], key=lambda _: _['value'])
        #
        #                 top_value_count = extension['value_count'][: n_top_value]
        #                 remain_value_count = extension['value_count'][n_top_value:]
        #                 remain_count = 0
        #                 for remain_dict in remain_value_count:
        #                     remain_count = remain_count + remain_dict['value']
        #
        #                 top_value_count.append(
        #                     FeatureValueCount(type="Remained_SUM", value=remain_count).to_dict()
        #                 )
        #                 dict_value['features'][i]['extension']['value_count'] = top_value_count
        #                 extension['value_count'] = top_value_count
        # dict_value['detail'] = dict_value['extension']

        extension = dict_value.pop('extension')
        dict_value['extension'] = {"sample_conf": extension['sample_conf']}
        return dict_value
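# A minimal sketch of the "top-N plus remainder" aggregation that the commented-out
# block above computes; the helper name top_n_value_counts is hypothetical, and it
# sorts descending by count (the commented code calls sorted() without using its
# return value, which is presumably the intent).
def top_n_value_counts(value_count, n_top_value):
    # most frequent values first
    ordered = sorted(value_count, key=lambda item: item['value'], reverse=True)
    top = ordered[:n_top_value]
    # fold everything past the top N into a single summed bucket
    remain_count = sum(item['value'] for item in ordered[n_top_value:])
    if remain_count > 0:
        top.append({"type": "Remained_SUM", "value": remain_count})
    return top

# Example: keep the 2 most frequent values and sum the rest.
# top_n_value_counts([{"type": "a", "value": 10}, {"type": "b", "value": 5},
#                     {"type": "c", "value": 2}, {"type": "d", "value": 1}], 2)
# -> [{'type': 'a', 'value': 10}, {'type': 'b', 'value': 5}, {'type': 'Remained_SUM', 'value': 3}]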
def retrieve_model(self, model_name):
    def _replace_NaN(v):
        if v is None:
            return v
        else:
            if math.isnan(v):
                return None
            else:
                return v

    with db.open_session() as s:
        model = self.require_model(s, model_name)

        # handle trails
        if model.trails is not None and len(model.trails) > 0:
            param_names = [k for k in model.trails[0].params]
            trail_data_dict_list = []
            trail_params_values = []
            trail_index = []
            for t in model.trails:
                param_values = [_replace_NaN(t.params.get(n)) for n in param_names]
                trail_params_values.append(param_values)
                trail_index.append(t.trail_no)

            df_train_params = pd.DataFrame(data=trail_params_values, columns=param_names)
            # remove if all is None
            df_train_params.dropna(axis=1, how='all', inplace=True)

            for i, t in enumerate(model.trails):
                trail_data_dict = {
                    "reward": t.reward,
                    "params": [_replace_NaN(_) for _ in df_train_params.iloc[i].tolist()],
                    "elapsed": t.elapsed
                }
                trail_data_dict_list.append(trail_data_dict)

            if len(df_train_params.columns.values) > 0:  # ensure not all params is None
                trails_dict = {
                    "param_names": df_train_params.columns.tolist(),
                    "data": trail_data_dict_list
                }
            else:
                trails_dict = {}
        else:
            trails_dict = {}

        model_dict = model.to_dict()
        # update trails
        model_dict['trails'] = trails_dict
        model_dict['model_path'] = util.relative_path(model_dict['model_path'])
        model_dict['escaped'] = model.escaped_time_by_seconds()
        model_dict['log_file_path'] = model.log_file_path()
        model_dict['train_source_code_path'] = model.train_source_code_path()
        model_dict['train_notebook_uri'] = model.train_notebook_uri()

        return model_dict
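# A minimal sketch of how all-None hyper-parameter columns are dropped above,
# on made-up trial data; the parameter names here are illustrative.
import pandas as pd

trials = [
    {"max_depth": 5, "learning_rate": 0.1, "booster": None},
    {"max_depth": 3, "learning_rate": 0.05, "booster": None},
]
param_names = list(trials[0])
df_params = pd.DataFrame(data=[[t.get(n) for n in param_names] for t in trials],
                         columns=param_names)
# 'booster' is None for every trial, so dropna(axis=1, how='all') removes the column;
# only parameters with at least one concrete value end up in the response.
df_params.dropna(axis=1, how='all', inplace=True)
print(df_params.columns.tolist())  # ['max_depth', 'learning_rate']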
def _handle(model_dao, session, dataset: DatasetEntity):
    d = util.sqlalchemy_obj_to_dict(dataset)
    d['file_path'] = util.relative_path(dataset.file_path)
    d['create_datetime'] = util.to_timestamp(dataset.create_datetime)
    d['n_experiments'] = model_dao.query_n_experiment(session, dataset.name)
    del d['features']
    del d['feature_summary']
    del d['extension']
    return d
def handle_tornado_upload_file(http_handler, tornado_http_files, upload_start_time):
    # 1. check and read params
    uploaded_files = tornado_http_files.get("file")
    if uploaded_files is None or len(uploaded_files) == 0:
        raise MissingParamException("file")
    tornado_http_file = uploaded_files[0]

    file_name = tornado_http_file['filename']
    file_body = tornado_http_file['body']
    file_size = util.human_data_size(len(file_body))
    file_suffix = util.get_file_suffix(file_name)
    assert file_suffix in ['.csv', '.tsv'], \
        'File suffix should be one of [.csv, .tsv], but got: %s' % file_suffix
    # keep the name readable when it is used in URLs and on disk
    origin_file_name = util.make_dataset_name(util.cut_suffix(file_name)) + file_suffix

    # 2. open a temporary file and write the upload to disk
    temporary_file_path = util.temporary_upload_file_path(origin_file_name)
    if not P.exists(P.dirname(temporary_file_path)):
        os.makedirs(P.dirname(temporary_file_path))
    logger.info(f"Open path {temporary_file_path} to store upload file.")
    with open(temporary_file_path, 'wb') as f:
        f.write(file_body)
    logger.info(f"Upload finished at {temporary_file_path}, file size {file_size}.")
    upload_took = util.time_diff(time.time(), upload_start_time)

    # 3. response
    # relative_path = temporary_file_path[len(consts.PATH_DATA_ROOT)+1:]  # relative path does not start with /
    response = {
        "path": util.relative_path(P.abspath(temporary_file_path)),
        "size": file_size,
        "took": upload_took
    }
    http_handler.response_json(response)
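# A minimal sketch of how a Tornado handler might call this helper. The handler
# class name is hypothetical, and it is assumed to extend the project's base
# handler that provides response_json(), since the helper above calls it.
# In Tornado, self.request.files maps each form field name to a list of dicts
# carrying "filename", "body" and "content_type".
import time

class DatasetUploadHandler(BaseHandler):  # hypothetical name; BaseHandler assumed to exist
    def post(self):
        upload_start_time = time.time()
        # e.g. self.request.files ==
        # {"file": [{"filename": "train.csv", "body": b"...", "content_type": "text/csv"}]}
        handle_tornado_upload_file(self, self.request.files, upload_start_time)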
def train_notebook_uri(self):
    # the notebook exists once training has started
    train_notebook_path = P.join(str(self.model_path), 'train.ipynb')
    return util.relative_path(train_notebook_path)
def train_source_code_path(self):
    # the training script exists once training has started
    return util.relative_path(P.join(str(self.model_path), 'train.py'))
def log_file_path(self):
    # the log file exists once training has started
    return util.relative_path(P.join(str(self.model_path), 'train.log'))
def preview(self, dataset_name: str, page_num: int, page_size: int) -> RespPreviewDataset:
    """
    Args:
        dataset_name: name of the dataset to preview
        page_num: page number, starts from 1
        page_size: rows per page

    Returns:
        RespPreviewDataset: headers, rows of the requested page, total row count and file path.
    """
    # 1. validate params
    if page_num < 1:
        raise ValueError("Param page_num should be >= 1")
    if page_size < 1:
        raise ValueError("Param page_size should be >= 1")

    # 2. retrieve dataset
    with db.open_session() as s:
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        file_path = dataset.file_path
        if not P.exists(file_path):
            raise FileNotFoundError(file_path)
        dataset_stats = dataset.to_dataset_stats()

    relative_file_path = util.relative_path(dataset_stats.file_path)

    # 3. read data
    dataset_headers = [f.name for f in dataset_stats.features]
    dataset_headers.insert(0, "No. ")
    # dataset_headers.insert(0, "number")
    if dataset_stats.has_header:
        iterator_df = pd.read_csv(file_path, chunksize=page_size)
    else:
        iterator_df = pd.read_csv(file_path, chunksize=page_size, header=None)

    # 4. seek to the requested page, page numbers start from 1
    # e.g. if page_num == 1 the while loop runs 0 times and the next(iterator_df) below returns the first chunk
    current_page = 1
    while current_page < page_num:
        try:
            next(iterator_df)  # skipped chunk keeps no reference and will be garbage collected
            current_page = current_page + 1
        except StopIteration:
            # page_num is beyond the end of the file, return no rows
            return RespPreviewDataset(headers=dataset_headers,
                                      rows=None,
                                      count=dataset_stats.n_rows,
                                      file_path=relative_file_path)

    # 5. read the requested page
    try:
        page_df: pd.DataFrame = next(iterator_df)
        # 5.1. make index, line numbers start from 1
        start_line_no = (current_page - 1) * page_size + 1
        page_df.index = pd.RangeIndex(start_line_no, start_line_no + page_df.shape[0])
        values = page_df.to_records(index=True).tolist()
        return RespPreviewDataset(headers=dataset_headers,
                                  rows=values,
                                  count=dataset_stats.n_rows,
                                  file_path=relative_file_path)
    except StopIteration:
        return RespPreviewDataset(headers=dataset_headers,
                                  rows=None,
                                  count=dataset_stats.n_rows,
                                  file_path=relative_file_path)
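# A minimal sketch of the seek-then-read pagination used above, on a throwaway CSV;
# the file name and page numbers are illustrative. Each next() on the chunked reader
# yields one page_size-row DataFrame, so skipping (page_num - 1) chunks lands on the
# requested page without loading the whole file into memory.
import pandas as pd

page_num, page_size = 3, 100
reader = pd.read_csv("some_dataset.csv", chunksize=page_size)  # hypothetical file
current_page = 1
try:
    while current_page < page_num:
        next(reader)          # discard earlier pages
        current_page += 1
    page_df = next(reader)    # rows of the requested page
except StopIteration:
    page_df = None            # page_num is past the end of the file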
t_write_result_start = time.time()
write_result_status = AnalyzeStep.Status.Succeed
write_result_extension = {}  # ensure it is defined for the callback even if writing fails
try:
    df = X
    if reserved_cols is not None and len(reserved_cols) > 0:
        # keep the reserved columns from the original data next to the predictions;
        # copy so the original frame is not modified
        result_df = df_origin[reserved_cols].copy()
        result_df[BATCH_PREDICTION_COL] = y_pred
    else:
        result_df = pd.DataFrame(data={BATCH_PREDICTION_COL: y_pred})

    output_path = P.join(consts.PATH_TMP_PREDICT, f"{model_name}_{util.human_datetime()}.csv")
    if not P.exists(consts.PATH_TMP_PREDICT):
        os.makedirs(consts.PATH_TMP_PREDICT)
    result_df.to_csv(output_path, index=False)
    logger.info(f"Write result finished at: {output_path}")
    write_result_extension = {"output_path": util.relative_path(output_path)}
except Exception as e:
    write_result_status = AnalyzeStep.Status.Failed
    raise e
finally:
    client.batch_predict_callback(portal=server_portal,
                                  dataset_name=dataset_name,
                                  model_name=model_name,
                                  batch_predict_job_name=job_name,
                                  type=PredictStepType.WriteResult,
                                  status=write_result_status,
                                  took=time.time() - t_write_result_start,
                                  extension=write_result_extension)
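# A minimal sketch of how the prediction column is joined to the reserved columns
# above, with made-up data; BATCH_PREDICTION_COL stands in for the real constant.
import pandas as pd

BATCH_PREDICTION_COL = "prediction"  # assumed value of the constant used above
df_origin = pd.DataFrame({"id": [1, 2, 3], "age": [21, 35, 48]})
y_pred = [0, 1, 1]
reserved_cols = ["id"]

# Copy the reserved columns first so the original frame is left untouched,
# then attach the prediction column that gets written to the CSV output.
result_df = df_origin[reserved_cols].copy()
result_df[BATCH_PREDICTION_COL] = y_pred
print(result_df.to_dict(orient="records"))
# [{'id': 1, 'prediction': 0}, {'id': 2, 'prediction': 1}, {'id': 3, 'prediction': 1}]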