def get_explain_random_forests_cmd(datasource, model_name, data_table,
                                   result_table, label_column):
    """Get PAI random forest explanation command

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: result table name
        label_column: name of the label column

    Returns:
        a PAI cmd to explain the data using the given model
    """
    # NOTE(typhoonzero): for PAI random forests predicting, we can not load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in SELECT statement for prediction, error will be
    # reported by PAI job if the columns not match.
    if not label_column:
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    # drop the result table if it already exists
    db.execute(conn, "DROP TABLE IF EXISTS %s;" % result_table)
    schema = db.get_table_schema(conn, data_table)
    fields = [f[0] for f in schema if f[0] != label_column]
    return ('''pai -name feature_importance -project algo_public '''
            '''-DmodelName="%s" -DinputTableName="%s" '''
            '''-DoutputTableName="%s" -DlabelColName="%s" '''
            '''-DfeatureColNames="%s" ''') % (model_name, data_table,
                                              result_table, label_column,
                                              ",".join(fields))

def get_explain_random_forest_pai_cmd(datasource, model_name, data_table,
                                      result_table, label_column):
    """Get a command to submit a PAI RandomForest explain task

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: name of the result table, PAI will automatically
            create this table
        label_column: name of the label column

    Returns:
        A string which is a PAI cmd
    """
    # NOTE(typhoonzero): for PAI random forests predicting, we can not load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in SELECT statement for prediction, error will be
    # reported by PAI job if the columns not match.
    if not label_column:
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    schema = db.get_table_schema(conn, data_table)
    # exclude the label column from the feature columns
    columns = [f[0] for f in schema if f[0] != label_column]
    db.execute(conn, "DROP TABLE IF EXISTS %s;" % result_table)
    return (
        """pai -name feature_importance -project algo_public """
        """-DmodelName="%s" -DinputTableName="%s" -DoutputTableName="%s" """
        """-DlabelColName="%s" -DfeatureColNames="%s" """
    ) % (model_name, data_table, result_table, label_column,
         ",".join(columns))

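# Illustrative usage sketch (not part of the original API): assembling and
# submitting the RandomForest explain command. The model and table names
# below are hypothetical, and `submit_pai_task` is assumed to be the
# submitter helper provided elsewhere in this package.
#
#   cmd = get_explain_random_forest_pai_cmd(
#       datasource, "my_rf_model", "demo.input_table",
#       "demo.rf_explain_result", "class")
#   submit_pai_task(cmd, datasource)
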
def drop_tables(tables, datasource):
    """Drop given tables in datasource"""
    conn = db.connect_with_data_source(datasource)
    try:
        for table in tables:
            if table != "":
                drop_sql = "DROP TABLE IF EXISTS %s" % table
                db.execute(conn, drop_sql)
    except:  # noqa: E722
        # odps will clean up tmp tables by itself, so even if the drop
        # fails here, we do not need to raise an error
        print("Encountered error when dropping tmp table")

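# Example (hypothetical table names): empty names are skipped and failures
# are tolerated, so this is safe to call unconditionally during cleanup.
#
#   drop_tables(["demo.tmp_train_xyz", ""], datasource)
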
def create_explain_result_table(datasource, data_table, result_table,
                                model_type, estimator, label_column):
    """Create explain result table from given datasource

    Args:
        datasource: current datasource
        data_table: input data table name
        result_table: table name to store the result
        model_type: type of the model to use
        estimator: estimator class if the model is a TensorFlow estimator
        label_column: column name of the prediction label
    """
    conn = db.connect_with_data_source(datasource)
    drop_stmt = "DROP TABLE IF EXISTS %s" % result_table
    db.execute(conn, drop_stmt)

    create_stmt = ""
    if model_type == EstimatorType.PAIML:
        # PAI ml creates the result table itself
        return
    elif model_type == EstimatorType.TENSORFLOW:
        if estimator.startswith("BoostedTrees"):
            column_def = ""
            if conn.driver == "mysql":
                column_def = "(feature VARCHAR(255), dfc FLOAT, gain FLOAT)"
            else:
                # Hive & MaxCompute
                column_def = "(feature STRING, dfc STRING, gain STRING)"
            create_stmt = "CREATE TABLE IF NOT EXISTS %s %s;" % (result_table,
                                                                 column_def)
        else:
            if not label_column:
                raise SQLFlowDiagnostic(
                    "need to specify WITH label_col=label_col_name "
                    "when explaining deep models")
            create_stmt = get_create_shap_result_sql(conn, data_table,
                                                     result_table,
                                                     label_column)
    elif model_type == EstimatorType.XGBOOST:
        if not label_column:
            raise SQLFlowDiagnostic(
                "need to specify WITH label_col=label_col_name "
                "when explaining xgboost models")
        create_stmt = get_create_shap_result_sql(conn, data_table,
                                                 result_table, label_column)
    else:
        raise SQLFlowDiagnostic(
            "not supported modelType %d for creating Explain result table" %
            model_type)

    if not db.execute(conn, create_stmt):
        raise SQLFlowDiagnostic("Can't create explain result table")

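# Example (hypothetical names): preparing the explain result table for an
# XGBoost model; a BoostedTrees TensorFlow estimator would instead get the
# (feature, dfc, gain) schema created above.
#
#   create_explain_result_table(datasource, "demo.input_table",
#                               "demo.explain_result",
#                               EstimatorType.XGBOOST, None, "class")
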
def create_evaluate_result_table(datasource, result_table, metrics):
    """Create a table to hold the evaluation result

    Args:
        datasource: current datasource
        result_table: the table name to save result
        metrics: list of evaluation metrics names
    """
    drop_tables([result_table], datasource)
    # Always add loss
    ext_metrics = ["loss"]
    if isinstance(metrics, list):
        ext_metrics.extend(metrics)
    fields = ["%s STRING" % m for m in ext_metrics]
    sql = "CREATE TABLE IF NOT EXISTS %s (%s);" % (result_table,
                                                   ",".join(fields))
    conn = db.connect_with_data_source(datasource)
    db.execute(conn, sql)

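# Example (hypothetical names): the created table has one STRING column per
# metric plus the always-present "loss" column, e.g.
#
#   create_evaluate_result_table(datasource, "demo.evaluate_result",
#                                ["Accuracy", "AUC"])
#   # -> CREATE TABLE IF NOT EXISTS demo.evaluate_result
#   #    (loss STRING,Accuracy STRING,AUC STRING);
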
def get_train_kmeans_pai_cmd(datasource, model_name, data_table, model_attrs,
                             feature_column_names):
    """Get a command to submit a KMeans training task to PAI

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        model_attrs: model attributes for KMeans
        feature_column_names: names of feature columns

    Returns:
        A string which is a PAI cmd
    """
    # fill in default attributes without overriding user settings
    for k, v in default_attrs.items():
        model_attrs.setdefault(k, v)
    center_count = model_attrs["center_count"]
    idx_table_name = model_attrs["idx_table_name"]
    if not idx_table_name:
        raise SQLFlowDiagnostic("Need to set idx_table_name in WITH clause")
    exclude_columns = model_attrs["excluded_columns"].split(",")
    # selected_cols indicates the feature columns used for clustering
    selected_cols = [
        fc for fc in feature_column_names if fc not in exclude_columns
    ]

    conn = db.connect_with_data_source(datasource)
    db.execute(conn, "DROP TABLE IF EXISTS %s" % idx_table_name)
    return (
        """pai -name kmeans -project algo_public """
        """-DinputTableName=%s -DcenterCount=%d -DmodelName=%s """
        """-DidxTableName=%s -DselectedColNames="%s" """
        """-DappendColNames="%s" """
    ) % (data_table, center_count, model_name, idx_table_name,
         ",".join(selected_cols), ",".join(feature_column_names))

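# Example (hypothetical WITH-clause attributes): "id" is excluded from the
# clustering features but still appended to the output rows.
#
#   cmd = get_train_kmeans_pai_cmd(
#       datasource, "my_kmeans_model", "demo.input_table",
#       {"center_count": 3, "idx_table_name": "demo.cluster_idx",
#        "excluded_columns": "id"},
#       ["id", "sepal_length", "sepal_width"])
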
def create_tmp_table_from_select(select, datasource):
    """Create a temp table for the given select query

    Args:
        select: string, the selection statement
        datasource: string, the datasource to connect to
    """
    if not select:
        return None
    conn = db.connect_with_data_source(datasource)
    project = get_project(datasource)
    tmp_tb_name = gen_rand_string()
    create_sql = "CREATE TABLE %s LIFECYCLE %s AS %s" % (
        tmp_tb_name, LIFECYCLE_ON_TMP_TABLE, select)
    # NOTE(lhw): the MaxCompute conn doesn't support close,
    # we should unify the db interface
    if not db.execute(conn, create_sql):
        raise SQLFlowDiagnostic("Can't create tmp table for %s" % select)
    return "%s.%s" % (project, tmp_tb_name)

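# Example (hypothetical query): materialize a SELECT into a lifecycle-limited
# tmp table, then clean it up with drop_tables() once the job finishes.
#
#   tmp_table = create_tmp_table_from_select("SELECT * FROM iris.train",
#                                            datasource)
#   ...
#   drop_tables([tmp_table], datasource)
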
def create_predict_result_table(datasource, select, result_table,
                                label_column, train_label_column, model_type):
    """Create predict result table with given name and label column

    Args:
        datasource: current datasource
        select: sql statement to get prediction data set
        result_table: the table name to save result
        label_column: name of the label column; if it does not exist in the
            select result, we will add an int column to the result table
        train_label_column: name of the label column when training
        model_type: type of model defined in runtime.oss
    """
    conn = db.connect_with_data_source(datasource)
    db.execute(conn, "DROP TABLE IF EXISTS %s" % result_table)
    # PAI ml will create the result table itself
    if model_type == EstimatorType.PAIML:
        return

    # NOTE: the statement below expects `select` to be usable in a FROM
    # clause, e.g. a table name such as one returned by
    # create_tmp_table_from_select; a raw SELECT would not parse here.
    create_table_sql = "CREATE TABLE %s AS SELECT * FROM %s LIMIT 0" % (
        result_table, select)
    db.execute(conn, create_table_sql)

    # if the label is not in the data table, add an int column for it
    schema = db.get_table_schema(conn, result_table)
    col_type = "INT"
    for (name, ctype) in schema:
        if name == train_label_column or name == label_column:
            col_type = ctype
            break
    col_names = [col[0] for col in schema]
    if label_column not in col_names:
        db.execute(
            conn, "ALTER TABLE %s ADD %s %s" %
            (result_table, label_column, col_type))

    # drop the training label column so only the prediction label remains
    if train_label_column != label_column and train_label_column in col_names:
        db.execute(
            conn, "ALTER TABLE %s DROP COLUMN %s" %
            (result_table, train_label_column))

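# Example (hypothetical names): mirror the input schema into a result table,
# add the output column "pred_class" typed like the training label "class",
# and drop the original "class" column.
#
#   create_predict_result_table(datasource, "demo.input_table",
#                               "demo.predict_result", "pred_class",
#                               "class", EstimatorType.XGBOOST)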