def get_sample(src: DataItem, sample: int, label: str, reader=None):
    """generate data sample to be split (candidate for mlrun)

    Returns features matrix and header (x), and labels (y)

    :param src:    data artifact
    :param sample: sample size from data source, use negative integers to
                   sample randomly, positive to sample consecutively from the first row
    :param label:  label column title
    """
    table = src.as_df()

    # get sample
    if (sample == -1) or (sample >= 1):
        # get all rows, or contiguous sample starting at row 1.
        raw = table.dropna()
        labels = raw.pop(label)
        raw = raw.iloc[:sample, :]
        labels = labels.iloc[:sample]
    else:
        # grab a random sample
        raw = table.dropna().sample(sample * -1)
        labels = raw.pop(label)

    return raw, labels, raw.columns.values
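# A minimal sketch (plain pandas, hypothetical toy data) illustrating the sampling
# semantics above: a positive `sample` takes the first N rows after dropping NAs,
# while a negative `sample` draws |N| rows at random.
import pandas as pd

toy = pd.DataFrame({"f1": range(10), "label": [0, 1] * 5})

head_sample = toy.dropna().iloc[:3, :]   # sample=3  -> first 3 rows
rand_sample = toy.dropna().sample(3)     # sample=-3 -> 3 random rows
print(head_sample.shape, rand_sample.shape)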
def xgb_test(
    context,
    models_path: DataItem,
    test_set: DataItem,
    label_column: str,
    plots_dest: str = "plots",
    default_model: str = "model.pkl",
) -> None:
    """Test one or more classifier models against held-out dataset

    Using held-out test features, evaluates the performance of the estimated model

    Can be part of a kubeflow pipeline as a test step that is run post EDA and
    training/validation cycles

    :param context:       the function context
    :param models_path:   model artifact to be tested
    :param test_set:      test features and labels
    :param label_column:  column name for ground truth labels
    :param plots_dest:    dir for test plots
    :param default_model: ('model.pkl') default model artifact file name
    """
    xtest = test_set.as_df()
    ytest = xtest.pop(label_column)

    try:
        model_file, model_obj, _ = get_model(models_path.url, suffix=".pkl")
        model_obj = load(open(model_file, "rb"))
    except Exception as a:
        raise Exception("model location likely misspecified")

    eval_metrics = eval_model_v2(context, xtest, ytest.values, model_obj)
def test_classifier(
    context,
    models_path: DataItem,
    test_set: DataItem,
    label_column: str,
    score_method: str = "micro",
    plots_dest: str = "",
    model_evaluator=None,
    default_model: str = "model.pkl",
    predictions_column: str = "yscore",
    model_update=True,
) -> None:
    """Test one or more classifier models against held-out dataset

    Using held-out test features, evaluates the performance of the estimated model

    Can be part of a kubeflow pipeline as a test step that is run post EDA and
    training/validation cycles

    :param context:            the function context
    :param models_path:        artifact models representing a file or a folder
    :param test_set:           test features and labels
    :param label_column:       column name for ground truth labels
    :param score_method:       for multiclass classification
    :param plots_dest:         dir for test plots
    :param model_evaluator:    NOT IMPLEMENTED: specific method to generate eval, passed in
                               as string or available in this folder
    :param default_model:      ('model.pkl') default model artifact file name
    :param predictions_column: column name for the predictions column on the resulted artifact
    :param model_update:       (True) update the model; when running standalone there is
                               no need to update
    """
    xtest = test_set.as_df()
    ytest = xtest.pop(label_column)

    try:
        model_file, model_obj, _ = get_model(models_path, suffix=".pkl")
        model_obj = load(open(model_file, "rb"))
    except Exception as a:
        raise Exception("model location likely misspecified")

    extra_data = eval_model_v2(context, xtest, ytest.values, model_obj)
    if model_obj and model_update:
        update_model(
            models_path,
            extra_data=extra_data,
            metrics=context.results,
            key_prefix="validation-",
        )

    y_hat = model_obj.predict(xtest)
    if y_hat.ndim == 1 or y_hat.shape[1] == 1:
        score_names = [predictions_column]
    else:
        score_names = [
            f"{predictions_column}_" + str(x) for x in range(y_hat.shape[1])
        ]

    df = pd.concat([xtest, ytest, pd.DataFrame(y_hat, columns=score_names)],
                   axis=1)
    context.log_dataset("test_set_preds", df=df, format="parquet", index=False)
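# A hedged usage sketch: running test_classifier as an MLRun job. The file name,
# artifact references, and label column below are hypothetical placeholders, not
# part of the function above.
import mlrun

fn = mlrun.code_to_function(
    name="test-classifier",
    filename="test_classifier.py",   # assumed file containing the handler above
    kind="job",
    handler="test_classifier",
    image="mlrun/mlrun",
)
run = fn.run(
    inputs={
        "models_path": "store://artifacts/my-project/my-model",  # hypothetical model artifact
        "test_set": "data/test_set.parquet",                     # hypothetical held-out set
    },
    params={"label_column": "labels"},
    local=True,
)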
def cox_test(
    context,
    models_path: DataItem,
    test_set: DataItem,
    label_column: str,
    plots_dest: str = "plots",
    model_evaluator=None,
) -> None:
    """Test one or more classifier models against held-out dataset

    Using held-out test features, evaluates the performance of the estimated model

    Can be part of a kubeflow pipeline as a test step that is run post EDA and
    training/validation cycles

    :param context:         the function context
    :param models_path:     model artifact to be tested
    :param test_set:        test features and labels
    :param label_column:    column name for ground truth labels
    :param plots_dest:      dir for test plots
    :param model_evaluator: WIP: specific method to generate eval, passed in as string
                            or available in this folder
    """
    xtest = test_set.as_df()
    ytest = xtest.pop(label_column)

    model_file, model_obj, _ = get_model(models_path.url, suffix=".pkl")
    model_obj = load(open(str(model_file), "rb"))

    try:
        if not model_evaluator:
            eval_metrics = eval_class_model(context, xtest, ytest, model_obj)

        model_plots = eval_metrics.pop("plots")
        model_tables = eval_metrics.pop("tables")
        for plot in model_plots:
            context.log_artifact(plot, local_path=f"{plots_dest}/{plot.key}.html")
        for tbl in model_tables:
            context.log_artifact(tbl, local_path=f"{plots_dest}/{tbl.key}.csv")

        context.log_results(eval_metrics)
    except Exception:
        context.log_dataset("cox-test-summary",
                            df=model_obj.summary,
                            index=True,
                            format="csv")
        context.logger.info("cox tester not implemented")
def open_archive(
    context: MLClientCtx,
    archive_url: DataItem,
    subdir: str = "content",
    key: str = "content",
    target_path: str = None,
):
    """Open a file/object archive into a target directory

    Currently supports zip and tar.gz

    :param context:     function execution context
    :param archive_url: url of archive file
    :param subdir:      path within artifact store where extracted files are stored
    :param key:         key of archive contents in artifact store
    :param target_path: file system path to store extracted files
                        (use either this or subdir)
    """
    os.makedirs(target_path or subdir, exist_ok=True)

    archive_url = archive_url.local()
    if archive_url.endswith("gz"):
        with tarfile.open(archive_url, mode="r|gz") as ref:
            ref.extractall(target_path or subdir)
    elif archive_url.endswith("zip"):
        with zipfile.ZipFile(archive_url, "r") as ref:
            ref.extractall(target_path or subdir)
    else:
        raise ValueError(f"unsupported archive type in {archive_url}")

    kwargs = {}
    if target_path:
        kwargs = {"target_path": target_path}
    else:
        kwargs = {"local_path": subdir}
    context.log_artifact(key, **kwargs)
def load_dask(
    context: MLClientCtx,
    src_data: DataItem,
    dask_key: str = "dask_key",
    inc_cols: Optional[List[str]] = None,
    index_cols: Optional[List[str]] = None,
    dask_persist: bool = True,
    refresh_data: bool = True,
    scheduler_key: str = "scheduler",
) -> None:
    """Load dataset into an existing dask cluster

    dask jobs define the dask client parameters at the job level, this method
    will raise an error if no client is detected.

    :param context:       the function context
    :param src_data:      url of the data file or partitioned dataset as either
                          artifact DataItem, string, or path object (similar to
                          pandas read_csv)
    :param dask_key:      destination key of data on dask cluster and artifact store
    :param inc_cols:      include only these columns (very fast)
    :param index_cols:    list of index column names (can be a long-running process)
    :param dask_persist:  (True) should the data be persisted (through the
                          `client.persist` op)
    :param refresh_data:  (True) replace existing cluster data if the dask_key is
                          already published on the dask cluster
    :param scheduler_key: (scheduler) the dask scheduler configuration, json also
                          logged as an artifact
    """
    if hasattr(context, "dask_client"):
        dask_client = context.dask_client
    else:
        raise Exception("a dask client was not found in the execution context")

    df = src_data.as_df(df_module=dd)

    if dask_persist:
        df = dask_client.persist(df)
        if dask_client.datasets and dask_key in dask_client.datasets:
            dask_client.unpublish_dataset(dask_key)
        dask_client.publish_dataset(df, name=dask_key)

    if context:
        context.dask_client = dask_client

    # share the scheduler, whether data is persisted or not
    dask_client.write_scheduler_file(scheduler_key + ".json")

    # we don't use log_dataset here until it can take into account
    # dask origin and apply dask describe.
    context.log_artifact(scheduler_key, local_path=scheduler_key + ".json")
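# A minimal sketch (assuming a local dask.distributed cluster, toy data) of the
# publish/retrieve round trip that load_dask relies on: once the dataframe is
# published under `dask_key`, any client attached to the same scheduler can fetch it.
import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client

client = Client()  # local cluster, for illustration only
ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2)
ddf = client.persist(ddf)
client.publish_dataset(ddf, name="dask_key")

same_ddf = client.get_dataset("dask_key")  # retrieved from any client on this scheduler
print(same_ddf.head())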
def pandas_profiling_report(
    context: MLClientCtx,
    data: DataItem,
) -> None:
    """Create a Pandas Profiling Report for a dataset.

    :param context: the function context
    :param data:    Dataset to create report for
    """
    df = data.as_df()

    profile = df.profile_report(title="Pandas Profiling Report")

    context.log_artifact(
        "Pandas Profiling Report",
        body=profile.to_html(),
        local_path="pandas_profiling_report.html",
    )
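# A hedged note: the DataFrame.profile_report accessor used above is registered by
# importing pandas_profiling. An equivalent standalone sketch (toy data assumed):
import pandas as pd
from pandas_profiling import ProfileReport

df = pd.DataFrame({"a": range(100), "b": range(100)})
profile = ProfileReport(df, title="Pandas Profiling Report")
html = profile.to_html()  # same HTML body that gets logged as an artifact above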
def model_server_tester(context,
                        table: DataItem,
                        addr: str,
                        label_column: str = "label",
                        model: str = '',
                        match_err: bool = False,
                        rows: int = 20):
    """Test a model server

    :param table:        csv/parquet table with test data
    :param addr:         function address/url
    :param label_column: name of the label column in table
    :param model:        tested model name
    :param match_err:    raise error on validation (require proper test set)
    :param rows:         number of rows to use from test set
    """
    table = table.as_df()

    y_list = table.pop(label_column).values.tolist()
    context.logger.info(f'testing with dataset against {addr}, model: {model}')
    if rows and rows < table.shape[0]:
        table = table.sample(rows)

    count = err_count = match = 0
    times = []
    for x, y in zip(table.values, y_list):
        count += 1
        event_data = json.dumps({"inputs": [x.tolist()]})
        had_err = False
        try:
            start = datetime.now()
            resp = requests.put(f'{addr}/v2/models/{model}/infer',
                                json=event_data)
            if not resp.ok:
                context.logger.error(f'bad function resp!!\n{resp.text}')
                err_count += 1
                continue
            times.append((datetime.now() - start).microseconds)

        except OSError as err:
            context.logger.error(
                f'error in request, data:{event_data}, error: {err}')
            err_count += 1
            continue

        resp_data = resp.json()
        print(resp_data)
        y_resp = resp_data['outputs'][0]
        if y == y_resp:
            match += 1

    context.log_result('total_tests', count)
    context.log_result('errors', err_count)
    context.log_result('match', match)
    if count - err_count > 0:
        times_arr = np.array(times)
        context.log_result('avg_latency', int(np.mean(times_arr)))
        context.log_result('min_latency', int(np.amin(times_arr)))
        context.log_result('max_latency', int(np.amax(times_arr)))

        chart = ChartArtifact('latency', header=['Test', 'Latency (microsec)'])
        for i in range(len(times)):
            chart.add_row([i + 1, int(times[i])])
        context.log_artifact(chart)

    context.logger.info(
        f'run {count} tests, {err_count} errors and {match} match expected value'
    )
    if err_count:
        raise ValueError(f'failed on {err_count} tests of {count}')
    if match_err and match != count:
        raise ValueError(f'only {match} results match out of {count}')
def summarize(
    context: MLClientCtx,
    table: DataItem,
    label_column: str = None,
    class_labels: List[str] = [],
    plot_hist: bool = True,
    plots_dest: str = "plots",
    update_dataset=False,
) -> None:
    """Summarize a table

    :param context:        the function context
    :param table:          MLRun input pointing to pandas dataframe (csv/parquet file path)
    :param label_column:   ground truth column label
    :param class_labels:   label for each class in tables and plots
    :param plot_hist:      (True) set this to False for large tables
    :param plots_dest:     destination folder of summary plots (relative to artifact_path)
    :param update_dataset: when the table is a registered dataset update the charts in-place
    """
    df = table.as_df()
    header = df.columns.values
    extra_data = {}

    try:
        gcf_clear(plt)
        snsplt = sns.pairplot(df, hue=label_column)  # , diag_kws={"bw": 1.5})
        extra_data["histograms"] = context.log_artifact(
            PlotArtifact("histograms", body=plt.gcf()),
            local_path=f"{plots_dest}/hist.html",
            db_key=False,
        )
    except Exception as e:
        context.logger.error(
            f"Failed to create pairplot histograms due to: {e}")

    try:
        gcf_clear(plt)
        plot_cols = 3
        plot_rows = int((len(header) - 1) / plot_cols) + 1
        fig, ax = plt.subplots(plot_rows, plot_cols, figsize=(15, 4))
        fig.tight_layout(pad=2.0)
        for i in range(plot_rows * plot_cols):
            if i < len(header):
                sns.violinplot(
                    x=df[header[i]],
                    ax=ax[int(i / plot_cols)][i % plot_cols],
                    orient="h",
                    width=0.7,
                    inner="quartile",
                )
            else:
                fig.delaxes(ax[int(i / plot_cols)][i % plot_cols])
            i += 1
        extra_data["violin"] = context.log_artifact(
            PlotArtifact("violin", body=plt.gcf(), title="Violin Plot"),
            local_path=f"{plots_dest}/violin.html",
            db_key=False,
        )
    except Exception as e:
        context.logger.warn(
            f"Failed to create violin distribution plots due to: {e}")

    if label_column:
        labels = df.pop(label_column)
        imbtable = labels.value_counts(normalize=True).sort_index()
        try:
            gcf_clear(plt)
            balancebar = imbtable.plot(kind="bar",
                                       title="class imbalance - labels")
            balancebar.set_xlabel("class")
            balancebar.set_ylabel("proportion of total")
            extra_data["imbalance"] = context.log_artifact(
                PlotArtifact("imbalance", body=plt.gcf()),
                local_path=f"{plots_dest}/imbalance.html",
            )
        except Exception as e:
            context.logger.warn(
                f"Failed to create class imbalance plot due to: {e}")
        context.log_artifact(
            TableArtifact("imbalance-weights-vec",
                          df=pd.DataFrame({"weights": imbtable})),
            local_path=f"{plots_dest}/imbalance-weights-vec.csv",
            db_key=False,
        )

    tblcorr = df.corr()
    mask = np.zeros_like(tblcorr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    dfcorr = pd.DataFrame(data=tblcorr, columns=header, index=header)
    dfcorr = dfcorr[
        np.arange(dfcorr.shape[0])[:, None] > np.arange(dfcorr.shape[1])]
    context.log_artifact(
        TableArtifact("correlation-matrix", df=tblcorr, visible=True),
        local_path=f"{plots_dest}/correlation-matrix.csv",
        db_key=False,
    )

    try:
        gcf_clear(plt)
        ax = plt.axes()
        sns.heatmap(tblcorr, ax=ax, mask=mask, annot=False, cmap=plt.cm.Reds)
        ax.set_title("features correlation")
        extra_data["correlation"] = context.log_artifact(
            PlotArtifact("correlation", body=plt.gcf(),
                         title="Correlation Matrix"),
            local_path=f"{plots_dest}/corr.html",
            db_key=False,
        )
    except Exception as e:
        context.logger.warn(
            f"Failed to create features correlation plot due to: {e}")

    gcf_clear(plt)
    if update_dataset and table.meta and table.meta.kind == "dataset":
        from mlrun.artifacts import update_dataset_meta

        update_dataset_meta(table.meta, extra_data=extra_data)
def describe_spark(context: MLClientCtx,
                   dataset: DataItem,
                   artifact_path,
                   bins: int = 30,
                   describe_extended: bool = True):

    location = dataset.local()

    spark = SparkSession.builder.appName("Spark job").getOrCreate()

    df = spark.read.csv(location, header=True, inferSchema=True)

    kwargs = []

    float_cols = [
        item[0] for item in df.dtypes
        if item[1].startswith('float') or item[1].startswith('double')
    ]

    if describe_extended:
        table, variables, freq = describe(df, bins, float_cols, kwargs)

        tbl_1 = variables.reset_index()

        if len(freq) != 0:
            tbl_2 = pd.DataFrame.from_dict(
                freq, orient="index").sort_index().stack().reset_index()
            tbl_2.columns = ['col', 'key', 'val']
            tbl_2['Merged'] = [{key: val}
                               for key, val in zip(tbl_2.key, tbl_2.val)]
            tbl_2 = tbl_2.groupby(
                'col', as_index=False).agg(lambda x: tuple(x))[['col', 'Merged']]
            summary = pd.merge(tbl_1,
                               tbl_2,
                               how='left',
                               left_on='index',
                               right_on='col')
        else:
            summary = tbl_1

        context.log_dataset("summary_stats",
                            df=summary,
                            format="csv",
                            index=False,
                            artifact_path=context.artifact_subpath('data'))

        context.log_results(table)
    else:
        tbl_1 = df.describe().toPandas()
        summary = tbl_1.T

        context.log_dataset("summary_stats",
                            df=summary,
                            format="csv",
                            index=False,
                            artifact_path=context.artifact_subpath('data'))

    spark.stop()
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask

    :param context:               Function context.
    :param dataset:               Raw data file.
    :param model_pkg_class:       Model to train, e.g,
                                  "sklearn.ensemble.RandomForestClassifier",
                                  or json model config.
    :param label_column:          (label) Ground-truth y labels.
    :param train_validation_size: (0.75) Train validation set proportion out of the
                                  full dataset.
    :param sample:                (1.0) Select sample from dataset (n-rows/% of total),
                                  randomize rows as default.
    :param models_dest:           (models) Models subfolder on artifact path.
    :param test_set_key:          (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:            (plots) Plot subfolder on artifact path.
    :param dask_key:              (dask_key) Key of dataframe in dask client "datasets" attribute.
    :param dask_persist:          (False) Should the data be persisted (through the `client.persist`)
    :param scheduler_key:         (scheduler) Dask scheduler configuration, json also logged
                                  as an artifact.
    :param file_ext:              (parquet) format for test_set_key hold out data
    :param random_state:          (42) sklearn seed
    """

    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute():
        raise Exception('NA values found')

    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):
        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):

        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed, y_train)  # fit the training data to the visualizer
        viz.score(X_test_transformed, y_test.compute())  # evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })

        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:
                    context.log_results({
                        score_name + "-" + score_class:
                        viz.scores_[score_name].get(score_class)
                    })

    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save,
                        columns=df_header),  # improve log dataset ability
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
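# A hedged illustration of the model-config shape this training step appears to
# expect from gen_sklearn_model. The keys are inferred from how model_config is
# read above ("META"/"CLASS"/"FIT"); the exact schema of gen_sklearn_model is an
# assumption here, and the parameter values are placeholders.
model_config_example = {
    "META": {"class": "sklearn.ensemble.RandomForestClassifier"},
    "CLASS": {"n_estimators": 100, "random_state": 42},  # constructor kwargs
    "FIT": {},  # populated with X / y just before fitting
}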
def permutation_importance(
    context: MLClientCtx,
    model: DataItem,
    dataset: DataItem,
    labels: str,
    figsz=(10, 5),
    plots_dest: str = "plots",
    fitype: str = "permute",
) -> pd.DataFrame:
    """calculate change in metric

    type 'permute' uses a pre-estimated model
    type 'dropcol' uses a re-estimated model

    :param context:    the function's execution context
    :param model:      a trained model
    :param dataset:    features and ground truths, regression targets
    :param labels:     name of the ground truths column
    :param figsz:      matplotlib figure size
    :param plots_dest: path within artifact store
    """
    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
    model = load(open(str(model_file), "rb"))

    X = dataset.as_df()
    y = X.pop(labels)
    header = X.columns

    metric = _oob_classifier_accuracy

    baseline = metric(model, X, y)

    imp = []
    for col in X.columns:
        if fitype == "permute":
            save = X[col].copy()
            X[col] = np.random.permutation(X[col])
            m = metric(model, X, y)
            X[col] = save
            imp.append(baseline - m)
        elif fitype == "dropcol":
            X_ = X.drop(col, axis=1)
            model_ = clone(model)
            # model_.random_state = random_state
            model_.fit(X_, y)
            o = model_.oob_score_
            imp.append(baseline - o)
        else:
            raise ValueError(
                "unknown fitype, only 'permute' or 'dropcol' permitted")

    zipped = zip(imp, header)
    feature_imp = pd.DataFrame(sorted(zipped),
                               columns=["importance", "feature"])
    feature_imp.sort_values(by="importance", ascending=False, inplace=True)

    plt.clf()
    plt.figure(figsize=figsz)
    sns.barplot(x="importance", y="feature", data=feature_imp)
    plt.title(f"feature importances-{fitype}")
    plt.tight_layout()

    context.log_artifact(
        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
        local_path=f"{plots_dest}/feature-permutations.html",
    )
    context.log_dataset(f"feature-importances-{fitype}-tbl",
                        df=feature_imp,
                        index=False)
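# For comparison, a minimal sketch of the same "permute" idea using scikit-learn's
# built-in utility. The toy data is hypothetical; this is not the function above,
# just the equivalent library call, aliased to avoid shadowing it.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance as sk_permutation_importance

X, y = make_classification(n_samples=200, n_features=5, random_state=42)
clf = RandomForestClassifier(random_state=42).fit(X, y)
result = sk_permutation_importance(clf, X, y, n_repeats=5, random_state=42)
print(result.importances_mean)  # mean drop in score when each feature is shuffled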
def data_clean(context: MLClientCtx,
               src: DataItem,
               file_ext: str = "csv",
               models_dest: str = "models/encoders",
               cleaned_key: str = "cleaned-data",
               encoded_key: str = "encoded-data"):
    df = src.as_df()

    # drop columns
    drop_cols_list = ["customerID", "TotalCharges"]
    df.drop(drop_cols_list, axis=1, inplace=True)

    # header transformations
    old_cols = df.columns
    rename_cols_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels"
    }
    df.rename(rename_cols_map, axis=1, inplace=True)

    # add drop column to logs:
    for col in drop_cols_list:
        rename_cols_map.update({col: "_DROPPED_"})

    # log the op
    tp = os.path.join(models_dest, "preproc-column_map.json")
    context.log_artifact("preproc-column_map.json",
                         body=json.dumps(rename_cols_map),
                         local_path=tp)

    df = df.applymap(lambda x: "No" if str(x).startswith("No ") else x)

    # encode numerical type as category bins (ordinal)
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = [0, 1, 2, 3, 4, 5]
    tenure = df.tenure.copy(deep=True)
    df["tenure_map"] = pd.cut(df.tenure, bins, labels=False)
    tenure_map = dict(zip(bins, labels))
    # save this transformation
    tp = os.path.join(models_dest, "preproc-numcat_map.json")
    context.log_artifact("preproc-numcat_map.json",
                         body=bytes(json.dumps(tenure_map).encode("utf-8")),
                         local_path=tp)

    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)

    fix_cols = [
        "gender", "partner", "deps", "OnlineSecurity", "OnlineBackup",
        "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
        "PhoneService", "MultipleLines", "PaperlessBilling", "InternetService",
        "Contract", "PaymentMethod", "labels"
    ]

    d = defaultdict(LabelEncoder)
    df[fix_cols] = df[fix_cols].apply(
        lambda x: d[x.name].fit_transform(x.astype(str)))
    context.log_dataset(encoded_key, df=df, format=file_ext, index=False)

    model_bin = dumps(d)
    context.log_model("model",
                      body=model_bin,
                      artifact_path=os.path.join(context.artifact_path,
                                                 models_dest),
                      model_file="model.pkl")
def data_clean(
    context: MLClientCtx,
    src: DataItem,
    file_ext: str = "csv",
    models_dest: str = "models/encoders",
    cleaned_key: str = "cleaned-data",
    encoded_key: str = "encoded-data",
):
    """process a raw churn data file

    Data has 3 states here: `raw`, `cleaned` and `encoded`

    * `raw` kept by default, the pipeline begins with a raw data artifact
    * `cleaned` kept for charts, presentations
    * `encoded` is input for a cross validation and training function

    steps (not necessarily in correct order, some parallel)
    * column name maps
    * deal with nans and other types of missing/junk values
    * label encode binary and ordinal category columns
    * create category ranges from numerical columns
    And finally,
    * test

    Why don't we one-hot-encode here? One-hot encoding isn't a necessary step
    for all algorithms. It can also generate a very large feature matrix that
    doesn't need to be serialized (even if sparse). So we leave one-hot-encoding
    for the training step.

    What about scaling numerical columns? Same as why we don't one-hot encode
    here. Do we scale before the train-test split? IMHO, no. Scaling before
    splitting introduces a type of data leakage. In addition, many estimators
    are completely immune to the monotonic transformations implied by scaling,
    so why waste the cycles?

    TODO:
        * parallelize where possible
        * more abstraction (more parameters, chain sklearn transformers)
        * convert to marketplace function

    :param context:     the function execution context
    :param src:         an artifact or file path
    :param file_ext:    file type for artifacts
    :param models_dest: label encoders and other preprocessing steps
                        should be saved together with other pipeline models
    :param cleaned_key: key of cleaned data table in artifact store
    :param encoded_key: key of encoded data table in artifact store
    """
    df = src.as_df()

    # drop columns
    drop_cols_list = ["customerID", "TotalCharges"]
    df.drop(drop_cols_list, axis=1, inplace=True)

    # header transformations
    rename_cols_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels",
    }
    df.rename(rename_cols_map, axis=1, inplace=True)

    # add drop column to logs:
    for col in drop_cols_list:
        rename_cols_map.update({col: "_DROPPED_"})

    # log the op
    tp = os.path.join(models_dest, "preproc-column_map.json")
    context.log_artifact("preproc-column_map.json",
                         body=json.dumps(rename_cols_map),
                         local_path=tp)

    # VALUE transformations

    # clean
    # truncate reply to "No"
    df = df.applymap(lambda x: "No" if str(x).startswith("No ") else x)

    # encode numerical type as category bins (ordinal)
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = [0, 1, 2, 3, 4, 5]
    df["tenure_map"] = pd.cut(df.tenure, bins, labels=False)
    tenure_map = dict(zip(bins, labels))
    # save this transformation
    tp = os.path.join(models_dest, "preproc-numcat_map.json")
    context.log_artifact(
        "preproc-numcat_map.json",
        body=bytes(json.dumps(tenure_map).encode("utf-8")),
        local_path=tp,
    )

    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)

    # label encoding - generate model for each column saved in dict
    # some of these columns may be hot encoded in the training step
    fix_cols = [
        "gender", "partner", "deps", "OnlineSecurity", "OnlineBackup",
        "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
        "PhoneService", "MultipleLines", "PaperlessBilling", "InternetService",
        "Contract", "PaymentMethod", "labels",
    ]

    d = defaultdict(LabelEncoder)
    df[fix_cols] = df[fix_cols].apply(
        lambda x: d[x.name].fit_transform(x.astype(str)))

    context.log_dataset(encoded_key, df=df, format=file_ext, index=False)

    model_bin = dumps(d)
    context.log_model(
        "model",
        body=model_bin,
        artifact_path=os.path.join(context.artifact_path, models_dest),
        model_file="model.pkl",
    )
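# A hedged sketch of consuming the logged encoder model: the serialized object is a
# defaultdict of per-column LabelEncoders, so new data can be transformed with the
# same mappings. The pickle module and toy frames below are illustrative assumptions
# (the function above serializes with a `dumps` whose origin isn't shown here).
import pickle
from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder

d = defaultdict(LabelEncoder)
train = pd.DataFrame({"gender": ["Male", "Female"], "labels": ["Yes", "No"]})
train = train.apply(lambda x: d[x.name].fit_transform(x.astype(str)))

model_bin = pickle.dumps(d)          # analogous to what data_clean logs as "model"
encoders = pickle.loads(model_bin)   # ...and how a later step could reload it
new = pd.DataFrame({"gender": ["Female"], "labels": ["Yes"]})
encoded = new.apply(lambda x: encoders[x.name].transform(x.astype(str)))
print(encoded)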
def arc_to_parquet(context: MLClientCtx,
                   archive_url: DataItem,
                   header: List[str] = [None],
                   chunksize: int = 0,
                   dtype=None,
                   encoding: str = "latin-1",
                   key: str = "data",
                   dataset: str = None,
                   part_cols=[],
                   file_ext: str = "parquet",
                   index: bool = False,
                   refresh_data: bool = False,
                   stats: bool = False) -> None:
    """Open a file/object archive and save as a parquet file or dataset

    Notes
    -----
    * this function is typically for large files, please be sure to check all settings
    * partitioning requires precise specification of column types.
    * the archive_url can be any file readable by pandas read_csv, which includes tar files
    * if the `dataset` parameter is not empty, then a partitioned dataset will be created
      instead of a single file in the folder `dataset`
    * if a key exists already then it will not be re-acquired unless the `refresh_data`
      param is set to `True`.  This is in case the original file is corrupt, or a refresh
      is required.

    :param context:      the function context
    :param archive_url:  MLRun data input (DataItem object)
    :param chunksize:    (0) when > 0, row size (chunk) to retrieve per iteration
    :param dtype:        destination data type of specified columns
    :param encoding:     ("latin-1") file encoding
    :param key:          key in artifact store
    :param dataset:      (None) if not None then "target_path/dataset" is folder for
                         partitioned files
    :param part_cols:    ([]) list of partitioning columns
    :param file_ext:     (parquet) csv/parquet file extension
    :param index:        (False) pandas save index option
    :param refresh_data: (False) overwrite existing data at that location
    :param stats:        (False) calculate table stats when logging artifact
    """
    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)

    archive_url = archive_url.local()

    if dataset is not None:
        dest_path = os.path.join(base_path, dataset)
        exists = os.path.isdir(dest_path)
    else:
        dest_path = os.path.join(base_path, key + f".{file_ext}")
        exists = os.path.isfile(dest_path)

    if not exists:
        context.logger.info("destination file does not exist, downloading")
        if chunksize > 0:
            header = _chunk_readwrite(archive_url, dest_path, chunksize,
                                      encoding, dtype, dataset)
            context.log_dataset(key=key,
                                stats=stats,
                                format='parquet',
                                target_path=dest_path)
        else:
            df = pd.read_csv(archive_url)
            context.log_dataset(key, df=df, format=file_ext, index=index)
    else:
        context.logger.info("destination file already exists, nothing done")
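# The helper _chunk_readwrite is not shown here; a minimal sketch of what a chunked
# csv-to-parquet conversion could look like (pyarrow-based, file names hypothetical,
# not necessarily the helper's actual implementation):
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

writer = None
for chunk in pd.read_csv("large_file.csv", chunksize=10_000, encoding="latin-1"):
    table = pa.Table.from_pandas(chunk, preserve_index=False)
    if writer is None:
        writer = pq.ParquetWriter("data.parquet", table.schema)
    writer.write_table(table)
if writer is not None:
    writer.close()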